crypto/bn/asm/x86_64-gcc.c

   1 #if !(defined(__GNUC__) && __GNUC__>=2)
   2 # include "../bn_asm.c" /* kind of dirty hack for Sun Studio */
   3 #else
   4 /*
   5  * x86_64 BIGNUM accelerator version 0.1, December 2002.
   6  *
   7  * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   8  * project.
   9  *
  10  * Rights for redistribution and usage in source and binary forms are
  11  * granted according to the OpenSSL license. Warranty of any kind is
  12  * disclaimed.
  13  *
  14  * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  15  *    versions, like 1.0...
  16  * A. Well, that's because this code is basically a quick-n-dirty
  17  *    proof-of-concept hack. As you can see it's implemented with
  18  *    inline assembler, which means that you're bound to GCC and that
  19  *    there might be enough room for further improvement.
  20  *
  21  * Q. Why inline assembler?
  22  * A. x86_64 features own ABI which I'm not familiar with. This is
  23  *    why I decided to let the compiler take care of subroutine
  24  *    prologue/epilogue as well as register allocation. For reference.
  25  *    Win64 implements different ABI for AMD64, different from Linux.
  26  *
  27  * Q. How much faster does it get?
  28  * A. 'apps/openssl speed rsa dsa' output with no-asm:
  29  *
  30  *                        sign    verify    sign/s verify/s
  31  *      rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
  32  *      rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
  33  *      rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
  34  *      rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
  35  *                        sign    verify    sign/s verify/s
  36  *      dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
  37  *      dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
  38  *      dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
  39  *
  40  *    'apps/openssl speed rsa dsa' output with this module:
  41  *
  42  *                        sign    verify    sign/s verify/s
  43  *      rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
  44  *      rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
  45  *      rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
  46  *      rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
  47  *                        sign    verify    sign/s verify/s
  48  *      dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
  49  *      dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
  50  *      dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
  51  *
  52  *    For the reference. IA-32 assembler implementation performs
  53  *    very much like 64-bit code compiled with no-asm on the same
  54  *    machine.
  55  */
  56
  57 #ifdef _WIN64
  58 #define BN_ULONG unsigned long long
  59 #else
  60 #define BN_ULONG unsigned long
  61 #endif
  62
  63 /*
  64  * "m"(a), "+m"(r)      is the way to favor DirectPath µ-code;
  65  * "g"(0)               let the compiler to decide where does it
  66  *                      want to keep the value of zero;
  67  */
  68 #define mul_add(r,a,word,carry) do {    \
  69         register BN_ULONG high,low;     \
  70         asm ("mulq %3"                  \
  71                 : "=a"(low),"=d"(high)  \
  72                 : "a"(word),"m"(a)      \
  73                 : "cc");                \
  74         asm ("addq %2,%0; adcq %3,%1"   \
  75                 : "+r"(carry),"+d"(high)\
  76                 : "a"(low),"g"(0)       \
  77                 : "cc");                \
  78         asm ("addq %2,%0; adcq %3,%1"   \
  79                 : "+m"(r),"+d"(high)    \
  80                 : "r"(carry),"g"(0)     \
  81                 : "cc");                \
  82         carry=high;                     \
  83         } while (0)
  84
  85 #define mul(r,a,word,carry) do {        \
  86         register BN_ULONG high,low;     \
  87         asm ("mulq %3"                  \
  88                 : "=a"(low),"=d"(high)  \
  89                 : "a"(word),"g"(a)      \
  90                 : "cc");                \
  91         asm ("addq %2,%0; adcq %3,%1"   \
  92                 : "+r"(carry),"+d"(high)\
  93                 : "a"(low),"g"(0)       \
  94                 : "cc");                \
  95         (r)=carry, carry=high;          \
  96         } while (0)
  97
  98 #define sqr(r0,r1,a)                    \
  99         asm ("mulq %2"                  \
 100                 : "=a"(r0),"=d"(r1)     \
 101                 : "a"(a)                \
 102                 : "cc");
 103
 104 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 105         {
 106         BN_ULONG c1=0;
 107
 108         if (num <= 0) return(c1);
 109
 110         while (num&~3)
 111                 {
 112                 mul_add(rp[0],ap[0],w,c1);
 113                 mul_add(rp[1],ap[1],w,c1);
 114                 mul_add(rp[2],ap[2],w,c1);
 115                 mul_add(rp[3],ap[3],w,c1);
 116                 ap+=4; rp+=4; num-=4;
 117                 }
 118         if (num)
 119                 {
 120                 mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
 121                 mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
 122                 mul_add(rp[2],ap[2],w,c1); return c1;
 123                 }
 124
 125         return(c1);
 126         }
 127
 128 BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
 129         {
 130         BN_ULONG c1=0;
 131
 132         if (num <= 0) return(c1);
 133
 134         while (num&~3)
 135                 {
 136                 mul(rp[0],ap[0],w,c1);
 137                 mul(rp[1],ap[1],w,c1);
 138                 mul(rp[2],ap[2],w,c1);
 139                 mul(rp[3],ap[3],w,c1);
 140                 ap+=4; rp+=4; num-=4;
 141                 }
 142         if (num)
 143                 {
 144                 mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
 145                 mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
 146                 mul(rp[2],ap[2],w,c1);
 147                 }
 148         return(c1);
 149         }
 150
 151 void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
 152         {
 153         if (n <= 0) return;
 154
 155         while (n&~3)
 156                 {
 157                 sqr(r[0],r[1],a[0]);
 158                 sqr(r[2],r[3],a[1]);
 159                 sqr(r[4],r[5],a[2]);
 160                 sqr(r[6],r[7],a[3]);
 161                 a+=4; r+=8; n-=4;
 162                 }
 163         if (n)
 164                 {
 165                 sqr(r[0],r[1],a[0]); if (--n == 0) return;
 166                 sqr(r[2],r[3],a[1]); if (--n == 0) return;
 167                 sqr(r[4],r[5],a[2]);
 168                 }
 169         }
 170
 171 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d)
 172 {       BN_ULONG ret,waste;
 173
 174         asm ("divq      %4"
 175                 : "=a"(ret),"=d"(waste)
 176                 : "a"(l),"d"(h),"g"(d)
 177                 : "cc");
 178
 179         return ret;
 180 }
 181
 182 BN_ULONG bn_add_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 183 { BN_ULONG ret=0,i=0;
 184
 185         if (n <= 0) return 0;
 186
 187         asm (
 188         "       subq    %2,%2           \n"
 189         ".p2align 4                     \n"
 190         "1:     movq    (%4,%2,8),%0    \n"
 191         "       adcq    (%5,%2,8),%0    \n"
 192         "       movq    %0,(%3,%2,8)    \n"
 193         "       leaq    1(%2),%2        \n"
 194         "       loop    1b              \n"
 195         "       sbbq    %0,%0           \n"
 196                 : "=&a"(ret),"+c"(n),"=&r"(i)
 197                 : "r"(rp),"r"(ap),"r"(bp)
 198                 : "cc"
 199         );
 200
 201   return ret&1;
 202 }
 203
 204 #ifndef SIMICS
 205 BN_ULONG bn_sub_words (BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int n)
 206 { BN_ULONG ret=0,i=0;
 207
 208         if (n <= 0) return 0;
 209
 210         asm (
 211         "       subq    %2,%2           \n"
 212         ".p2align 4                     \n"
 213         "1:     movq    (%4,%2,8),%0    \n"
 214         "       sbbq    (%5,%2,8),%0    \n"
 215         "       movq    %0,(%3,%2,8)    \n"
 216         "       leaq    1(%2),%2        \n"
 217         "       loop    1b              \n"
 218         "       sbbq    %0,%0           \n"
 219                 : "=&a"(ret),"+c"(n),"=&r"(i)
 220                 : "r"(rp),"r"(ap),"r"(bp)
 221                 : "cc"
 222         );
 223
 224   return ret&1;
 225 }
 226 #else
 227 /* Simics 1.4<7 has buggy sbbq:-( */
 228 #define BN_MASK2 0xffffffffffffffffL
 229 BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 230         {
 231         BN_ULONG t1,t2;
 232         int c=0;
 233
 234         if (n <= 0) return((BN_ULONG)0);
 235
 236         for (;;)
 237                 {
 238                 t1=a[0]; t2=b[0];
 239                 r[0]=(t1-t2-c)&BN_MASK2;
 240                 if (t1 != t2) c=(t1 < t2);
 241                 if (--n <= 0) break;
 242
 243                 t1=a[1]; t2=b[1];
 244                 r[1]=(t1-t2-c)&BN_MASK2;
 245                 if (t1 != t2) c=(t1 < t2);
 246                 if (--n <= 0) break;
 247
 248                 t1=a[2]; t2=b[2];
 249                 r[2]=(t1-t2-c)&BN_MASK2;
 250                 if (t1 != t2) c=(t1 < t2);
 251                 if (--n <= 0) break;
 252
 253                 t1=a[3]; t2=b[3];
 254                 r[3]=(t1-t2-c)&BN_MASK2;
 255                 if (t1 != t2) c=(t1 < t2);
 256                 if (--n <= 0) break;
 257
 258                 a+=4;
 259                 b+=4;
 260                 r+=4;
 261                 }
 262         return(c);
 263         }
 264 #endif
 265
 266 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 267 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 268 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 269 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
 270
 271 #if 0
 272 /* original macros are kept for reference purposes */
 273 #define mul_add_c(a,b,c0,c1,c2) {       \
 274         BN_ULONG ta=(a),tb=(b);         \
 275         t1 = ta * tb;                   \
 276         t2 = BN_UMULT_HIGH(ta,tb);      \
 277         c0 += t1; t2 += (c0<t1)?1:0;    \
 278         c1 += t2; c2 += (c1<t2)?1:0;    \
 279         }
 280
 281 #define mul_add_c2(a,b,c0,c1,c2) {      \
 282         BN_ULONG ta=(a),tb=(b),t0;      \
 283         t1 = BN_UMULT_HIGH(ta,tb);      \
 284         t0 = ta * tb;                   \
 285         t2 = t1+t1; c2 += (t2<t1)?1:0;  \
 286         t1 = t0+t0; t2 += (t1<t0)?1:0;  \
 287         c0 += t1; t2 += (c0<t1)?1:0;    \
 288         c1 += t2; c2 += (c1<t2)?1:0;    \
 289         }
 290 #else
 291 #define mul_add_c(a,b,c0,c1,c2) do {    \
 292         asm ("mulq %3"                  \
 293                 : "=a"(t1),"=d"(t2)     \
 294                 : "a"(a),"m"(b)         \
 295                 : "cc");                \
 296         asm ("addq %2,%0; adcq %3,%1"   \
 297                 : "+r"(c0),"+d"(t2)     \
 298                 : "a"(t1),"g"(0)        \
 299                 : "cc");                \
 300         asm ("addq %2,%0; adcq %3,%1"   \
 301                 : "+r"(c1),"+r"(c2)     \
 302                 : "d"(t2),"g"(0)        \
 303                 : "cc");                \
 304         } while (0)
 305
 306 #define sqr_add_c(a,i,c0,c1,c2) do {    \
 307         asm ("mulq %2"                  \
 308                 : "=a"(t1),"=d"(t2)     \
 309                 : "a"(a[i])             \
 310                 : "cc");                \
 311         asm ("addq %2,%0; adcq %3,%1"   \
 312                 : "+r"(c0),"+d"(t2)     \
 313                 : "a"(t1),"g"(0)        \
 314                 : "cc");                \
 315         asm ("addq %2,%0; adcq %3,%1"   \
 316                 : "+r"(c1),"+r"(c2)     \
 317                 : "d"(t2),"g"(0)        \
 318                 : "cc");                \
 319         } while (0)
 320
 321 #define mul_add_c2(a,b,c0,c1,c2) do {   \
 322         asm ("mulq %3"                  \
 323                 : "=a"(t1),"=d"(t2)     \
 324                 : "a"(a),"m"(b)         \
 325                 : "cc");                \
 326         asm ("addq %0,%0; adcq %2,%1"   \
 327                 : "+d"(t2),"+r"(c2)     \
 328                 : "g"(0)                \
 329                 : "cc");                \
 330         asm ("addq %0,%0; adcq %2,%1"   \
 331                 : "+a"(t1),"+d"(t2)     \
 332                 : "g"(0)                \
 333                 : "cc");                \
 334         asm ("addq %2,%0; adcq %3,%1"   \
 335                 : "+r"(c0),"+d"(t2)     \
 336                 : "a"(t1),"g"(0)        \
 337                 : "cc");                \
 338         asm ("addq %2,%0; adcq %3,%1"   \
 339                 : "+r"(c1),"+r"(c2)     \
 340                 : "d"(t2),"g"(0)        \
 341                 : "cc");                \
 342         } while (0)
 343 #endif
 344
 345 #define sqr_add_c2(a,i,j,c0,c1,c2)      \
 346         mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 347
 348 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 349         {
 350         BN_ULONG t1,t2;
 351         BN_ULONG c1,c2,c3;
 352
 353         c1=0;
 354         c2=0;
 355         c3=0;
 356         mul_add_c(a[0],b[0],c1,c2,c3);
 357         r[0]=c1;
 358         c1=0;
 359         mul_add_c(a[0],b[1],c2,c3,c1);
 360         mul_add_c(a[1],b[0],c2,c3,c1);
 361         r[1]=c2;
 362         c2=0;
 363         mul_add_c(a[2],b[0],c3,c1,c2);
 364         mul_add_c(a[1],b[1],c3,c1,c2);
 365         mul_add_c(a[0],b[2],c3,c1,c2);
 366         r[2]=c3;
 367         c3=0;
 368         mul_add_c(a[0],b[3],c1,c2,c3);
 369         mul_add_c(a[1],b[2],c1,c2,c3);
 370         mul_add_c(a[2],b[1],c1,c2,c3);
 371         mul_add_c(a[3],b[0],c1,c2,c3);
 372         r[3]=c1;
 373         c1=0;
 374         mul_add_c(a[4],b[0],c2,c3,c1);
 375         mul_add_c(a[3],b[1],c2,c3,c1);
 376         mul_add_c(a[2],b[2],c2,c3,c1);
 377         mul_add_c(a[1],b[3],c2,c3,c1);
 378         mul_add_c(a[0],b[4],c2,c3,c1);
 379         r[4]=c2;
 380         c2=0;
 381         mul_add_c(a[0],b[5],c3,c1,c2);
 382         mul_add_c(a[1],b[4],c3,c1,c2);
 383         mul_add_c(a[2],b[3],c3,c1,c2);
 384         mul_add_c(a[3],b[2],c3,c1,c2);
 385         mul_add_c(a[4],b[1],c3,c1,c2);
 386         mul_add_c(a[5],b[0],c3,c1,c2);
 387         r[5]=c3;
 388         c3=0;
 389         mul_add_c(a[6],b[0],c1,c2,c3);
 390         mul_add_c(a[5],b[1],c1,c2,c3);
 391         mul_add_c(a[4],b[2],c1,c2,c3);
 392         mul_add_c(a[3],b[3],c1,c2,c3);
 393         mul_add_c(a[2],b[4],c1,c2,c3);
 394         mul_add_c(a[1],b[5],c1,c2,c3);
 395         mul_add_c(a[0],b[6],c1,c2,c3);
 396         r[6]=c1;
 397         c1=0;
 398         mul_add_c(a[0],b[7],c2,c3,c1);
 399         mul_add_c(a[1],b[6],c2,c3,c1);
 400         mul_add_c(a[2],b[5],c2,c3,c1);
 401         mul_add_c(a[3],b[4],c2,c3,c1);
 402         mul_add_c(a[4],b[3],c2,c3,c1);
 403         mul_add_c(a[5],b[2],c2,c3,c1);
 404         mul_add_c(a[6],b[1],c2,c3,c1);
 405         mul_add_c(a[7],b[0],c2,c3,c1);
 406         r[7]=c2;
 407         c2=0;
 408         mul_add_c(a[7],b[1],c3,c1,c2);
 409         mul_add_c(a[6],b[2],c3,c1,c2);
 410         mul_add_c(a[5],b[3],c3,c1,c2);
 411         mul_add_c(a[4],b[4],c3,c1,c2);
 412         mul_add_c(a[3],b[5],c3,c1,c2);
 413         mul_add_c(a[2],b[6],c3,c1,c2);
 414         mul_add_c(a[1],b[7],c3,c1,c2);
 415         r[8]=c3;
 416         c3=0;
 417         mul_add_c(a[2],b[7],c1,c2,c3);
 418         mul_add_c(a[3],b[6],c1,c2,c3);
 419         mul_add_c(a[4],b[5],c1,c2,c3);
 420         mul_add_c(a[5],b[4],c1,c2,c3);
 421         mul_add_c(a[6],b[3],c1,c2,c3);
 422         mul_add_c(a[7],b[2],c1,c2,c3);
 423         r[9]=c1;
 424         c1=0;
 425         mul_add_c(a[7],b[3],c2,c3,c1);
 426         mul_add_c(a[6],b[4],c2,c3,c1);
 427         mul_add_c(a[5],b[5],c2,c3,c1);
 428         mul_add_c(a[4],b[6],c2,c3,c1);
 429         mul_add_c(a[3],b[7],c2,c3,c1);
 430         r[10]=c2;
 431         c2=0;
 432         mul_add_c(a[4],b[7],c3,c1,c2);
 433         mul_add_c(a[5],b[6],c3,c1,c2);
 434         mul_add_c(a[6],b[5],c3,c1,c2);
 435         mul_add_c(a[7],b[4],c3,c1,c2);
 436         r[11]=c3;
 437         c3=0;
 438         mul_add_c(a[7],b[5],c1,c2,c3);
 439         mul_add_c(a[6],b[6],c1,c2,c3);
 440         mul_add_c(a[5],b[7],c1,c2,c3);
 441         r[12]=c1;
 442         c1=0;
 443         mul_add_c(a[6],b[7],c2,c3,c1);
 444         mul_add_c(a[7],b[6],c2,c3,c1);
 445         r[13]=c2;
 446         c2=0;
 447         mul_add_c(a[7],b[7],c3,c1,c2);
 448         r[14]=c3;
 449         r[15]=c1;
 450         }
 451
 452 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
 453         {
 454         BN_ULONG t1,t2;
 455         BN_ULONG c1,c2,c3;
 456
 457         c1=0;
 458         c2=0;
 459         c3=0;
 460         mul_add_c(a[0],b[0],c1,c2,c3);
 461         r[0]=c1;
 462         c1=0;
 463         mul_add_c(a[0],b[1],c2,c3,c1);
 464         mul_add_c(a[1],b[0],c2,c3,c1);
 465         r[1]=c2;
 466         c2=0;
 467         mul_add_c(a[2],b[0],c3,c1,c2);
 468         mul_add_c(a[1],b[1],c3,c1,c2);
 469         mul_add_c(a[0],b[2],c3,c1,c2);
 470         r[2]=c3;
 471         c3=0;
 472         mul_add_c(a[0],b[3],c1,c2,c3);
 473         mul_add_c(a[1],b[2],c1,c2,c3);
 474         mul_add_c(a[2],b[1],c1,c2,c3);
 475         mul_add_c(a[3],b[0],c1,c2,c3);
 476         r[3]=c1;
 477         c1=0;
 478         mul_add_c(a[3],b[1],c2,c3,c1);
 479         mul_add_c(a[2],b[2],c2,c3,c1);
 480         mul_add_c(a[1],b[3],c2,c3,c1);
 481         r[4]=c2;
 482         c2=0;
 483         mul_add_c(a[2],b[3],c3,c1,c2);
 484         mul_add_c(a[3],b[2],c3,c1,c2);
 485         r[5]=c3;
 486         c3=0;
 487         mul_add_c(a[3],b[3],c1,c2,c3);
 488         r[6]=c1;
 489         r[7]=c2;
 490         }
 491
 492 void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
 493         {
 494         BN_ULONG t1,t2;
 495         BN_ULONG c1,c2,c3;
 496
 497         c1=0;
 498         c2=0;
 499         c3=0;
 500         sqr_add_c(a,0,c1,c2,c3);
 501         r[0]=c1;
 502         c1=0;
 503         sqr_add_c2(a,1,0,c2,c3,c1);
 504         r[1]=c2;
 505         c2=0;
 506         sqr_add_c(a,1,c3,c1,c2);
 507         sqr_add_c2(a,2,0,c3,c1,c2);
 508         r[2]=c3;
 509         c3=0;
 510         sqr_add_c2(a,3,0,c1,c2,c3);
 511         sqr_add_c2(a,2,1,c1,c2,c3);
 512         r[3]=c1;
 513         c1=0;
 514         sqr_add_c(a,2,c2,c3,c1);
 515         sqr_add_c2(a,3,1,c2,c3,c1);
 516         sqr_add_c2(a,4,0,c2,c3,c1);
 517         r[4]=c2;
 518         c2=0;
 519         sqr_add_c2(a,5,0,c3,c1,c2);
 520         sqr_add_c2(a,4,1,c3,c1,c2);
 521         sqr_add_c2(a,3,2,c3,c1,c2);
 522         r[5]=c3;
 523         c3=0;
 524         sqr_add_c(a,3,c1,c2,c3);
 525         sqr_add_c2(a,4,2,c1,c2,c3);
 526         sqr_add_c2(a,5,1,c1,c2,c3);
 527         sqr_add_c2(a,6,0,c1,c2,c3);
 528         r[6]=c1;
 529         c1=0;
 530         sqr_add_c2(a,7,0,c2,c3,c1);
 531         sqr_add_c2(a,6,1,c2,c3,c1);
 532         sqr_add_c2(a,5,2,c2,c3,c1);
 533         sqr_add_c2(a,4,3,c2,c3,c1);
 534         r[7]=c2;
 535         c2=0;
 536         sqr_add_c(a,4,c3,c1,c2);
 537         sqr_add_c2(a,5,3,c3,c1,c2);
 538         sqr_add_c2(a,6,2,c3,c1,c2);
 539         sqr_add_c2(a,7,1,c3,c1,c2);
 540         r[8]=c3;
 541         c3=0;
 542         sqr_add_c2(a,7,2,c1,c2,c3);
 543         sqr_add_c2(a,6,3,c1,c2,c3);
 544         sqr_add_c2(a,5,4,c1,c2,c3);
 545         r[9]=c1;
 546         c1=0;
 547         sqr_add_c(a,5,c2,c3,c1);
 548         sqr_add_c2(a,6,4,c2,c3,c1);
 549         sqr_add_c2(a,7,3,c2,c3,c1);
 550         r[10]=c2;
 551         c2=0;
 552         sqr_add_c2(a,7,4,c3,c1,c2);
 553         sqr_add_c2(a,6,5,c3,c1,c2);
 554         r[11]=c3;
 555         c3=0;
 556         sqr_add_c(a,6,c1,c2,c3);
 557         sqr_add_c2(a,7,5,c1,c2,c3);
 558         r[12]=c1;
 559         c1=0;
 560         sqr_add_c2(a,7,6,c2,c3,c1);
 561         r[13]=c2;
 562         c2=0;
 563         sqr_add_c(a,7,c3,c1,c2);
 564         r[14]=c3;
 565         r[15]=c1;
 566         }
 567
 568 void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
 569         {
 570         BN_ULONG t1,t2;
 571         BN_ULONG c1,c2,c3;
 572
 573         c1=0;
 574         c2=0;
 575         c3=0;
 576         sqr_add_c(a,0,c1,c2,c3);
 577         r[0]=c1;
 578         c1=0;
 579         sqr_add_c2(a,1,0,c2,c3,c1);
 580         r[1]=c2;
 581         c2=0;
 582         sqr_add_c(a,1,c3,c1,c2);
 583         sqr_add_c2(a,2,0,c3,c1,c2);
 584         r[2]=c3;
 585         c3=0;
 586         sqr_add_c2(a,3,0,c1,c2,c3);
 587         sqr_add_c2(a,2,1,c1,c2,c3);
 588         r[3]=c1;
 589         c1=0;
 590         sqr_add_c(a,2,c2,c3,c1);
 591         sqr_add_c2(a,3,1,c2,c3,c1);
 592         r[4]=c2;
 593         c2=0;
 594         sqr_add_c2(a,3,2,c3,c1,c2);
 595         r[5]=c3;
 596         c3=0;
 597         sqr_add_c(a,3,c1,c2,c3);
 598         r[6]=c1;
 599         r[7]=c2;
 600         }
 601 #endif