Support for "multiply high" instruction, see BN_UMULT_HIGH comment in
authorAndy Polyakov <appro@openssl.org>
Wed, 2 Feb 2000 16:18:12 +0000 (16:18 +0000)
committerAndy Polyakov <appro@openssl.org>
Wed, 2 Feb 2000 16:18:12 +0000 (16:18 +0000)
crypto/bn/bn_lcl.h for further details. It should be noted that for
the moment of this writing the code was tested only on Alpha. If
compiled with DEC C the C implementation exhibits 12% performance
improvement over the crypto/bn/asm/alpha.s (on EV56 box running
AlphaLinux). GNU C is (unfortunately) 8% behind the assembler
implementation. But it's OpenVMS Alpha users who *may* benefit most
as 'apps/openssl speed rsa' exhibits 6 (six) times performance
improvement over the original VMS bignum implementation. Where "*may*"
means "as soon as code is enabled though #define SIXTY_FOUR_BIT and
crypto/bn/asm/vms.mar is skipped."

crypto/bn/bn_asm.c
crypto/bn/bn_div.c
crypto/bn/bn_lcl.h

index 4d3da16..a62fdcf 100644 (file)
@@ -60,7 +60,7 @@
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
-#ifdef BN_LLONG 
+#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
 
 BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
        {
@@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
        bn_check_num(num);
        if (num <= 0) return(c1);
 
-       for (;;)
+       while (num&~3)
                {
                mul_add(rp[0],ap[0],w,c1);
-               if (--num == 0) break;
                mul_add(rp[1],ap[1],w,c1);
-               if (--num == 0) break;
                mul_add(rp[2],ap[2],w,c1);
-               if (--num == 0) break;
                mul_add(rp[3],ap[3],w,c1);
-               if (--num == 0) break;
-               ap+=4;
-               rp+=4;
+               ap+=4; rp+=4; num-=4;
+               }
+       if (num)
+               {
+               mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1;
+               mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1;
+               mul_add(rp[2],ap[2],w,c1); return c1;
                }
        
        return(c1);
@@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w)
        bn_check_num(num);
        if (num <= 0) return(c1);
 
-       /* for (;;) */
-       while (1) /* circumvent egcs-1.1.2 bug */
+       while (num&~3)
                {
                mul(rp[0],ap[0],w,c1);
-               if (--num == 0) break;
                mul(rp[1],ap[1],w,c1);
-               if (--num == 0) break;
                mul(rp[2],ap[2],w,c1);
-               if (--num == 0) break;
                mul(rp[3],ap[3],w,c1);
-               if (--num == 0) break;
-               ap+=4;
-               rp+=4;
+               ap+=4; rp+=4; num-=4;
+               }
+       if (num)
+               {
+               mul(rp[0],ap[0],w,c1); if (--num == 0) return c1;
+               mul(rp[1],ap[1],w,c1); if (--num == 0) return c1;
+               mul(rp[2],ap[2],w,c1);
                }
        return(c1);
        } 
@@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n)
         {
        bn_check_num(n);
        if (n <= 0) return;
-       for (;;)
+       while (n&~3)
                {
-               BN_ULLONG t;
-
-               t=(BN_ULLONG)(a[0])*(a[0]);
-               r[0]=Lw(t); r[1]=Hw(t);
-               if (--n == 0) break;
-
-               t=(BN_ULLONG)(a[1])*(a[1]);
-               r[2]=Lw(t); r[3]=Hw(t);
-               if (--n == 0) break;
-
-               t=(BN_ULLONG)(a[2])*(a[2]);
-               r[4]=Lw(t); r[5]=Hw(t);
-               if (--n == 0) break;
-
-               t=(BN_ULLONG)(a[3])*(a[3]);
-               r[6]=Lw(t); r[7]=Hw(t);
-               if (--n == 0) break;
-
-               a+=4;
-               r+=8;
+               sqr(r[0],r[1],a[0]);
+               sqr(r[2],r[3],a[1]);
+               sqr(r[4],r[5],a[2]);
+               sqr(r[6],r[7],a[3]);
+               a+=4; r+=8; n-=4;
+               }
+       if (n)
+               {
+               sqr(r[0],r[1],a[0]); if (--n == 0) return;
+               sqr(r[2],r[3],a[1]); if (--n == 0) return;
+               sqr(r[4],r[5],a[2]);
                }
        }
 
@@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n)
 
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
+#elif defined(BN_UMULT_HIGH)
+
+#define mul_add_c(a,b,c0,c1,c2)        {       \
+       BN_ULONG ta=(a),tb=(b);         \
+       t1 = ta * tb;                   \
+       t2 = BN_UMULT_HIGH(ta,tb);      \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define mul_add_c2(a,b,c0,c1,c2) {     \
+       BN_ULONG ta=(a),tb=(b),t0;      \
+       t1 = BN_UMULT_HIGH(ta,tb);      \
+       t0 = ta * tb;                   \
+       t2 = t1+t1; c2 += (t2<t1)?1:0;  \
+       t1 = t0+t0; t2 += (t1<t0)?1:0;  \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define sqr_add_c(a,i,c0,c1,c2)        {       \
+       BN_ULONG ta=(a)[i];             \
+       t1 = ta * ta;                   \
+       t2 = BN_UMULT_HIGH(ta,ta);      \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define sqr_add_c2(a,i,j,c0,c1,c2)     \
+       mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
 #else
 #define mul_add_c(a,b,c0,c1,c2) \
        t1=LBITS(a); t2=HBITS(a); \
index d8c31e1..39d7602 100644 (file)
@@ -280,9 +280,14 @@ int BN_div(BIGNUM *dv, BIGNUM *rm, const BIGNUM *num, const BIGNUM *divisor,
                 */
                rem=(n1-q*d0)&BN_MASK2;
 #endif
+#ifdef BN_UMULT_HIGH
+               t2l = d1 * q;
+               t2h = BN_UMULT_HIGH(d1,q);
+#else
                t2l=LBITS(d1); t2h=HBITS(d1);
                ql =LBITS(q);  qh =HBITS(q);
                mul64(t2l,t2h,ql,qh); /* t2=(BN_ULLONG)d1*q; */
+#endif
 
                for (;;)
                        {
index 85a3726..d24cffd 100644 (file)
@@ -86,6 +86,54 @@ extern "C" {
 #endif
 #endif
 
+#if !defined(NO_ASM) && !defined(PEDANTIC)
+/*
+ * BN_UMULT_HIGH section.
+ *
+ * No, I'm not trying to overwhelm you when stating that the
+ * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
+ * you to be impressed when I say that if the compiler doesn't
+ * support 2*N integer type, then you have to replace every N*N
+ * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
+ * and additions which unavoidably results in severe performance
+ * penalties. Of course provided that the hardware is capable of
+ * producing 2*N result... That's when you normally start
+ * considering assembler implementation. However! It should be
+ * pointed out that some CPUs (most notably Alpha, PowerPC and
+ * upcoming IA-64 family:-) provide *separate* instruction
+ * calculating the upper half of the product placing the result
+ * into a general purpose register. Now *if* the compiler supports
+ * inline assembler, then it's not impossible to implement the
+ * "bignum" routines (and have the compiler optimize 'em)
+ * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
+ * macro is about:-)
+ *
+ *                                     <appro@fy.chalmers.se>
+ */
+# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+#  if defined(__DECC)
+#   include <c_asm.h>
+#   define BN_UMULT_HIGH(a,b)  (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
+#  elif defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)  ({      \
+       register BN_ULONG ret;          \
+       asm ("umulh     %1,%2,%0"       \
+            : "=r"(ret)                \
+            : "r"(a), "r"(b));         \
+       ret;                    })
+#  endif       /* compiler */
+# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
+#  if defined(__GNUC__)
+#   define BN_UMULT_HIGH(a,b)  ({      \
+       register BN_ULONG ret;          \
+       asm ("mulhdu    %0,%1,%2"       \
+            : "=r"(ret)                \
+            : "r"(a), "r"(b));         \
+       ret;                    })
+#  endif       /* compiler */
+# endif                /* cpu */
+#endif         /* NO_ASM */
+
 /*************************************************************
  * Using the long long type
  */
@@ -151,6 +199,43 @@ extern "C" {
        (c)= Hw(t); \
        }
 
+#define sqr(r0,r1,a) { \
+       BN_ULLONG t; \
+       t=(BN_ULLONG)(a)*(a); \
+       (r0)=Lw(t); \
+       (r1)=Hw(t); ]
+       }
+
+#elif defined(BN_UMULT_HIGH)
+#define mul_add(r,a,w,c) {             \
+       BN_ULONG high,low,ret,tmp=(a);  \
+       ret =  (r);                     \
+       high=  BN_UMULT_HIGH(w,tmp);    \
+       ret += (c);                     \
+       low =  (w) * tmp;               \
+       (c) =  (ret<(c))?1:0;           \
+       (c) += high;                    \
+       ret += low;                     \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define mul(r,a,w,c)   {               \
+       BN_ULONG high,low,ret,ta=(a);   \
+       low =  (w) * ta;                \
+       high=  BN_UMULT_HIGH(w,ta);     \
+       ret =  low + (c);               \
+       (c) =  high;                    \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define sqr(r0,r1,a)   {               \
+       BN_ULONG tmp=(a);               \
+       (r0) = tmp * tmp;               \
+       (r1) = BN_UMULT_HIGH(tmp,tmp);  \
+       }
+
 #else
 /*************************************************************
  * No long long type