Bignum library bug fix. IRIX 6 passes "make test" now!
author Ulf Möller <ulf@openssl.org>
Thu, 20 May 1999 01:43:07 +0000 (01:43 +0000)
committer Ulf Möller <ulf@openssl.org>
Thu, 20 May 1999 01:43:07 +0000 (01:43 +0000)
This also avoids the problems with SC4.2 and unpatched SC5.

Submitted by: Andy Polyakov <appro@fy.chalmers.se>

CHANGES
Configure
STATUS
config
crypto/bn/bn.h
crypto/bn/bn_lib.c
crypto/bn/bn_prime.c

diff --git a/CHANGES b/CHANGES
index 9f04291d993caefaf01398ba0aa2ba821f0c21a9..6c398c064c0f0bd2be9fcc5c8511b571d2b6a2a9 100644 (file)
--- a/CHANGES
+++ b/CHANGES
                                    [23-Dec-1998] down below; but in later
                                    versions, these hyphens are gone.]
 
+  *) Bignum library bug fix. IRIX 6 passes "make test" now!
+     This also avoids the problems with SC4.2 and unpatched SC5.  
+     [Andy Polyakov <appro@fy.chalmers.se>]
+
   *) New functions sk_num, sk_value and sk_set to replace the previous macros.
      These are required because of the typesafe stack would otherwise break 
      existing code. If old code used a structure member which used to be STACK
index 0a4f15e91e5232af264b80c878b525d3f24c9790..2cc831a8772fb84fc332bc47e7d0bbf3404cc688 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -112,15 +112,12 @@ my %table=(
 "debug-solaris-usparc-gcc","gcc:-O3 -g -mcpu=ultrasparc -Wall -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc.o::",
 
 # DO NOT use /xO[34] on sparc with SC3.0.  It is broken, and will not pass the tests
-"solaris-sparc-cc","cc:-fast -O -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR:::",
+"solaris-sparc-sc3","cc:-fast -O -Xa -DB_ENDIAN:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR:::",
 # SC4 is ok, better than gcc even on bn as long as you tell it -xarch=v8
 # -fast slows things like DES down quite a lot
-# Don't use -xtarget=ultra with SC4.2. It is broken, and will break exptest.
-"solaris-sparc-sc4","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
-"solaris-usparc-sc4","cc:-xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o::",
-# SC5.0 note: Compiler common patch 107357-01 or later is required!
-"solaris-usparc-sc5","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:",
-"solaris64-usparc-sc5","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:",
+"solaris-sparc-cc","cc:-xarch=v8 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::",
+"solaris-usparc-cc","cc:-xtarget=ultra -xarch=v8plus -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC -DBN_DIV2W:-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:",
+"solaris64-usparc-cc","cc:-xtarget=ultra -xarch=v9 -xstrconst -xO5 -xdepend -Xa -DB_ENDIAN -DULTRASPARC:-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:",
 
 # Sunos configs, assuming sparc for the gcc one.
 ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST:(unknown)::DES_UNROLL:::",
@@ -133,12 +130,12 @@ my %table=(
 # 3 times faster, use if at all possible.
 #"irix-gcc","gcc:-O2 -mips2::SIXTY_FOUR_BIT BN_LLONG RC4_INDEX RC4_CHAR:::",
 "irix-gcc","gcc:-O2 -DTERMIOS -DB_ENDIAN:(unknown)::BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR:::",
-"irix64-gcc","gcc:-mips3 -O2 -DTERMIOS -DB_ENDIAN:(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:::",
 "irix-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::BN_LLONG DES_PTR DES_RISC2 DES_UNROLL BF_PTR:::",
-"irix64-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:::",
+"irix-mips3-gcc","gcc:-mips3 -O2 -DTERMIOS -DB_ENDIAN:(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:::",
+"irix-mips3-cc", "cc:-n32 -mips3 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN:(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:::",
 "debug-irix-cc", "cc:-w2 -g -DCRYPTO_MDEBUG -DTERMIOS -DB_ENDIAN:(unknown):::::",
-# This is the n64 mode build.
-"irix-n64-cc", "cc:-64 -O2 -use_readonly_const -DTERMIOS:(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT:::",
+# This is the n64 mode build. (Untested!)
+"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS:(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT:::",
 
 # HPUX 9.X config.
 # Don't use the bundled cc.  It is broken.  Use HP ANSI C if possible, or gcc.
diff --git a/STATUS b/STATUS
index 553232d520ccfce86cb0edc7463e7b211a2ba260..d68ab036c154fffee904838927a56d2017857dcb 100644 (file)
--- a/STATUS
+++ b/STATUS
@@ -1,6 +1,6 @@
 
   OpenSSL STATUS                           Last modified at
-  ______________                           $Date: 1999/05/18 08:52:01 $
+  ______________                           $Date: 1999/05/20 01:42:57 $
 
   DEVELOPMENT STATE
 
     o  OpenSSL 0.9.2b: Released on March    22th, 1999
     o  OpenSSL 0.9.1c: Released on December 23th, 1998
 
-     [ Proposed new numbering scheme: <major>.<minor>[<patchlevel>]
-       0.9.1c is   0913
-       1.0    is 010000
-       1.0  a is 010001
-       1.8  z is 01081a ]
-
   RELEASE SHOWSTOPPERS
 
     o BSD/OS: assembler functions must not have leading underscores
-    o exptest and rsa_oaep_test fail with irix64-*
-      (Don Badrak <dbadrak@geo.census.gov>: "Re: Problems to compile openssl
-         on IRIX 6.2", openssl-users)
 
   AVAILABLE PATCHES
 
diff --git a/config b/config
index 3ddf4bb46e4b5c0c633a884df386700009264cd6..484f7937f2c1e7ddeada9c756d6aec1ec66ad9c5 100755 (executable)
--- a/config
+++ b/config
@@ -286,9 +286,9 @@ else
   if [ "$SYSTEM" = "SunOS" ]
   then
    case `cc -V 2>&1` in
-    *4*) CC=sc4;;
-    *5*) CC=sc5;;
-    *) CC=cc;;
+    *4*) CC=cc;;
+    *5*) CC=cc;;
+    *) CC=sc3;;
    esac
   fi
 fi
index 65481153cedb4a1f3c02d85bb6a9a5b6e40f40c3..230a591e422dde4e31393749cc9e00d33f2aaa45 100644 (file)
@@ -119,11 +119,11 @@ extern "C" {
 /* This is where the long long data type is 64 bits, but long is 32.
  * For machines where there are 64bit registers, this is the mode to use.
  * IRIX, on R4000 and above should use this mode, along with the relevent
- * assember code :-).  Do NOT define BN_ULLONG.
+ * assembler code :-).  Do NOT define BN_LLONG.
  */
 #ifdef SIXTY_FOUR_BIT
-#define BN_LLONG
-/* #define BN_ULLONG   unsigned long long */
+#undef BN_LLONG
+#undef BN_ULLONG
 #define BN_ULONG       unsigned long long
 #define BN_LONG                long long
 #define BN_BITS                128
index bd53124f1e103bdcc809a60494cb5418de694fae..64c9fd9dc17a42fd5c6c0be45e305e476e72933c 100644 (file)
@@ -150,7 +150,7 @@ char *BN_options(void)
 
 int BN_num_bits_word(BN_ULONG l)
        {
-       static char bits[256]={
+       static const char bits[256]={
                0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
                5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
                6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -343,8 +343,9 @@ void BN_CTX_free(BN_CTX *c)
 
 BIGNUM *bn_expand2(BIGNUM *b, int words)
        {
-       BN_ULONG *A,*B,*a;
-       int i,j;
+       BN_ULONG *A,*a;
+       const BN_ULONG *B;
+       int i;
 
        bn_check_top(b);
 
@@ -362,15 +363,38 @@ BIGNUM *bn_expand2(BIGNUM *b, int words)
                        BNerr(BN_F_BN_EXPAND2,ERR_R_MALLOC_FAILURE);
                        return(NULL);
                        }
-memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
 #if 1
                B=b->d;
                /* Check if the previous number needs to be copied */
                if (B != NULL)
                        {
+#if 0
                        /* This lot is an unrolled loop to copy b->top 
                         * BN_ULONGs from B to A
                         */
+/*
+ * I have nothing against unrolling but it's usually done for
+ * several reasons, namely:
+ * - minimize percentage of decision making code, i.e. branches;
+ * - avoid cache thrashing;
+ * - make it possible to schedule loads earlier;
+ * Now let's examine the code below. The cornerstone of C is
+ * "programmer is always right" and that's what we love it for:-)
+ * For this very reason C compilers have to be paranoid when it
+ * comes to data aliasing and assume the worst. Yeah, but what
+ * does it mean in real life? This means that loop body below will
+ * be compiled to sequence of loads immediately followed by stores
+ * as compiler assumes the worst, something in A==B+1 style. As a
+ * result CPU pipeline is going to starve for incoming data. Secondly
+ * if A and B happen to share same cache line such code is going to
+ * cause severe cache thrashing. Both factors have severe impact on
+ * performance of modern CPUs and this is the reason why this
+ * particular piece of code is #ifdefed away and replaced by more
+ * "friendly" version found in #else section below. This comment
+ * also applies to BN_copy function.
+ *
+ *                                     <appro@fy.chalmers.se>
+ */
                        for (i=b->top&(~7); i>0; i-=8)
                                {
                                A[0]=B[0]; A[1]=B[1]; A[2]=B[2]; A[3]=B[3];
@@ -407,6 +431,30 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
                                 */
                                ;
                                }
+#else
+                       for (i=b->top>>2; i>0; i--,A+=4,B+=4)
+                               {
+                               /*
+                                * The fact that the loop is unrolled
+                                * 4-wise is a tribute to Intel. It's
+                                * the one that doesn't have enough
+                                * registers to accommodate more data.
+                                * I'd unroll it 8-wise otherwise:-)
+                                *
+                                *              <appro@fy.chalmers.se>
+                                */
+                               BN_ULONG a0,a1,a2,a3;
+                               a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+                               A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
+                               }
+                       switch (b->top&3)
+                               {
+                               case 3: A[2]=B[2];
+                               case 2: A[1]=B[1];
+                               case 1: A[0]=B[0];
+                               case 0: ; /* ultrix cc workaround, see above */
+                               }
+#endif
                        Free(b->d);
                        }
 
@@ -415,22 +463,19 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
 
                /* Now need to zero any data between b->top and b->max */
 
-               B= &(b->d[b->top]);
-               j=(b->max - b->top) & ~7;
-               for (i=0; i<j; i+=8)
+               A= &(b->d[b->top]);
+               for (i=(b->max - b->top)>>3; i>0; i--,A+=8)
                        {
-                       B[0]=0; B[1]=0; B[2]=0; B[3]=0;
-                       B[4]=0; B[5]=0; B[6]=0; B[7]=0;
-                       B+=8;
-                       }
-               j=(b->max - b->top) & 7;
-               for (i=0; i<j; i++)
-                       {
-                       B[0]=0;
-                       B++;
+                       A[0]=0; A[1]=0; A[2]=0; A[3]=0;
+                       A[4]=0; A[5]=0; A[6]=0; A[7]=0;
                        }
+               for (i=(b->max - b->top)&7; i>0; i--,A++)
+                       A[0]=0;
 #else
-                       memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
+                       memset(A,0,sizeof(BN_ULONG)*(words+1));
+                       memcpy(A,b->d,sizeof(b->d[0])*b->top);
+                       b->d=a;
+                       b->max=words;
 #endif
                
 /*             memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG)); */
@@ -454,7 +499,8 @@ BIGNUM *BN_dup(BIGNUM *a)
 BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b)
        {
        int i;
-       BN_ULONG *A,*B;
+       BN_ULONG *A;
+       const BN_ULONG *B;
 
        bn_check_top(b);
 
@@ -464,47 +510,18 @@ BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b)
 #if 1
        A=a->d;
        B=b->d;
-       for (i=b->top&(~7); i>0; i-=8)
+       for (i=b->top>>2; i>0; i--,A+=4,B+=4)
                {
-               A[0]=B[0];
-               A[1]=B[1];
-               A[2]=B[2];
-               A[3]=B[3];
-               A[4]=B[4];
-               A[5]=B[5];
-               A[6]=B[6];
-               A[7]=B[7];
-               A+=8;
-               B+=8;
+               BN_ULONG a0,a1,a2,a3;
+               a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+               A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
                }
-       switch (b->top&7)
+       switch (b->top&3)
                {
-       case 7:
-               A[6]=B[6];
-       case 6:
-               A[5]=B[5];
-       case 5:
-               A[4]=B[4];
-       case 4:
-               A[3]=B[3];
-       case 3:
-               A[2]=B[2];
-       case 2:
-               A[1]=B[1];
-       case 1:
-               A[0]=B[0];
-        case 0:
-               /* I need the 'case 0' entry for utrix cc.
-                * If the optimiser is turned on, it does the
-                * switch table by doing
-                * a=top&7
-                * a--;
-                * goto jump_table[a];
-                * If top is 0, this makes us jump to 0xffffffc which is
-                * rather bad :-(.
-                * eric 23-Apr-1998
-                */
-               ;
+               case 3: A[2]=B[2];
+               case 2: A[1]=B[1];
+               case 1: A[0]=B[0];
+               case 0: ; /* ultrix cc workaround, see comments in bn_expand2 */
                }
 #else
        memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
@@ -539,6 +556,8 @@ BN_ULONG BN_get_word(BIGNUM *a)
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
                ret<<=BN_BITS4; /* stops the compiler complaining */
                ret<<=BN_BITS4;
+#else
+               ret=0;
 #endif
                ret|=a->d[i];
                }
@@ -563,6 +582,8 @@ int BN_set_word(BIGNUM *a, BN_ULONG w)
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
                w>>=BN_BITS4;
                w>>=BN_BITS4;
+#else
+               w=0;
 #endif
                a->d[i]=(BN_ULONG)w&BN_MASK2;
                if (a->d[i] != 0) a->top=i+1;
@@ -699,7 +720,7 @@ int BN_set_bit(BIGNUM *a, int n)
                a->top=i+1;
                }
 
-       a->d[i]|=(1L<<j);
+       a->d[i]|=(((BN_ULONG)1)<<j);
        return(1);
        }
 
@@ -711,7 +732,7 @@ int BN_clear_bit(BIGNUM *a, int n)
        j=n%BN_BITS2;
        if (a->top <= i) return(0);
 
-       a->d[i]&=(~(1L<<j));
+       a->d[i]&=(~(((BN_ULONG)1)<<j));
        bn_fix_top(a);
        return(1);
        }
index 28610766d0cde49a4b851bf7937b601c0c55db84..118eb35159bb6256ffded4de14d6a8933cf4a9cf 100644 (file)
@@ -319,7 +319,7 @@ static int probable_prime_dh(BIGNUM *rnd, int bits, BIGNUM *add, BIGNUM *rem,
        loop: for (i=1; i<NUMPRIMES; i++)
                {
                /* check that rnd is a prime */
-               if (BN_mod_word(rnd,(BN_LONG)primes[i]) <= 1)
+               if (BN_mod_word(rnd,(BN_ULONG)primes[i]) <= 1)
                        {
                        if (!BN_add(rnd,rnd,add)) goto err;
                        goto loop;
@@ -366,8 +366,8 @@ static int probable_prime_dh_strong(BIGNUM *p, int bits, BIGNUM *padd,
                /* check that p and q are prime */
                /* check that for p and q
                 * gcd(p-1,primes) == 1 (except for 2) */
-               if (    (BN_mod_word(p,(BN_LONG)primes[i]) == 0) ||
-                       (BN_mod_word(q,(BN_LONG)primes[i]) == 0))
+               if (    (BN_mod_word(p,(BN_ULONG)primes[i]) == 0) ||
+                       (BN_mod_word(q,(BN_ULONG)primes[i]) == 0))
                        {
                        if (!BN_add(p,p,padd)) goto err;
                        if (!BN_add(q,q,qadd)) goto err;