X-Git-Url: https://git.openssl.org/gitweb/?a=blobdiff_plain;ds=sidebyside;f=crypto%2Fbn%2Fbn_lib.c;h=64c9fd9dc17a42fd5c6c0be45e305e476e72933c;hb=e14d4443a27816b05b044350ad39cd15668c55b8;hp=bd53124f1e103bdcc809a60494cb5418de694fae;hpb=257e206da6b42181b0dc8976792164c4d9cff89b;p=openssl.git

diff --git a/crypto/bn/bn_lib.c b/crypto/bn/bn_lib.c
index bd53124f1e..64c9fd9dc1 100644
--- a/crypto/bn/bn_lib.c
+++ b/crypto/bn/bn_lib.c
@@ -150,7 +150,7 @@ char *BN_options(void)
 
 int BN_num_bits_word(BN_ULONG l)
 	{
-	static char bits[256]={
+	static const char bits[256]={
 		0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,
 		5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
 		6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
@@ -343,8 +343,9 @@ void BN_CTX_free(BN_CTX *c)
 
 BIGNUM *bn_expand2(BIGNUM *b, int words)
 	{
-	BN_ULONG *A,*B,*a;
-	int i,j;
+	BN_ULONG *A,*a;
+	const BN_ULONG *B;
+	int i;
 
 	bn_check_top(b);
 
@@ -362,15 +363,38 @@ BIGNUM *bn_expand2(BIGNUM *b, int words)
 			BNerr(BN_F_BN_EXPAND2,ERR_R_MALLOC_FAILURE);
 			return(NULL);
 			}
-memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
 #if 1
 		B=b->d;
 		/* Check if the previous number needs to be copied */
 		if (B != NULL)
 			{
+#if 0
 			/* This lot is an unrolled loop to copy b->top 
 			 * BN_ULONGs from B to A
 			 */
+/*
+ * I have nothing against unrolling but it's usually done for
+ * several reasons, namely:
+ * - minimize percentage of decision making code, i.e. branches;
+ * - avoid cache thrashing;
+ * - make it possible to schedule loads earlier;
+ * Now let's examine the code below. The cornerstone of C is
+ * "programmer is always right" and that's what we love it for:-)
+ * For this very reason C compilers have to be paranoid when it
+ * comes to data aliasing and assume the worst. Yeah, but what
+ * does it mean in real life? This means that loop body below will
+ * be compiled to sequence of loads immediately followed by stores
+ * as compiler assumes the worst, something in A==B+1 style. As a
+ * result CPU pipeline is going to starve for incoming data. Secondly
+ * if A and B happen to share same cache line such code is going to
+ * cause severe cache thrashing. Both factors have severe impact on
+ * performance of modern CPUs and this is the reason why this
+ * particular piece of code is #ifdefed away and replaced by more
+ * "friendly" version found in #else section below. This comment
+ * also applies to BN_copy function.
+ *
+ *					<appro@fy.chalmers.se>
+ */
 			for (i=b->top&(~7); i>0; i-=8)
 				{
 				A[0]=B[0]; A[1]=B[1]; A[2]=B[2]; A[3]=B[3];
@@ -407,6 +431,30 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
 				 */
 				;
 				}
+#else
+			for (i=b->top>>2; i>0; i--,A+=4,B+=4)
+				{
+				/*
+				 * The fact that the loop is unrolled
+				 * 4-wise is a tribute to Intel. It's
+				 * the one that doesn't have enough
+				 * registers to accommodate more data.
+				 * I'd unroll it 8-wise otherwise:-)
+				 *
+				 *		<appro@fy.chalmers.se>
+				 */
+				BN_ULONG a0,a1,a2,a3;
+				a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+				A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
+				}
+			switch (b->top&3)
+				{
+				case 3:	A[2]=B[2];
+				case 2:	A[1]=B[1];
+				case 1:	A[0]=B[0];
+				case 0:	; /* ultrix cc workaround, see above */
+				}
+#endif
 			Free(b->d);
 			}
 
@@ -415,22 +463,19 @@ memset(A,0x5c,sizeof(BN_ULONG)*(words+1));
 
 		/* Now need to zero any data between b->top and b->max */
 
-		B= &(b->d[b->top]);
-		j=(b->max - b->top) & ~7;
-		for (i=0; i<j; i+=8)
+		A= &(b->d[b->top]);
+		for (i=(b->max - b->top)>>3; i>0; i--,A+=8)
 			{
-			B[0]=0; B[1]=0; B[2]=0; B[3]=0;
-			B[4]=0; B[5]=0; B[6]=0; B[7]=0;
-			B+=8;
-			}
-		j=(b->max - b->top) & 7;
-		for (i=0; i<j; i++)
-			{
-			B[0]=0;
-			B++;
+			A[0]=0; A[1]=0; A[2]=0; A[3]=0;
+			A[4]=0; A[5]=0; A[6]=0; A[7]=0;
 			}
+		for (i=(b->max - b->top)&7; i>0; i--,A++)
+			A[0]=0;
 #else
-			memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
+			memset(A,0,sizeof(BN_ULONG)*(words+1));
+			memcpy(A,b->d,sizeof(b->d[0])*b->top);
+			b->d=a;
+			b->max=words;
 #endif
 		
 /*		memset(&(p[b->max]),0,((words+1)-b->max)*sizeof(BN_ULONG)); */
@@ -454,7 +499,8 @@ BIGNUM *BN_dup(BIGNUM *a)
 BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b)
 	{
 	int i;
-	BN_ULONG *A,*B;
+	BN_ULONG *A;
+	const BN_ULONG *B;
 
 	bn_check_top(b);
 
@@ -464,47 +510,18 @@ BIGNUM *BN_copy(BIGNUM *a, BIGNUM *b)
 #if 1
 	A=a->d;
 	B=b->d;
-	for (i=b->top&(~7); i>0; i-=8)
+	for (i=b->top>>2; i>0; i--,A+=4,B+=4)
 		{
-		A[0]=B[0];
-		A[1]=B[1];
-		A[2]=B[2];
-		A[3]=B[3];
-		A[4]=B[4];
-		A[5]=B[5];
-		A[6]=B[6];
-		A[7]=B[7];
-		A+=8;
-		B+=8;
+		BN_ULONG a0,a1,a2,a3;
+		a0=B[0]; a1=B[1]; a2=B[2]; a3=B[3];
+		A[0]=a0; A[1]=a1; A[2]=a2; A[3]=a3;
 		}
-	switch (b->top&7)
+	switch (b->top&3)
 		{
-	case 7:
-		A[6]=B[6];
-	case 6:
-		A[5]=B[5];
-	case 5:
-		A[4]=B[4];
-	case 4:
-		A[3]=B[3];
-	case 3:
-		A[2]=B[2];
-	case 2:
-		A[1]=B[1];
-	case 1:
-		A[0]=B[0];
-        case 0:
-		/* I need the 'case 0' entry for utrix cc.
-		 * If the optimiser is turned on, it does the
-		 * switch table by doing
-		 * a=top&7
-		 * a--;
-		 * goto jump_table[a];
-		 * If top is 0, this makes us jump to 0xffffffc which is
-		 * rather bad :-(.
-		 * eric 23-Apr-1998
-		 */
-		;
+		case 3: A[2]=B[2];
+		case 2: A[1]=B[1];
+		case 1: A[0]=B[0];
+		case 0: ; /* ultrix cc workaround, see comments in bn_expand2 */
 		}
 #else
 	memcpy(a->d,b->d,sizeof(b->d[0])*b->top);
@@ -539,6 +556,8 @@ BN_ULONG BN_get_word(BIGNUM *a)
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
 		ret<<=BN_BITS4; /* stops the compiler complaining */
 		ret<<=BN_BITS4;
+#else
+		ret=0;
 #endif
 		ret|=a->d[i];
 		}
@@ -563,6 +582,8 @@ int BN_set_word(BIGNUM *a, BN_ULONG w)
 #ifndef SIXTY_FOUR_BIT /* the data item > unsigned long */
 		w>>=BN_BITS4;
 		w>>=BN_BITS4;
+#else
+		w=0;
 #endif
 		a->d[i]=(BN_ULONG)w&BN_MASK2;
 		if (a->d[i] != 0) a->top=i+1;
@@ -699,7 +720,7 @@ int BN_set_bit(BIGNUM *a, int n)
 		a->top=i+1;
 		}
 
-	a->d[i]|=(1L<<j);
+	a->d[i]|=(((BN_ULONG)1)<<j);
 	return(1);
 	}
 
@@ -711,7 +732,7 @@ int BN_clear_bit(BIGNUM *a, int n)
 	j=n%BN_BITS2;
 	if (a->top <= i) return(0);
 
-	a->d[i]&=(~(1L<<j));
+	a->d[i]&=(~(((BN_ULONG)1)<<j));
 	bn_fix_top(a);
 	return(1);
 	}