Improve support for running everything as a monolithic application.

[openssl.git] / crypto / bf / bf_locl.h
diff --git a/crypto/bf/bf_locl.h b/crypto/bf/bf_locl.h

index a5663de8caef0fe3f9d1ccb5117142dba86fcae8..05756b5d3b654629cc30d35be176c2b93bca2cef 100644 (file)
--- a/crypto/bf/bf_locl.h
+++ b/crypto/bf/bf_locl.h
@@ -1,4 +1,4 @@
-/* crypto/bf/bf_locl.org */
+/* crypto/bf/bf_locl.h */
  /* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com)
   * All rights reserved.
   *
@@ -56,39 +56,9 @@
   * [including the GNU Public Licence.]
   */
  
-/* WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
- *
- * Always modify bf_locl.org since bf_locl.h is automatically generated from
- * it during SSLeay configuration.
- *
- * WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
- */
-
-/* Special defines which change the way the code is built depending on the
-   CPU and OS.  For SGI machines you can use _MIPS_SZLONG (32 or 64) to find
-   even newer MIPS CPU's, but at the moment one size fits all for
-   optimization options.  Older Sparc's work better with only UNROLL, but
-   there's no way to tell at compile time what it is you're running on */
-
-#if defined( sun )                    /* Newer Sparc's */
-#  define BF_PTR
-#elif defined( __ultrix )     /* Older MIPS */
-#  define BF_PTR
-#elif defined( __osf1__ )     /* Alpha */
-  /* None */
-#elif defined ( _AIX )                /* RS6000 */
-  /* Unknown */
-#elif defined( __hpux )               /* HP-PA */
-  /* None */
-#elif defined( __aux )                /* 68K */
-  /* Unknown */
-#elif defined( __dgux )               /* 88K (but P6 in latest boxes) */
-  /* Unknown */
-#elif defined( __sgi )                /* Newer MIPS */
-#  define BF_PTR
-#elif defined( i386 )         /* x86 boxes, should be gcc */
-#elif defined( _MSC_VER )     /* x86 boxes, Visual C */
-#endif /* Systems-specific speed defines */
+#ifndef HEADER_BF_LOCL_H
+#define HEADER_BF_LOCL_H
+#include <openssl/opensslconf.h> /* BF_PTR, BF_PTR2 */
  
  #undef c2l
  #define c2l(c,l)       (l =((unsigned long)(*((c)++)))    , \
@@ -181,62 +151,69 @@
  /* This is actually a big endian algorithm, the most significate byte
   * is used to lookup array 0 */
  
-/* use BF_PTR2 for intel boxes,
- * BF_PTR for sparc and MIPS/SGI
- * use nothing for Alpha and HP.
- */
-#if !defined(BF_PTR) && !defined(BF_PTR2)
-#undef BF_PTR
-#endif
-
-#define BF_M   0x3fc
-#define BF_0   22L
-#define BF_1   14L
-#define BF_2    6L
-#define BF_3    2L /* left shift */
-
  #if defined(BF_PTR2)
  
-/* This is basically a special pentium verson */
-#define BF_ENC(LL,R,S,P) \
-       { \
-       BF_LONG t,u,v; \
-       u=R>>BF_0; \
-       v=R>>BF_1; \
-       u&=BF_M; \
-       v&=BF_M; \
-       t=  *(BF_LONG *)((unsigned char *)&(S[  0])+u); \
-       u=R>>BF_2; \
-       t+= *(BF_LONG *)((unsigned char *)&(S[256])+v); \
-       v=R<<BF_3; \
-       u&=BF_M; \
-       v&=BF_M; \
-       t^= *(BF_LONG *)((unsigned char *)&(S[512])+u); \
-       LL^=P; \
-       t+= *(BF_LONG *)((unsigned char *)&(S[768])+v); \
-       LL^=t; \
-       }
+/*
+ * This is basically a special Intel version. The point is that Intel
+ * doesn't have many registers, but offers a rich choice of addressing
+ * modes. So we spare some registers by directly traversing the BF_KEY
+ * structure and employing the most elaborate addressing mode. The code
+ * generated by EGCS is *perfectly* competitive with the assembler
+ * implementation!
+ */
+#define BF_ENC(LL,R,KEY,Pi) (\
+       LL^=KEY[Pi], \
+       t=  KEY[BF_ROUNDS+2 +   0 + ((R>>24)&0xFF)], \
+       t+= KEY[BF_ROUNDS+2 + 256 + ((R>>16)&0xFF)], \
+       t^= KEY[BF_ROUNDS+2 + 512 + ((R>>8 )&0xFF)], \
+       t+= KEY[BF_ROUNDS+2 + 768 + ((R    )&0xFF)], \
+       LL^=t \
+       )
  
  #elif defined(BF_PTR)
  
-/* This is normally very good */
+#ifndef BF_LONG_LOG2
+#define BF_LONG_LOG2  2       /* default to BF_LONG being 32 bits */
+#endif
+#define BF_M  (0xFF<<BF_LONG_LOG2)
+#define BF_0  (24-BF_LONG_LOG2)
+#define BF_1  (16-BF_LONG_LOG2)
+#define BF_2  ( 8-BF_LONG_LOG2)
+#define BF_3  BF_LONG_LOG2 /* left shift */
+
+/*
+ * This is normally very good on RISC platforms, where you usually
+ * have to explicitly "multiply" the array index by sizeof(BF_LONG)
+ * in order to calculate the effective address. This implementation
+ * spares the CPU this extra work. Power[PC] users should have the most
+ * fun as (R>>BF_i)&BF_M gets folded into a single instruction, namely
+ * rlwinm. So let them double-check whether their compiler does it.
+ */
  
-#define BF_ENC(LL,R,S,P) \
-       LL^=P; \
+#define BF_ENC(LL,R,S,P) ( \
+       LL^=P, \
         LL^= (((*(BF_LONG *)((unsigned char *)&(S[  0])+((R>>BF_0)&BF_M))+ \
                 *(BF_LONG *)((unsigned char *)&(S[256])+((R>>BF_1)&BF_M)))^ \
                 *(BF_LONG *)((unsigned char *)&(S[512])+((R>>BF_2)&BF_M)))+ \
-               *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M)));
+               *(BF_LONG *)((unsigned char *)&(S[768])+((R<<BF_3)&BF_M))) \
+       )
  #else
  
-/* This will always work, even on 64 bit machines and strangly enough,
- * on the Alpha it is faster than the pointer versions (both 32 and 64
- * versions of BF_LONG) */
+/*
+ * This is a *generic* version. It seems to perform best on platforms
+ * that offer explicit support for extracting 8-bit quantities, preferably
+ * complemented by "multiplying" the array index by sizeof(BF_LONG).
+ * At the time of this writing the list comprises the Alpha CPU, which
+ * features the extbl and s[48]addq instructions.
+ */
+
+#define BF_ENC(LL,R,S,P) ( \
+       LL^=P, \
+       LL^=((( S[       ((int)(R>>24)&0xff)] + \
+               S[0x0100+((int)(R>>16)&0xff)])^ \
+               S[0x0200+((int)(R>> 8)&0xff)])+ \
+               S[0x0300+((int)(R    )&0xff)])&0xffffffffL \
+       )
+#endif
  
-#define BF_ENC(LL,R,S,P) \
-       LL^=P; \
-       LL^=((( S[        (int)(R>>24L)      ] + \
-               S[0x0100+((int)(R>>16L)&0xff)])^ \
-               S[0x0200+((int)(R>> 8L)&0xff)])+ \
-               S[0x0300+((int)(R     )&0xff)])&0xffffffffL;
  #endif