+
+/*
+ * BN_window_bits_for_exponent_size -- macro for sliding window mod_exp functions
+ * ('b' is expanded up to four times, so it must be free of side effects).
+ *
+ * For window size 'w' (w >= 2) and a random 'b'-bit exponent,
+ * the number of multiplications is a constant plus on average
+ *
+ * 2^(w-1) + (b-w)/(w+1);
+ *
+ * here 2^(w-1) is for precomputing the table (we actually need
+ * entries only for windows that have the lowest bit set), and
+ * (b-w)/(w+1) is an approximation for the expected number of
+ * w-bit windows, not counting the first one.
+ *
+ * Thus we should use
+ *
+ * w >= 6 if b > 671
+ * w = 5 if 671 > b > 239
+ * w = 4 if 239 > b > 79
+ * w = 3 if 79 > b > 23
+ * w <= 2 if 23 > b
+ *
+ * (with draws in between). Very small exponents are often selected
+ * with low Hamming weight, so we use w = 1 for b <= 23.
+ */
+#if 1
+#define BN_window_bits_for_exponent_size(b) \
+ ((b) > 671 ? 6 : \
+ (b) > 239 ? 5 : \
+ (b) > 79 ? 4 : \
+ (b) > 23 ? 3 : 1)
+#else
+/* Old SSLeay/OpenSSL table (compiled out by the "#if 1" above).
+ * Maximum window size was 5, so this table differs for b==1024;
+ * but it coincides for other interesting values (b==160, b==512).
+ */
+#define BN_window_bits_for_exponent_size(b) \
+ ((b) > 255 ? 5 : \
+ (b) > 127 ? 4 : \
+ (b) > 17 ? 3 : 1)
+#endif
+
+
+
+/* BN_mod_exp_mont_consttime is based on the assumption that the
+ * L1 data cache line width of the target processor is at least
+ * the following value; the mask below is that width minus one.
+ */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH ( 64 )
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for the fixed-window modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mod_exp_mont_consttime, the
+ * window size must not exceed log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
+ *
+ * Thresholds are defined only for cache line widths of 32 and 64 bytes
+ * (log_2(32)=5 and log_2(64)=6 respectively); for any other width both
+ * macros below are left undefined. A window size of 7 should only be
+ * used on processors that have a 128 byte or greater cache line size.
+ * 'b' is expanded up to four times, so it must have no side effects.
+ */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+# define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 937 ? 6 : \
+ (b) > 306 ? 5 : \
+ (b) > 89 ? 4 : \
+ (b) > 22 ? 3 : 1)
+# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+# define BN_window_bits_for_ctime_exponent_size(b) \
+ ((b) > 306 ? 5 : \
+ (b) > 89 ? 4 : \
+ (b) > 22 ? 3 : 1)
+# define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5)
+
+#endif
+
+
+/* Pentium pro 16,16,16,32,64 (the values the five sizes below use) */
+/* Alpha 16,16,16,16,64 */
+#define BN_MULL_SIZE_NORMAL (16) /* 32 */
+#define BN_MUL_RECURSIVE_SIZE_NORMAL (16) /* 32 less than */
+#define BN_SQR_RECURSIVE_SIZE_NORMAL (16) /* 32 */
+#define BN_MUL_LOW_RECURSIVE_SIZE_NORMAL (32) /* 32 */
+#define BN_MONT_CTX_SET_SIZE_WORD (64) /* 32 */
+
+#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM) && !defined(PEDANTIC)
+/*
+ * BN_UMULT_HIGH section.
+ *
+ * No, I'm not trying to overwhelm you when stating that the
+ * product of N-bit numbers is 2*N bits wide:-) No, I don't expect
+ * you to be impressed when I say that if the compiler doesn't
+ * support 2*N integer type, then you have to replace every N*N
+ * multiplication with 4 (N/2)*(N/2) accompanied by some shifts
+ * and additions which unavoidably results in severe performance
+ * penalties. Of course provided that the hardware is capable of
+ * producing 2*N result... That's when you normally start
+ * considering assembler implementation. However! It should be
+ * pointed out that some CPUs (most notably Alpha, PowerPC and
+ * upcoming IA-64 family:-) provide *separate* instruction
+ * calculating the upper half of the product placing the result
+ * into a general purpose register. Now *if* the compiler supports
+ * inline assembler, then it's not impossible to implement the
+ * "bignum" routines (and have the compiler optimize 'em)
+ * exhibiting "native" performance in C. That's what BN_UMULT_HIGH
+ * macro is about:-)
+ *
+ * <appro@fy.chalmers.se>
+ */
+# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+# if defined(__DECC) /* DEC C: asm() is used as an expression */
+# include <c_asm.h>
+# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b))
+# elif defined(__GNUC__) && __GNUC__>=2 /* GNU C: statement expression */
+# define BN_UMULT_HIGH(a,b) ({ /* value: upper half of a*b */ \
+ register BN_ULONG ret; \
+ asm ("umulh %1,%2,%0" /* %0 is the output operand (ret) */ \
+ : "=r"(ret) \
+ : "r"(a), "r"(b)); \
+ ret; })
+# endif /* compiler; any other compiler leaves BN_UMULT_HIGH undefined */
+# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG)
+# if defined(__GNUC__) && __GNUC__>=2 /* GNU C statement expression */
+# define BN_UMULT_HIGH(a,b) ({ /* value: upper half of a*b */ \
+ register BN_ULONG ret; \
+ asm ("mulhdu %0,%1,%2" /* output operand (ret) is listed first */ \
+ : "=r"(ret) \
+ : "r"(a), "r"(b)); \
+ ret; })
+# endif /* compiler; any other compiler leaves BN_UMULT_HIGH undefined */
+# elif (defined(__x86_64) || defined(__x86_64__)) && \
+ (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT))
+# if defined(__GNUC__) && __GNUC__>=2 /* GNU C statement expression */
+# define BN_UMULT_HIGH(a,b) ({ /* value: upper half of a*b */ \
+ register BN_ULONG ret,discard; \
+ asm ("mulq %3" /* multiplies the "a" register by %3 */ \
+ : "=a"(discard),"=d"(ret) /* low half dropped, high half kept */ \
+ : "a"(a), "rm"(b) /* mulq takes no immediate, so not "g" */ \
+ : "cc"); \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b) /* low,high = a*b; ends in ';' */ \
+ asm ("mulq %3" \
+ : "=a"(low),"=d"(high) \
+ : "a"(a),"rm"(b) /* register or memory only, as above */ \
+ : "cc");
+# endif
+# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+# if defined(_MSC_VER) && _MSC_VER>=1400 /* prototypes declared by hand */
+ unsigned __int64 __umulh (unsigned __int64 a,unsigned __int64 b);
+ unsigned __int64 _umul128 (unsigned __int64 a,unsigned __int64 b,
+ unsigned __int64 *h);
+# pragma intrinsic(__umulh,_umul128) /* treat both as intrinsics */
+# define BN_UMULT_HIGH(a,b) __umulh((a),(b))
+# define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high))) /* high is written via &(high) */
+# endif
+# elif defined(__mips) && (defined(SIXTY_FOUR_BIT) || defined(SIXTY_FOUR_BIT_LONG))
+# if defined(__GNUC__) && __GNUC__>=2 /* GNU C statement expression */
+# define BN_UMULT_HIGH(a,b) ({ /* value: upper half of a*b */ \
+ register BN_ULONG ret; \
+ asm ("dmultu %1,%2" \
+ : "=h"(ret) /* NOTE(review): newer GCC rejects "h" -- confirm */ \
+ : "r"(a), "r"(b) : "l"); /* "l" is listed as clobbered */ \
+ ret; })
+# define BN_UMULT_LOHI(low,high,a,b) /* expands to a statement with ';' */ \
+ asm ("dmultu %2,%3" \
+ : "=l"(low),"=h"(high) \
+ : "r"(a), "r"(b));
+# endif
+# endif /* cpu */
+#endif /* OPENSSL_NO_ASM */
+