3-4 times better RSA/DSA performance on WIN64A target. Well, on AMD64 CPU,
[openssl.git] / crypto / bn / bn_lcl.h
index 18960f191ba5d11ac68666d3ea2c53305756e8d8..ad4ca7ff305a8e9c274464c8c67f37b7a86edabd 100644
@@ -163,6 +163,45 @@ extern "C" {
 
 
 
+/* BN_mod_exp_mont_consttime is based on the assumption that the
+ * L1 data cache line width of the target processor is at least
+ * the following value.
+ */
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH     ( 64 )
+#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK      (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1)
+
+/* Window sizes optimized for the fixed-window modular exponentiation
+ * algorithm (BN_mod_exp_mont_consttime).
+ *
+ * To achieve the security goals of BN_mod_exp_mont_consttime, the
+ * maximum size of the window must not exceed
+ * log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH).
+ *
+ * Window size thresholds are defined for cache line sizes of 32 and
+ * 64 bytes, where the corresponding maximum window sizes are
+ * log_2(32)=5 and log_2(64)=6 respectively. A window size of 7 should
+ * only be used on processors with a cache line of 128 bytes or more.
+ */
+#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+               ((b) > 937 ? 6 : \
+                (b) > 306 ? 5 : \
+                (b) >  89 ? 4 : \
+                (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE   (6)
+
+#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32
+
+#  define BN_window_bits_for_ctime_exponent_size(b) \
+               ((b) > 306 ? 5 : \
+                (b) >  89 ? 4 : \
+                (b) >  22 ? 3 : 1)
+#  define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE   (5)
+
+#endif
+
+
 /* Pentium pro 16,16,16,32,64 */
 /* Alpha       16,16,16,16,64 */
 #define BN_MULL_SIZE_NORMAL                    (16) /* 32 */
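
For illustration only (not part of the patch): the standalone sketch below shows what the window-selection macro added above yields for a few exponent sizes. Only the macro body is taken from the hunk; the chosen exp_bits values and the printed power count are arbitrary test points.

/* Illustrative only: window size chosen per exponent bit-length. */
#include <stdio.h>

#define BN_window_bits_for_ctime_exponent_size(b) \
        ((b) > 937 ? 6 : \
         (b) > 306 ? 5 : \
         (b) >  89 ? 4 : \
         (b) >  22 ? 3 : 1)

int main(void)
{
        static const int exp_bits[] = { 22, 89, 306, 937, 1024, 2048 };
        size_t i;

        for (i = 0; i < sizeof(exp_bits) / sizeof(exp_bits[0]); i++) {
                int w = BN_window_bits_for_ctime_exponent_size(exp_bits[i]);

                /* 2^w precomputed powers are stored for the fixed window */
                printf("%4d-bit exponent -> window %d bits (%d powers)\n",
                       exp_bits[i], w, 1 << w);
        }
        return 0;
}
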
@@ -231,6 +270,15 @@ extern "C" {
                : "a"(a),"g"(b)         \
                : "cc");
 #  endif
+# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+#  if defined(_MSC_VER) && _MSC_VER>=1400
+    unsigned __int64 __umulh   (unsigned __int64 a,unsigned __int64 b);
+    unsigned __int64 _umul128  (unsigned __int64 a,unsigned __int64 b,
+                                unsigned __int64 *h);
+#   pragma intrinsic(__umulh,_umul128)
+#   define BN_UMULT_HIGH(a,b)          __umulh((a),(b))
+#   define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
+#  endif
 # endif                /* cpu */
 #endif         /* OPENSSL_NO_ASM */
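
As a hedged illustration of what the new WIN64 BN_UMULT_HIGH/BN_UMULT_LOHI definitions compute: the sketch below reproduces the same 64x64->128-bit multiply using GCC/Clang's unsigned __int128 instead of the MSVC _umul128/__umulh intrinsics. The function name umult_lohi is made up for the example and does not appear in the header.

/* Illustrative only: low/high halves of a 64x64-bit product. */
#include <stdint.h>

static uint64_t umult_lohi(uint64_t *high, uint64_t a, uint64_t b)
{
        unsigned __int128 prod = (unsigned __int128)a * b;

        *high = (uint64_t)(prod >> 64);  /* what __umulh(a, b) returns       */
        return (uint64_t)prod;           /* what _umul128(a, b, &h) returns  */
}
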
 
@@ -240,7 +288,7 @@ extern "C" {
 #define Lw(t)    (((BN_ULONG)(t))&BN_MASK2)
 #define Hw(t)    (((BN_ULONG)((t)>>BN_BITS2))&BN_MASK2)
 
-
+#ifdef BN_DEBUG_RAND
 #define bn_clear_top2max(a) \
        { \
        int      ind = (a)->dmax - (a)->top; \
@@ -248,6 +296,9 @@ extern "C" {
        for (; ind != 0; ind--) \
                *(++ftl) = 0x0; \
        }
+#else
+#define bn_clear_top2max(a)
+#endif
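
The hunk above compiles bn_clear_top2max only under BN_DEBUG_RAND, so it becomes a no-op in normal builds. In debug builds the macro zeroes every allocated word above the in-use portion of the number; a rough equivalent over a simplified struct is sketched below (toy_bn and toy_clear_top2max are made-up names, the real macro walks BIGNUM's d/top/dmax fields).

/* Illustrative only: debug-time clearing of words between top and dmax. */
typedef struct {
        unsigned long *d;  /* word array              */
        int top;           /* words currently in use  */
        int dmax;          /* words allocated         */
} toy_bn;

static void toy_clear_top2max(toy_bn *a)
{
        int i;

        /* zero every allocated word above the in-use portion */
        for (i = a->top; i < a->dmax; i++)
                a->d[i] = 0;
}
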
 
 #ifdef BN_LLONG
 #define mul_add(r,a,w,c) { \
@@ -271,6 +322,33 @@ extern "C" {
        (r1)=Hw(t); \
        }
 
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r,a,w,c) {             \
+       BN_ULONG high,low,ret,tmp=(a);  \
+       ret =  (r);                     \
+       BN_UMULT_LOHI(low,high,w,tmp);  \
+       ret += (c);                     \
+       (c) =  (ret<(c))?1:0;           \
+       (c) += high;                    \
+       ret += low;                     \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define mul(r,a,w,c)   {               \
+       BN_ULONG high,low,ret,ta=(a);   \
+       BN_UMULT_LOHI(low,high,w,ta);   \
+       ret =  low + (c);               \
+       (c) =  high;                    \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define sqr(r0,r1,a)   {               \
+       BN_ULONG tmp=(a);               \
+       BN_UMULT_LOHI(r0,r1,tmp,tmp);   \
+       }
+
 #elif defined(BN_UMULT_HIGH)
 #define mul_add(r,a,w,c) {             \
        BN_ULONG high,low,ret,tmp=(a);  \
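
The BN_UMULT_LOHI variant of mul_add() above implements the usual word-level multiply-accumulate with carry: r = r + a*w + c, with the outgoing carry left in c. A minimal sketch of the same operation, assuming a compiler with unsigned __int128 (the macro instead assembles the result from the low/high halves returned by BN_UMULT_LOHI plus explicit carry tests, since MSVC has no 128-bit integer type):

/* Illustrative only: (c_out, r) = r + a*w + c_in on 64-bit words. */
#include <stdint.h>

static uint64_t mul_add_word(uint64_t *r, uint64_t a, uint64_t w, uint64_t c)
{
        /* a*w + *r + c never overflows 128 bits:
         * (2^64-1)^2 + 2*(2^64-1) == 2^128 - 1 */
        unsigned __int128 t = (unsigned __int128)a * w + *r + c;

        *r = (uint64_t)t;            /* low word written back, as (r) = ret */
        return (uint64_t)(t >> 64);  /* high word is the new carry (c)      */
}
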