3-4 times better RSA/DSA performance on WIN64A target. Well, on AMD64 CPU,
authorAndy Polyakov <appro@openssl.org>
Thu, 4 Aug 2005 17:35:42 +0000 (17:35 +0000)
committerAndy Polyakov <appro@openssl.org>
Thu, 4 Aug 2005 17:35:42 +0000 (17:35 +0000)
EMT64T will hardly exhibit better performance...

crypto/bn/bn_asm.c
crypto/bn/bn_lcl.h

index 1997808..99bc2de 100644 (file)
@@ -459,6 +459,34 @@ BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, int n)
 #define sqr_add_c2(a,i,j,c0,c1,c2) \
        mul_add_c2((a)[i],(a)[j],c0,c1,c2)
 
+#elif defined(BN_UMULT_LOHI)
+
+#define mul_add_c(a,b,c0,c1,c2)        {       \
+       BN_ULONG ta=(a),tb=(b);         \
+       BN_UMULT_LOHI(t1,t2,ta,tb);     \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define mul_add_c2(a,b,c0,c1,c2) {     \
+       BN_ULONG ta=(a),tb=(b),t0;      \
+       BN_UMULT_LOHI(t0,t1,ta,tb);     \
+       t2 = t1+t1; c2 += (t2<t1)?1:0;  \
+       t1 = t0+t0; t2 += (t1<t0)?1:0;  \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define sqr_add_c(a,i,c0,c1,c2)        {       \
+       BN_ULONG ta=(a)[i];             \
+       BN_UMULT_LOHI(t1,t2,ta,ta);     \
+       c0 += t1; t2 += (c0<t1)?1:0;    \
+       c1 += t2; c2 += (c1<t2)?1:0;    \
+       }
+
+#define sqr_add_c2(a,i,j,c0,c1,c2)     \
+       mul_add_c2((a)[i],(a)[j],c0,c1,c2)
+
 #elif defined(BN_UMULT_HIGH)
 
 #define mul_add_c(a,b,c0,c1,c2)        {       \
index 1c680fc..ad4ca7f 100644 (file)
@@ -270,6 +270,15 @@ extern "C" {
                : "a"(a),"g"(b)         \
                : "cc");
 #  endif
+# elif (defined(_M_AMD64) || defined(_M_X64)) && defined(SIXTY_FOUR_BIT)
+#  if defined(_MSC_VER) && _MSC_VER>=1400
+    unsigned __int64 __umulh   (unsigned __int64 a,unsigned __int64 b);
+    unsigned __int64 _umul128  (unsigned __int64 a,unsigned __int64 b,
+                                unsigned __int64 *h);
+#   pragma intrinsic(__umulh,_umul128)
+#   define BN_UMULT_HIGH(a,b)          __umulh((a),(b))
+#   define BN_UMULT_LOHI(low,high,a,b) ((low)=_umul128((a),(b),&(high)))
+#  endif
 # endif                /* cpu */
 #endif         /* OPENSSL_NO_ASM */
 
@@ -313,6 +322,33 @@ extern "C" {
        (r1)=Hw(t); \
        }
 
+#elif defined(BN_UMULT_LOHI)
+#define mul_add(r,a,w,c) {             \
+       BN_ULONG high,low,ret,tmp=(a);  \
+       ret =  (r);                     \
+       BN_UMULT_LOHI(low,high,w,tmp);  \
+       ret += (c);                     \
+       (c) =  (ret<(c))?1:0;           \
+       (c) += high;                    \
+       ret += low;                     \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define mul(r,a,w,c)   {               \
+       BN_ULONG high,low,ret,ta=(a);   \
+       BN_UMULT_LOHI(low,high,w,ta);   \
+       ret =  low + (c);               \
+       (c) =  high;                    \
+       (c) += (ret<low)?1:0;           \
+       (r) =  ret;                     \
+       }
+
+#define sqr(r0,r1,a)   {               \
+       BN_ULONG tmp=(a);               \
+       BN_UMULT_LOHI(r0,r1,tmp,tmp);   \
+       }
+
 #elif defined(BN_UMULT_HIGH)
 #define mul_add(r,a,w,c) {             \
        BN_ULONG high,low,ret,tmp=(a);  \