Add reference implementation for bn_[mul|sqr]_mont, new candidates for

author Andy Polyakov <appro@openssl.org>

Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)

committer Andy Polyakov <appro@openssl.org>

Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)
author Andy Polyakov <appro@openssl.org>
Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)
diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h

index 2688684b63512f17b7a1f346bdda1b4e415f8fea..c296e10d251a7d5413ef172081fb58ac64353392 100644 (file)
--- a/crypto/bn/bn.h
+++ b/crypto/bn/bn.h
@@ -727,6 +727,8 @@ int RAND_pseudo_bytes(unsigned char *buf,int num);
         bn_pollute(a); \
         }
  
         bn_pollute(a); \
         }
  
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num);
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num);
  BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
  BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
  void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
  BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
  BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w);
  void     bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, int num);
diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c

index 99bc2de4913e8b88c21c3f610f2e4bdae71bfce5..52af96d36b4ec98a36ee42b9ef80c16a6ea72d3a 100644 (file)
--- a/crypto/bn/bn_asm.c
+++ b/crypto/bn/bn_asm.c
@@ -820,18 +820,95 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
         r[6]=c1;
         r[7]=c2;
         }
         r[6]=c1;
         r[7]=c2;
         }
+
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * This is essentially a reference implementation, which may or may not
+ * result in a performance improvement. E.g. on IA-32 this does give 40%
+ * faster rsa1024 private key operations and 10% faster rsa4096 ones,
+ * while on AMD64 it improves rsa1024 sign only by 10%, but *worsens*
+ * rsa4096 sign by 15%. Once again, it's a reference implementation,
+ * one to be used as a starting point for platform-specific assembler.
+ *
+ * bn_mul_mont: word-by-word Montgomery multiplication.
+ *   rp     - output, num words are written
+ *   ap, bp - operands, num words of each are read
+ *   np     - modulus words (the caller in bn_mont.c passes mont->N.d)
+ *   n0     - per-modulus word (the caller passes mont->n0)
+ *   num    - word count of every array above
+ *
+ * A num+2 word accumulator is taken from the stack with alloca() and is
+ * zeroed through a volatile pointer on every return path.
+ *
+ * NOTE(review): bn_mul_words, bn_mul_add_words, bn_sub_words and the
+ * mul_add macro are defined elsewhere; the comments below assume they
+ * return/propagate the carry (or borrow) word - confirm against their
+ * definitions.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+       {
+       BN_ULONG c0,c1,ml,*tp;
+#ifdef mul64
+       BN_ULONG mh;
+#endif
+       volatile BN_ULONG *vp; /* volatile alias of tp, used only for the final wipe */
+       int i=0,j; /* i must start at 0: the goto below bypasses the for-loop initialiser */
+
+       vp = tp = alloca((num+2)*sizeof(BN_ULONG)); /* num+2 words of stack scratch */
+
+       tp[num]   = bn_mul_words(tp,ap,num,bp[0]); /* first round: tp = ap*bp[0], returned word becomes the top word */
+       tp[num+1] = 0;
+       goto enter; /* skip the multiply-accumulate step for i=0, it was done just above */
+
+       for(i=0;i<num;i++)
+               {
+               c0 = bn_mul_add_words(tp,ap,num,bp[i]); /* tp += ap*bp[i] */
+               c1 = (tp[num] + c0)&BN_MASK2;
+               tp[num]   = c1;
+               tp[num+1] = (c1<c0?1:0); /* c1<c0 means the addition above wrapped */
+       enter:
+               c1  = tp[0];
+               ml = (c1*n0)&BN_MASK2; /* multiplier applied to np in this round */
+               c0 = 0;
+#ifdef mul64
+               mh = HBITS(ml);
+               ml = LBITS(ml);
+               mul_add(c1,np[0],ml,mh,c0); /* word 0: c1 is not stored afterwards, only c0 is carried on */
+#else
+               mul_add(c1,ml,np[0],c0); /* word 0: c1 is not stored afterwards, only c0 is carried on */
+#endif
+               for(j=1;j<num;j++)
+                       {
+                       c1 = tp[j];
+#ifdef mul64
+                       mul_add(c1,np[j],ml,mh,c0);
+#else
+                       mul_add(c1,ml,np[j],c0);
+#endif
+                       tp[j-1] = c1&BN_MASK2; /* stored one slot lower: tp is shifted down by one word */
+                       }
+               c1        = (tp[num] + c0)&BN_MASK2;
+               tp[num-1] = c1; /* top words are folded in and shifted down as well */
+               tp[num]   = tp[num+1] + (c1<c0?1:0);
+               }
+
+       if (tp[num]!=0 || tp[num-1]>=np[num-1]) /* tp may be >= np: try the final subtraction */
+               {
+               c0 = bn_sub_words(rp,tp,np,num); /* rp = tp - np over num words */
+               if (tp[num]!=0 || c0==0) /* keep the subtracted value already in rp */
+                       {
+                       for(i=0;i<num+2;i++)    vp[i] = 0; /* wipe the scratch area */
+                       return;
+                       }
+               }
+       for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0; /* otherwise the result is tp itself: copy out and wipe */
+       vp[num]   = 0;
+       vp[num+1] = 0;
+       }
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+       { /* Montgomery squaring: no dedicated code path, forwards to bn_mul_mont */
+       bn_mul_mont(rp,ap,ap,np,n0,num); /* ap is passed as both operands */
+       }
+#endif /* OPENSSL_BN_ASM_MONT */
+
  #else /* !BN_MUL_COMBA */
  
  /* hmm... is it faster just to do a multiply? */
  #undef bn_sqr_comba4
  #else /* !BN_MUL_COMBA */
  
  /* hmm... is it faster just to do a multiply? */
  #undef bn_sqr_comba4
-void bn_sqr_comba4(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a)
         {
         BN_ULONG t[8];
         bn_sqr_normal(r,a,4,t);
         }
  
  #undef bn_sqr_comba8
         {
         BN_ULONG t[8];
         bn_sqr_normal(r,a,4,t);
         }
  
  #undef bn_sqr_comba8
-void bn_sqr_comba8(BN_ULONG *r, BN_ULONG *a)
+void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a)
         {
         BN_ULONG t[16];
         bn_sqr_normal(r,a,8,t);
         {
         BN_ULONG t[16];
         bn_sqr_normal(r,a,8,t);
@@ -857,4 +934,49 @@ void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b)
         r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
         }
  
         r[15]=bn_mul_add_words(&(r[7]),a,8,b[7]);
         }
  
+#ifdef OPENSSL_BN_ASM_MONT
+/*
+ * Word-by-word Montgomery multiplication built on bn_mul_add_words;
+ * this is the variant compiled when BN_MUL_COMBA is not defined.
+ *
+ *   rp     - output, num words are written
+ *   ap, bp - operands, num words of each are read
+ *   np     - modulus words
+ *   n0     - per-modulus word; each round adds (tp[0]*n0)*np to tp
+ *            before the lowest word of tp is shifted out
+ *   num    - word count of every array above
+ *
+ * The num+2 word accumulator tp lives on the stack (alloca) and is
+ * zeroed through a volatile pointer on every return path.
+ *
+ * Intermediate sums and the per-round multiplier are reduced with
+ * BN_MASK2, matching the BN_MUL_COMBA variant of this function, so
+ * that the c1<c0 carry test stays valid even where BN_ULONG is wider
+ * than the word mask.
+ */
+void bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num)
+       {
+       BN_ULONG c0,c1,*tp;
+       volatile BN_ULONG *vp;
+       int i=0,j;
+
+       vp = tp = alloca((num+2)*sizeof(BN_ULONG));
+
+       /* tp[num+1] is written inside the loop before it is ever read */
+       for(i=0;i<=num;i++)     tp[i]=0;
+
+       for(i=0;i<num;i++)
+               {
+               /* tp += ap*bp[i]; fold the returned word into the top words */
+               c0         = bn_mul_add_words(tp,ap,num,bp[i]);
+               c1         = (tp[num] + c0)&BN_MASK2;
+               tp[num]    = c1;
+               tp[num+1]  = (c1<c0?1:0);
+
+               /* tp += np*(tp[0]*n0); fold the returned word in again */
+               c0         = bn_mul_add_words(tp,np,num,(tp[0]*n0)&BN_MASK2);
+               c1         = (tp[num] + c0)&BN_MASK2;
+               tp[num]    = c1;
+               tp[num+1] += (c1<c0?1:0);
+
+               /* drop the lowest word: shift tp down by one word */
+               for(j=0;j<=num;j++)     tp[j]=tp[j+1];
+               }
+
+       /* tp may be >= np: try the final subtraction straight into rp */
+       if (tp[num]!=0 || tp[num-1]>=np[num-1])
+               {
+               c0 = bn_sub_words(rp,tp,np,num);
+               if (tp[num]!=0 || c0==0)
+                       {
+                       /* subtracted value in rp is the result; wipe scratch */
+                       for(i=0;i<num+2;i++)    vp[i] = 0;
+                       return;
+                       }
+               }
+       /* otherwise the result is tp itself: copy it out and wipe scratch */
+       for(i=0;i<num;i++)      rp[i] = tp[i],  vp[i] = 0;
+       vp[num]   = 0;
+       vp[num+1] = 0;
+       }
+
+void bn_sqr_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *np,BN_ULONG n0, int num)
+       { /* Montgomery squaring: no dedicated code path, forwards to bn_mul_mont */
+       bn_mul_mont(rp,ap,ap,np,n0,num); /* ap is passed as both operands */
+       }
+#endif /* OPENSSL_BN_ASM_MONT */
+
  #endif /* !BN_MUL_COMBA */
  #endif /* !BN_MUL_COMBA */
diff --git a/crypto/bn/bn_mont.c b/crypto/bn/bn_mont.c

index 82af91f90d7f5ca0660c7761a4caafa20a39b640..f70f8ab3efaee45508786d280f09364f814ccf94 100644 (file)
--- a/crypto/bn/bn_mont.c
+++ b/crypto/bn/bn_mont.c
@@ -74,6 +74,22 @@ int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b,
         {
         BIGNUM *tmp;
         int ret=0;
         {
         BIGNUM *tmp;
         int ret=0;
+#ifdef OPENSSL_BN_ASM_MONT
+       int num = mont->N.top;
+
+       if (num>1 && a->top==num && b->top==num)
+               {
+               if (bn_wexpand(r,num) == NULL) return 0;
+               r->neg = a->neg^b->neg;
+               r->top = num;
+               if (a==b)
+                       bn_sqr_mont(r->d,a->d,mont->N.d,mont->n0,num);
+               else
+                       bn_mul_mont(r->d,a->d,b->d,mont->N.d,mont->n0,num);
+               bn_fix_top(r);
+               return 1;
+               }
+#endif
  
         BN_CTX_start(ctx);
         tmp = BN_CTX_get(ctx);
  
         BN_CTX_start(ctx);
         tmp = BN_CTX_get(ctx);
author	Andy Polyakov <appro@openssl.org>
	Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Tue, 4 Oct 2005 06:19:29 +0000 (06:19 +0000)
crypto/bn/bn.h		patch \| blob \| history
crypto/bn/bn_asm.c		patch \| blob \| history
crypto/bn/bn_mont.c		patch \| blob \| history