RT2163: Remove some unneeded #include's
[openssl.git] / crypto / bn / bn_mont.c
index 4339aab18761535ec53d1a875291d589fc165544..e41f849479a9fe1be09f208f4f163a4af6bbea29 100644 (file)
  * sections 3.8 and 4.2 in http://security.ece.orst.edu/koc/papers/r01rsasw.pdf
  */
 
-#include <stdio.h>
+#define OPENSSL_FIPSAPI
+
 #include "cryptlib.h"
 #include "bn_lcl.h"
 
@@ -177,40 +178,32 @@ err:
 static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
        {
        BIGNUM *n;
-       BN_ULONG *ap,*np,*rp,n0,v,*nrp;
-       int al,nl,max,i,x,ri;
+       BN_ULONG *ap,*np,*rp,n0,v,carry;
+       int nl,max,i;
 
        n= &(mont->N);
-       /* mont->ri is the size of mont->N in bits (rounded up
-          to the word size) */
-       al=ri=mont->ri/BN_BITS2;
-
        nl=n->top;
-       if ((al == 0) || (nl == 0)) { ret->top=0; return(1); }
+       if (nl == 0) { ret->top=0; return(1); }
 
-       max=(nl+al+1); /* allow for overflow (no?) XXX */
+       max=(2*nl); /* carry is stored separately */
        if (bn_wexpand(r,max) == NULL) return(0);
 
        r->neg^=n->neg;
        np=n->d;
        rp=r->d;
-       nrp= &(r->d[nl]);
 
        /* clear the top words of T */
 #if 1
        for (i=r->top; i<max; i++) /* memset? XXX */
-               r->d[i]=0;
+               rp[i]=0;
 #else
-       memset(&(r->d[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
+       memset(&(rp[r->top]),0,(max-r->top)*sizeof(BN_ULONG)); 
 #endif
 
        r->top=max;
        n0=mont->n0[0];
 
-#ifdef BN_COUNT
-       fprintf(stderr,"word BN_from_montgomery_word %d * %d\n",nl,nl);
-#endif
-       for (i=0; i<nl; i++)
+       for (carry=0, i=0; i<nl; i++, rp++)
                {
 #ifdef __TANDEM
                 {
@@ -228,67 +221,33 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
 #else
                v=bn_mul_add_words(rp,np,nl,(rp[0]*n0)&BN_MASK2);
 #endif
-               nrp++;
-               rp++;
-               if (((nrp[-1]+=v)&BN_MASK2) >= v)
-                       continue;
-               else
-                       {
-                       if (((++nrp[0])&BN_MASK2) != 0) continue;
-                       if (((++nrp[1])&BN_MASK2) != 0) continue;
-                       for (x=2; (((++nrp[x])&BN_MASK2) == 0); x++) ;
-                       }
+               v = (v+carry+rp[nl])&BN_MASK2;
+               carry |= (v != rp[nl]);
+               carry &= (v <= rp[nl]);
+               rp[nl]=v;
                }
-       bn_correct_top(r);
 
-       /* mont->ri will be a multiple of the word size and below code
-        * is kind of BN_rshift(ret,r,mont->ri) equivalent */
-       if (r->top < ri)
-               {
-               ret->top=0;
-               return(1);
-               }
-       al=r->top-ri;
+       if (bn_wexpand(ret,nl) == NULL) return(0);
+       ret->top=nl;
+       ret->neg=r->neg;
+
+       rp=ret->d;
+       ap=&(r->d[nl]);
 
 #define BRANCH_FREE 1
 #if BRANCH_FREE
-       if (bn_wexpand(ret,ri) == NULL) return(0);
-       x=0-(((al-ri)>>(sizeof(al)*8-1))&1);
-       ret->top=x=(ri&~x)|(al&x);      /* min(ri,al) */
-       ret->neg=r->neg;
+       {
+       BN_ULONG *nrp;
+       size_t m;
 
-       rp=ret->d;
-       ap=&(r->d[ri]);
-       nrp=ap;
-
-       /* This 'if' denotes violation of 2*M<r^(n-1) boundary condition
-        * formulated by C.D.Walter in "Montgomery exponentiation needs
-        * no final subtractions." Incurred branch can disclose only
-        * information about modulus length, which is not really secret. */
-       if ((mont->N.d[ri-1]>>(BN_BITS2-2))!=0)
-               {
-               size_t m1,m2;
-
-               v=bn_sub_words(rp,ap,mont->N.d,ri);
-               /* this -----------------------^^ works even in al<ri case
-                * thanks to zealous zeroing of top of the vector in the
-                * beginning. */
-
-               /* if (al==ri && !v) || al>ri) nrp=rp; else nrp=ap; */
-               /* in other words if subtraction result is real, then
-                * trick unconditional memcpy below to perform in-place
-                * "refresh" instead of actual copy. */
-               m1=0-(size_t)(((al-ri)>>(sizeof(al)*8-1))&1);   /* al<ri */
-               m2=0-(size_t)(((ri-al)>>(sizeof(al)*8-1))&1);   /* al>ri */
-               m1|=m2;                 /* (al!=ri) */
-               m1|=(0-(size_t)v);      /* (al!=ri || v) */
-               m1&=~m2;                /* (al!=ri || v) && !al>ri */
-               nrp=(BN_ULONG *)(((size_t)rp&~m1)|((size_t)ap&m1));
-               }
+       v=bn_sub_words(rp,ap,np,nl)-carry;
+       /* if subtraction result is real, then
+        * trick unconditional memcpy below to perform in-place
+        * "refresh" instead of actual copy. */
+       m=(0-(size_t)v);
+       nrp=(BN_ULONG *)(((PTR_SIZE_INT)rp&~m)|((PTR_SIZE_INT)ap&m));
 
-       /* 'i<ri' is chosen to eliminate dependency on input data, even
-        * though it results in redundant copy in al<ri case. */
-       for (i=0,ri-=4; i<ri; i+=4)
+       for (i=0,nl-=4; i<nl; i+=4)
                {
                BN_ULONG t1,t2,t3,t4;
                
@@ -301,38 +260,15 @@ static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, BN_MONT_CTX *mont)
                rp[i+2]=t3;
                rp[i+3]=t4;
                }
-       for (ri+=4; i<ri; i++)
+       for (nl+=4; i<nl; i++)
                rp[i]=nrp[i], ap[i]=0;
+       }
 #else
-       if (bn_wexpand(ret,al) == NULL) return(0);
-       ret->top=al;
-       ret->neg=r->neg;
-
-       rp=ret->d;
-       ap=&(r->d[ri]);
-       al-=4;
-       for (i=0; i<al; i+=4)
-               {
-               BN_ULONG t1,t2,t3,t4;
-               
-               t1=ap[i+0];
-               t2=ap[i+1];
-               t3=ap[i+2];
-               t4=ap[i+3];
-               rp[i+0]=t1;
-               rp[i+1]=t2;
-               rp[i+2]=t3;
-               rp[i+3]=t4;
-               }
-       al+=4;
-       for (; i<al; i++)
-               rp[i]=ap[i];
-
-       if (BN_ucmp(ret, &(mont->N)) >= 0)
-               {
-               if (!BN_usub(ret,ret,&(mont->N))) return(0);
-               }
+       if (bn_sub_words (rp,ap,np,nl)-carry)
+               memcpy(rp,ap,nl*sizeof(BN_ULONG));
 #endif
+       bn_correct_top(r);
+       bn_correct_top(ret);
        bn_check_top(ret);
 
        return(1);
@@ -398,6 +334,7 @@ void BN_MONT_CTX_init(BN_MONT_CTX *ctx)
        BN_init(&(ctx->RR));
        BN_init(&(ctx->N));
        BN_init(&(ctx->Ni));
+       ctx->n0[0] = ctx->n0[1] = 0;
        ctx->flags=0;
        }
 
@@ -429,6 +366,7 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                BIGNUM tmod;
                BN_ULONG buf[2];
 
+               BN_init(&tmod);
                tmod.d=buf;
                tmod.dmax=2;
                tmod.neg=0;
@@ -436,6 +374,11 @@ int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx)
                mont->ri=(BN_num_bits(mod)+(BN_BITS2-1))/BN_BITS2*BN_BITS2;
 
 #if defined(OPENSSL_BN_ASM_MONT) && (BN_BITS2<=32)
+               /* Only certain BN_BITS2<=32 platforms actually make use of
+                * n0[1], and we could use the #else case (with a shorter R
+                * value) for the others.  However, currently only the assembler
+                * files do know which is which. */
+
                BN_zero(R);
                if (!(BN_set_bit(R,2*BN_BITS2))) goto err;
 
@@ -533,32 +476,38 @@ BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, BN_MONT_CTX *from)
 BN_MONT_CTX *BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, int lock,
                                        const BIGNUM *mod, BN_CTX *ctx)
        {
-       int got_write_lock = 0;
        BN_MONT_CTX *ret;
 
        CRYPTO_r_lock(lock);
-       if (!*pmont)
+       ret = *pmont;
+       CRYPTO_r_unlock(lock);
+       if (ret)
+               return ret;
+
+       /* We don't want to serialise globally while doing our lazy-init math in
+        * BN_MONT_CTX_set. That punishes threads that are doing independent
+        * things. Instead, punish the case where more than one thread tries to
+        * lazy-init the same 'pmont', by having each do the lazy-init math work
+        * independently and only use the one from the thread that wins the race
+        * (the losers throw away the work they've done). */
+       ret = BN_MONT_CTX_new();
+       if (!ret)
+               return NULL;
+       if (!BN_MONT_CTX_set(ret, mod, ctx))
                {
-               CRYPTO_r_unlock(lock);
-               CRYPTO_w_lock(lock);
-               got_write_lock = 1;
+               BN_MONT_CTX_free(ret);
+               return NULL;
+               }
 
-               if (!*pmont)
-                       {
-                       ret = BN_MONT_CTX_new();
-                       if (ret && !BN_MONT_CTX_set(ret, mod, ctx))
-                               BN_MONT_CTX_free(ret);
-                       else
-                               *pmont = ret;
-                       }
+       /* The locked compare-and-set, after the local work is done. */
+       CRYPTO_w_lock(lock);
+       if (*pmont)
+               {
+               BN_MONT_CTX_free(ret);
+               ret = *pmont;
                }
-       
-       ret = *pmont;
-       
-       if (got_write_lock)
-               CRYPTO_w_unlock(lock);
        else
-               CRYPTO_r_unlock(lock);
-               
+               *pmont = ret;
+       CRYPTO_w_unlock(lock);
        return ret;
        }