bn_gf2m.c: optimized BN_GF2m_mod_inv sometimes delivers a 2x speed-up of ECDSA sign.
author: Andy Polyakov <appro@openssl.org>
Wed, 4 May 2011 15:22:53 +0000 (15:22 +0000)
committer: Andy Polyakov <appro@openssl.org>
Wed, 4 May 2011 15:22:53 +0000 (15:22 +0000)
Exact improvement coefficients vary from one benchmark and platform to
another, e.g. it performs 70%-33% better on ARM (here and below, the lower
figure is for longer keys), and 100%-90% better on x86_64.

crypto/bn/bn_gf2m.c

index e170fff..6caf288 100644 (file)
@@ -364,21 +364,17 @@ int BN_GF2m_mod_arr(BIGNUM *r, const BIGNUM *a, const int p[])
 int    BN_GF2m_mod(BIGNUM *r, const BIGNUM *a, const BIGNUM *p)
        {
        int ret = 0;
-       const int max = BN_num_bits(p) + 1;
-       int *arr=NULL;
+       int arr[6];
        bn_check_top(a);
        bn_check_top(p);
-       if ((arr = (int *)OPENSSL_malloc(sizeof(int) * max)) == NULL) goto err;
-       ret = BN_GF2m_poly2arr(p, arr, max);
-       if (!ret || ret > max)
+       ret = BN_GF2m_poly2arr(p, arr, sizeof(arr)/sizeof(arr[0]));
+       if (!ret || ret > sizeof(arr)/sizeof(arr[0]))
                {
                BNerr(BN_F_BN_GF2M_MOD,BN_R_INVALID_LENGTH);
-               goto err;
+               return 0;
                }
        ret = BN_GF2m_mod_arr(r, a, arr);
        bn_check_top(r);
-err:
-       if (arr) OPENSSL_free(arr);
        return ret;
        }
 
@@ -533,18 +529,18 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
 
        BN_CTX_start(ctx);
        
-       b = BN_CTX_get(ctx);
-       c = BN_CTX_get(ctx);
-       u = BN_CTX_get(ctx);
-       v = BN_CTX_get(ctx);
-       if (v == NULL) goto err;
+       if ((b = BN_CTX_get(ctx))==NULL) goto err;
+       if ((c = BN_CTX_get(ctx))==NULL) goto err;
+       if ((u = BN_CTX_get(ctx))==NULL) goto err;
+       if ((v = BN_CTX_get(ctx))==NULL) goto err;
 
-       if (!BN_one(b)) goto err;
        if (!BN_GF2m_mod(u, a, p)) goto err;
-       if (!BN_copy(v, p)) goto err;
-
        if (BN_is_zero(u)) goto err;
 
+       if (!BN_copy(v, p)) goto err;
+#if 0
+       if (!BN_one(b)) goto err;
+
        while (1)
                {
                while (!BN_is_odd(u))
@@ -568,7 +564,75 @@ int BN_GF2m_mod_inv(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx)
                if (!BN_GF2m_add(u, u, v)) goto err;
                if (!BN_GF2m_add(b, b, c)) goto err;
                }
+#else
+       {
+       int i,  ubits = BN_num_bits(u),
+               vbits = BN_num_bits(v), /* v is copy of p */
+               top = p->top;
+       BN_ULONG *udp,*bdp,*vdp,*cdp;
+
+       bn_wexpand(u,top);      udp = u->d;
+                               for (i=u->top;i<top;i++) udp[i] = 0;
+                               u->top = top;
+       bn_wexpand(b,top);      bdp = b->d;
+                               bdp[0] = 1;
+                               for (i=1;i<top;i++) bdp[i] = 0;
+                               b->top = top;
+       bn_wexpand(c,top);      cdp = c->d;
+                               for (i=0;i<top;i++) cdp[i] = 0;
+                               c->top = top;
+       vdp = v->d;     /* It pays off to "cache" *->d pointers, because
+                        * it allows optimizer to be more aggressive.
+                        * But we don't have to "cache" p->d, because *p
+                        * is declared 'const'... */
+       while (1)
+               {
+               while (ubits && !(udp[0]&1))
+                       {
+                       BN_ULONG u0,u1,b0,b1,mask;
+
+                       u0   = udp[0];
+                       b0   = bdp[0];
+                       mask = (BN_ULONG)0-(b0&1);
+                       b0  ^= p->d[0]&mask;
+                       for (i=0;i<top-1;i++)
+                               {
+                               u1 = udp[i+1];
+                               udp[i] = ((u0>>1)|(u1<<(BN_BITS2-1)))&BN_MASK2;
+                               u0 = u1;
+                               b1 = bdp[i+1]^(p->d[i+1]&mask);
+                               bdp[i] = ((b0>>1)|(b1<<(BN_BITS2-1)))&BN_MASK2;
+                               b0 = b1;
+                               }
+                       udp[i] = u0>>1;
+                       bdp[i] = b0>>1;
+                       ubits--;
+                       }
 
+               if (ubits<=BN_BITS2 && udp[0]==1) break;
+
+               if (ubits<vbits)
+                       {
+                       i = ubits; ubits = vbits; vbits = i;
+                       tmp = u; u = v; v = tmp;
+                       tmp = b; b = c; c = tmp;
+                       udp = vdp; vdp = v->d;
+                       bdp = cdp; cdp = c->d;
+                       }
+               for(i=0;i<top;i++)
+                       {
+                       udp[i] ^= vdp[i];
+                       bdp[i] ^= cdp[i];
+                       }
+               if (ubits==vbits)
+                       {
+                       bn_fix_top(u);
+                       ubits = BN_num_bits(u);
+                       }
+               }
+       bn_fix_top(b);
+       }
+#endif
 
        if (!BN_copy(r, b)) goto err;
        bn_check_top(r);