From: Bodo Möller Date: Thu, 8 Jun 2000 20:26:03 +0000 (+0000) Subject: Slightly faster DSA verification (BN_mod_exp2_mont), X-Git-Tag: OpenSSL-engine-0_9_6-beta1~21^2~80 X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=dc434bbcb0f63e03c64be1d977fae6c9411bfc5c Slightly faster DSA verification (BN_mod_exp2_mont), marginally faster BN_mod_exp for 1024 bit exponents. --- diff --git a/CHANGES b/CHANGES index 28c8d49cf4..c13f0a9e38 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,17 @@ Changes between 0.9.5a and 0.9.6 [xx XXX 2000] + *) Re-implement BN_mod_exp2_mont using independent (and larger) windows. + This makes DSA verification about 2 % faster. + [Bodo Moeller] + + *) Increase maximum window size in BN_mod_exp_... to 6 bits instead of 5 + (meaning that now 2^5 values will be precomputed, which is only 4 KB + plus overhead for 1024 bit moduli). + This makes exponentiations about 0.5 % faster for 1024 bit + exponents (as measured by "openssl speed rsa2048"). + [Bodo Moeller] + *) Rename memory handling macros to avoid conflicts with other software: Malloc => OPENSSL_malloc @@ -13,7 +24,7 @@ [Richard Levitte] *) New function BN_mod_exp_mont_word for small bases (roughly 15-20% - faster than BN_mod_exp_mont). + faster than BN_mod_exp_mont, i.e. 7.5-10% for a full DH exchange). [Bodo Moeller] *) CygWin32 support. diff --git a/TABLE b/TABLE index 49f72e0d46..f1b8180671 100644 --- a/TABLE +++ b/TABLE @@ -632,7 +632,7 @@ $dso_scheme = *** debug-ben $cc = gcc -$cflags = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -O2 -pedantic -Wall -Wshadow -Werror -pipe +$cflags = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -O2 -pedantic -Wall -Wshadow -Werror -pipe $unistd = $thread_cflag = (unknown) $lflags = @@ -650,7 +650,7 @@ $dso_scheme = *** debug-ben-debug $cc = gcc -$cflags = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -g3 -O2 -pedantic -Wall -Wshadow -Werror -pipe +$cflags = -DBN_DEBUG -DREF_CHECK -DBN_CTX_DEBUG -DCRYPTO_MDEBUG -DPEDANTIC -DDEBUG_SAFESTACK -g3 -O2 -pedantic -Wall -Wshadow -Werror -pipe $unistd = $thread_cflag = (unknown) $lflags = @@ -1228,7 +1228,7 @@ $dso_scheme = $cc = cc $cflags = -n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_SGI_MP_SOURCE $lflags = $bn_ops = DES_PTR RC4_CHAR RC4_CHUNK_LL DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT $bn_obj = asm/mips3.o @@ -1246,7 +1246,7 @@ $dso_scheme = $cc = gcc $cflags = -mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_SGI_MP_SOURCE $lflags = $bn_ops = MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK_LL DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT $bn_obj = asm/mips3.o @@ -1264,7 +1264,7 @@ $dso_scheme = $cc = cc $cflags = -64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_SGI_MP_SOURCE $lflags = $bn_ops = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG $bn_obj = asm/mips3.o @@ -1282,7 +1282,7 @@ $dso_scheme = $cc = gcc $cflags = -mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_SGI_MP_SOURCE $lflags = $bn_ops = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG $bn_obj = asm/mips3.o @@ -1300,7 +1300,7 @@ $dso_scheme = $cc = ccc $cflags = -fast -readonly_strings -DL_ENDIAN -DTERMIO $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_REENTRANT $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL $bn_obj = asm/alpha.o @@ -1318,7 +1318,7 @@ $dso_scheme = $cc = gcc $cflags = -O3 -DL_ENDIAN -DTERMIO $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_REENTRANT $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_RISC1 DES_UNROLL $bn_obj = asm/alpha.o @@ -1336,7 +1336,7 @@ $dso_scheme = $cc = ccc $cflags = -fast -readonly_strings -DL_ENDIAN -DTERMIO $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_REENTRANT $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL $bn_obj = asm/alpha.o @@ -1354,7 +1354,7 @@ $dso_scheme = $cc = gcc $cflags = -O3 -DL_ENDIAN -DTERMIO $unistd = -$thread_cflag = (unknown) +$thread_cflag = -D_REENTRANT $lflags = $bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_UNROLL $bn_obj = asm/alpha.o @@ -1859,7 +1859,7 @@ $cc = gcc $cflags = -O3 -mv8 -Dssize_t=int $unistd = $thread_cflag = (unknown) -$lflags = +$lflags = -liberty $bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL DES_PTR DES_RISC1 $bn_obj = $des_obj = diff --git a/crypto/bn/bn.h b/crypto/bn/bn.h index 000ff48155..d292394c0f 100644 --- a/crypto/bn/bn.h +++ b/crypto/bn/bn.h @@ -485,6 +485,7 @@ BN_ULONG bn_sub_words(BN_ULONG *rp, BN_ULONG *ap, BN_ULONG *bp,int num); #define BN_F_BN_CTX_NEW 106 #define BN_F_BN_DIV 107 #define BN_F_BN_EXPAND2 108 +#define BN_F_BN_MOD_EXP2_MONT 118 #define BN_F_BN_MOD_EXP_MONT 109 #define BN_F_BN_MOD_EXP_MONT_WORD 117 #define BN_F_BN_MOD_INVERSE 110 diff --git a/crypto/bn/bn_err.c b/crypto/bn/bn_err.c index e0cbd70b7d..86550c4c21 100644 --- a/crypto/bn/bn_err.c +++ b/crypto/bn/bn_err.c @@ -76,8 +76,9 @@ static ERR_STRING_DATA BN_str_functs[]= {ERR_PACK(0,BN_F_BN_CTX_NEW,0), "BN_CTX_new"}, {ERR_PACK(0,BN_F_BN_DIV,0), "BN_div"}, {ERR_PACK(0,BN_F_BN_EXPAND2,0), "bn_expand2"}, +{ERR_PACK(0,BN_F_BN_MOD_EXP2_MONT,0), "BN_mod_exp2_mont"}, {ERR_PACK(0,BN_F_BN_MOD_EXP_MONT,0), "BN_mod_exp_mont"}, -{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0), "BN_MOD_EXP_MONT_WORD"}, +{ERR_PACK(0,BN_F_BN_MOD_EXP_MONT_WORD,0), "BN_mod_exp_mont_word"}, {ERR_PACK(0,BN_F_BN_MOD_INVERSE,0), "BN_mod_inverse"}, {ERR_PACK(0,BN_F_BN_MOD_MUL_RECIPROCAL,0), "BN_mod_mul_reciprocal"}, {ERR_PACK(0,BN_F_BN_MPI2BN,0), "BN_mpi2bn"}, diff --git a/crypto/bn/bn_exp.c b/crypto/bn/bn_exp.c index 996bdfa107..11540c6f7b 100644 --- a/crypto/bn/bn_exp.c +++ b/crypto/bn/bn_exp.c @@ -121,7 +121,7 @@ #endif -#define TABLE_SIZE 16 +#define TABLE_SIZE 32 /* slow but works */ int BN_mod_mul(BIGNUM *ret, BIGNUM *a, BIGNUM *b, const BIGNUM *m, BN_CTX *ctx) @@ -427,27 +427,22 @@ int BN_mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, ts=1; if (!BN_mod(&(val[0]),a,m,ctx)) goto err; /* 1 */ - if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx)) - goto err; /* 2 */ - - if (bits <= 17) /* This is probably 3 or 0x10001, so just do singles */ - window=1; - else if (bits >= 256) - window=5; /* max size of window */ - else if (bits >= 128) - window=4; - else - window=3; - j=1<<(window-1); - for (i=1; i 1) { - BN_init(&val[i]); - if (!BN_mod_mul_reciprocal(&(val[i]),&(val[i-1]),aa,&recp,ctx)) - goto err; + if (!BN_mod_mul_reciprocal(aa,&(val[0]),&(val[0]),&recp,ctx)) + goto err; /* 2 */ + j=1<<(window-1); + for (i=1; i= 256) - window=5; /* max size of window */ - else if (bits >= 128) - window=4; - else - window=3; - j=1<<(window-1); - for (i=1; i 1) { - BN_init(&(val[i])); - if (!BN_mod_mul_montgomery(&(val[i]),&(val[i-1]),d,mont,ctx)) - goto err; + if (!BN_mod_mul_montgomery(d,&(val[0]),&(val[0]),mont,ctx)) goto err; /* 2 */ + j=1<<(window-1); + for (i=1; i= 256) - window=5; /* max size of window */ - else if (bits >= 128) - window=4; - else - window=3; - j=1<<(window-1); - for (i=1; i 1) { - BN_init(&(val[i])); - if (!BN_mod_mul(&(val[i]),&(val[i-1]),d,m,ctx)) - goto err; + if (!BN_mod_mul(d,&(val[0]),&(val[0]),m,ctx)) + goto err; /* 2 */ + j=1<<(window-1); + for (i=1; i #include "cryptlib.h" #include "bn_lcl.h" -/* I've done some timing with different table sizes. - * The main hassle is that even with bits set at 3, this requires - * 63 BIGNUMs to store the pre-calculated values. - * 512 1024 - * bits=1 75.4% 79.4% - * bits=2 61.2% 62.4% - * bits=3 61.3% 59.3% - * The lack of speed improvement is also a function of the pre-calculation - * which could be removed. - */ -#define EXP2_TABLE_BITS 2 /* 1 2 3 4 5 */ -#define EXP2_TABLE_SIZE 4 /* 2 4 8 16 32 */ +#define TABLE_SIZE 32 int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2, BIGNUM *p2, BIGNUM *m, BN_CTX *ctx, BN_MONT_CTX *in_mont) { - int i,j,k,bits,bits1,bits2,ret=0,wstart,wend,window,xvalue,yvalue; - int start=1,ts=0,x,y; - BIGNUM *d,*aa1,*aa2,*r; - BIGNUM val[EXP2_TABLE_SIZE][EXP2_TABLE_SIZE]; + int i,j,bits,b,bits1,bits2,ret=0,wpos1,wpos2,window1,window2,wvalue1,wvalue2; + int r_is_one=1,ts1=0,ts2=0; + BIGNUM *d,*r; + BIGNUM *a_mod_m; + BIGNUM val1[TABLE_SIZE], val2[TABLE_SIZE]; BN_MONT_CTX *mont=NULL; bn_check_top(a1); @@ -32,7 +133,7 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2, if (!(m->d[0] & 1)) { - BNerr(BN_F_BN_MOD_EXP_MONT,BN_R_CALLED_WITH_EVEN_MODULUS); + BNerr(BN_F_BN_MOD_EXP2_MONT,BN_R_CALLED_WITH_EVEN_MODULUS); return(0); } bits1=BN_num_bits(p1); @@ -42,17 +143,13 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2, BN_one(rr); return(1); } + bits=(bits1 > bits2)?bits1:bits2; BN_CTX_start(ctx); d = BN_CTX_get(ctx); r = BN_CTX_get(ctx); if (d == NULL || r == NULL) goto err; - bits=(bits1 > bits2)?bits1:bits2; - - /* If this is not done, things will break in the montgomery - * part */ - if (in_mont != NULL) mont=in_mont; else @@ -61,139 +158,143 @@ int BN_mod_exp2_mont(BIGNUM *rr, BIGNUM *a1, BIGNUM *p1, BIGNUM *a2, if (!BN_MONT_CTX_set(mont,m,ctx)) goto err; } - BN_init(&(val[0][0])); - BN_init(&(val[1][1])); - BN_init(&(val[0][1])); - BN_init(&(val[1][0])); - ts=1; + window1 = BN_window_bits_for_exponent_size(bits1); + window2 = BN_window_bits_for_exponent_size(bits2); + + /* + * Build table for a1: val1[i] := a1^(2*i + 1) mod m for i = 0 .. 2^(window1-1) + */ + BN_init(&val1[0]); + ts1=1; if (BN_ucmp(a1,m) >= 0) { - BN_mod(&(val[1][0]),a1,m,ctx); - aa1= &(val[1][0]); + if (!BN_mod(&(val1[0]),a1,m,ctx)) + goto err; + a_mod_m = &(val1[0]); } else - aa1=a1; + a_mod_m = a1; + if (!BN_to_montgomery(&(val1[0]),a_mod_m,mont,ctx)) goto err; + if (window1 > 1) + { + if (!BN_mod_mul_montgomery(d,&(val1[0]),&(val1[0]),mont,ctx)) goto err; + + j=1<<(window1-1); + for (i=1; i= 0) { - BN_mod(&(val[0][1]),a2,m,ctx); - aa2= &(val[0][1]); + if (!BN_mod(&(val2[0]),a2,m,ctx)) + goto err; + a_mod_m = &(val2[0]); } else - aa2=a2; - if (!BN_to_montgomery(&(val[1][0]),aa1,mont,ctx)) goto err; - if (!BN_to_montgomery(&(val[0][1]),aa2,mont,ctx)) goto err; - if (!BN_mod_mul_montgomery(&(val[1][1]), - &(val[1][0]),&(val[0][1]),mont,ctx)) - goto err; - -#if 0 - if (bits <= 20) /* This is probably 3 or 0x10001, so just do singles */ - window=1; - else if (bits > 250) - window=5; /* max size of window */ - else if (bits >= 120) - window=4; - else - window=3; -#else - window=EXP2_TABLE_BITS; -#endif - - k=1< 1) { - if (x >= 2) - { - BN_init(&(val[x][0])); - BN_init(&(val[x][1])); - if (!BN_mod_mul_montgomery(&(val[x][0]), - &(val[1][0]),&(val[x-1][0]),mont,ctx)) goto err; - if (!BN_mod_mul_montgomery(&(val[x][1]), - &(val[1][0]),&(val[x-1][1]),mont,ctx)) goto err; - } - for (y=2; y 0, the bottom bit of the first window */ + wpos2=0; /* If wvalue2 > 0, the bottom bit of the second window */ + + if (!BN_to_montgomery(r,BN_value_one(),mont,ctx)) goto err; + for (b=bits-1; b>=0; b--) { - xvalue=BN_is_bit_set(p1,wstart); - yvalue=BN_is_bit_set(p2,wstart); - if (!(xvalue || yvalue)) + if (!r_is_one) { - if (!start) + if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) + goto err; + } + + if (!wvalue1) + if (BN_is_bit_set(p1, b)) { - if (!BN_mod_mul_montgomery(r,r,r,mont,ctx)) - goto err; + /* consider bits b-window1+1 .. b for this window */ + i = b-window1+1; + while (!BN_is_bit_set(p1, i)) + i++; + wpos1 = i; + wvalue1 = 1; + for (i = b-1; i >= wpos1; i--) + { + wvalue1 <<= 1; + if (BN_is_bit_set(p1, i)) + wvalue1++; + } } - wstart--; - if (wstart < 0) break; - continue; - } - /* We now have wstart on a 'set' bit, we now need to work out - * how bit a window to do. To do this we need to scan - * forward until the last set bit before the end of the - * window */ - j=wstart; - /* xvalue=BN_is_bit_set(p1,wstart); already set */ - /* yvalue=BN_is_bit_set(p1,wstart); already set */ - wend=0; - for (i=1; i= wpos2; i--) + { + wvalue2 <<= 1; + if (BN_is_bit_set(p2, i)) + wvalue2++; + } } + + if (wvalue1 && b == wpos1) + { + /* wvalue1 is odd and < 2^window1 */ + if (!BN_mod_mul_montgomery(r,r,&(val1[wvalue1>>1]),mont,ctx)) + goto err; + wvalue1 = 0; + r_is_one = 0; + } - /* wvalue will be an odd number < 2^window */ - if (xvalue || yvalue) + if (wvalue2 && b == wpos2) { - if (!BN_mod_mul_montgomery(r,r,&(val[xvalue][yvalue]), - mont,ctx)) goto err; + /* wvalue2 is odd and < 2^window2 */ + if (!BN_mod_mul_montgomery(r,r,&(val2[wvalue2>>1]),mont,ctx)) + goto err; + wvalue2 = 0; + r_is_one = 0; } - - /* move the 'window' down further */ - wstart-=i; - start=0; - if (wstart < 0) break; } BN_from_montgomery(rr,r,mont,ctx); ret=1; err: if ((in_mont == NULL) && (mont != NULL)) BN_MONT_CTX_free(mont); BN_CTX_end(ctx); - for (i=0; i= 2) and a random 'b' bits exponent, + * the number of multiplications is a constant plus on average + * + * 2^(w-1) + (b-w)/(w+1); + * + * here 2^(w-1) is for precomputing the table (we actually need + * entries only for windows that have the lowest bit set), and + * (b-w)/(w+1) is an approximation for the expected number of + * w-bit windows, not counting the first one. + * + * Thus we should use + * + * w >= 6 if b > 671 + * w = 5 if 671 > b > 239 + * w = 4 if 239 > b > 79 + * w = 3 if 79 > b > 23 + * w <= 2 if 23 > b + * + * (with draws in between). Very small exponents are often selected + * with low Hamming weight, so we use w = 1 for b <= 23. + */ +#if 1 +#define BN_window_bits_for_exponent_size(b) \ + ((b) > 671 ? 6 : \ + (b) > 239 ? 5 : \ + (b) > 79 ? 4 : \ + (b) > 23 ? 3 : 1) +#else +/* Old SSLeay/OpenSSL table. + * Maximum window size was 5, so this table differs for b==1024; + * but it coincides for other interesting values (b==160, b==512). + */ +#define BN_window_bits_for_exponent_size(b) \ + ((b) > 255 ? 5 : \ + (b) > 127 ? 4 : \ + (b) > 17 ? 3 : 1) +#endif + + + /* Pentium pro 16,16,16,32,64 */ /* Alpha 16,16,16,16.64 */ #define BN_MULL_SIZE_NORMAL (16) /* 32 */ diff --git a/util/libeay.num b/util/libeay.num index f798b828e1..ebcb744b0d 100755 --- a/util/libeay.num +++ b/util/libeay.num @@ -1801,3 +1801,5 @@ X509_CRL_digest 2391 d2i_ASN1_SET_OF_PKCS7 2397 EVP_CIPHER_CTX_set_key_length 2399 EVP_CIPHER_CTX_ctrl 2400 +BN_mod_exp_mont_word 2401 +RAND_egd_bytes 2402