X-Git-Url: https://git.openssl.org/gitweb/?a=blobdiff_plain;f=crypto%2Fbn%2Fbn_asm.c;h=c43c91cc09f485fa615eddc61ddb81d20e792bf4;hb=3ebbe8853f0597bc859d6fad5206229aff3ce784;hp=9b8e380c4f7e1ae11110d3c2f9daef864b67c0ed;hpb=ca04d7a20842b2de39264b6c1605c3443fc09d16;p=openssl.git diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c index 9b8e380c4f..c43c91cc09 100644 --- a/crypto/bn/bn_asm.c +++ b/crypto/bn/bn_asm.c @@ -828,19 +828,25 @@ void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) r[7]=c2; } +#ifdef OPENSSL_NO_ASM #ifdef OPENSSL_BN_ASM_MONT +#include <alloca.h> /* * This is essentially reference implementation, which may or may not * result in performance improvement. E.g. on IA-32 this routine was * observed to give 40% faster rsa1024 private key operations and 10% * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a - * reference implementation, one to be used as start-point for - * platform-specific assembler. + * reference implementation, one to be used as starting point for + * platform-specific assembler. Mentioned numbers apply to compiler + * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and + * can vary not only from platform to platform, but even for compiler + * versions. Assembler vs. assembler improvement coefficients can + * [and are known to] differ and are to be documented elsewhere. 
*/ -int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,BN_ULONG n0, int num) +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) { - BN_ULONG c0,c1,ml,*tp; + BN_ULONG c0,c1,ml,*tp,n0; #ifdef mul64 BN_ULONG mh; #endif @@ -848,17 +854,41 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U int i=0,j; #if 0 /* template for platform-specific implementation */ - if (ap==bp) return bn_sqr_mont(rp,ap,np,n0,num); + if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num); #endif vp = tp = alloca((num+2)*sizeof(BN_ULONG)); - tp[num] = bn_mul_words(tp,ap,num,bp[0]); + n0 = *n0p; + + c0 = 0; + ml = bp[0]; +#ifdef mul64 + mh = HBITS(ml); + ml = LBITS(ml); + for (j=0;j +int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_ULONG *np,const BN_ULONG *n0p, int num) { - BN_ULONG c0,c1,*tp; + BN_ULONG c0,c1,*tp,n0=*n0p; volatile BN_ULONG *vp; int i=0,j; @@ -963,12 +996,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, const BN_U for(i=0;i