From fb81ac5e6be4041e59df9362cd63880b21e2cafc Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Wed, 2 Feb 2000 16:18:12 +0000 Subject: [PATCH 1/1] Support for "multiply high" instruction, see BN_UMULT_HIGH comment in crypto/bn/bn_lcl.h for further details. It should be noted that for the moment of this writing the code was tested only on Alpha. If compiled with DEC C the C implementation exhibits 12% performance improvement over the crypto/bn/asm/alpha.s (on EV56 box running AlphaLinux). GNU C is (unfortunately) 8% behind the assembler implementation. But it's OpenVMS Alpha users who *may* benefit most as 'apps/openssl speed rsa' exhibits 6 (six) times performance improvement over the original VMS bignum implementation. Where "*may*" means "as soon as code is enabled though #define SIXTY_FOUR_BIT and crypto/bn/asm/vms.mar is skipped." --- crypto/bn/bn_asm.c | 98 +++++++++++++++++++++++++++++----------------- crypto/bn/bn_div.c | 5 +++ crypto/bn/bn_lcl.h | 85 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 151 insertions(+), 37 deletions(-) diff --git a/crypto/bn/bn_asm.c b/crypto/bn/bn_asm.c index 4d3da16a0c..a62fdcf07a 100644 --- a/crypto/bn/bn_asm.c +++ b/crypto/bn/bn_asm.c @@ -60,7 +60,7 @@ #include "cryptlib.h" #include "bn_lcl.h" -#ifdef BN_LLONG +#if defined(BN_LLONG) || defined(BN_UMULT_HIGH) BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) { @@ -69,18 +69,19 @@ BN_ULONG bn_mul_add_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) bn_check_num(num); if (num <= 0) return(c1); - for (;;) + while (num&~3) { mul_add(rp[0],ap[0],w,c1); - if (--num == 0) break; mul_add(rp[1],ap[1],w,c1); - if (--num == 0) break; mul_add(rp[2],ap[2],w,c1); - if (--num == 0) break; mul_add(rp[3],ap[3],w,c1); - if (--num == 0) break; - ap+=4; - rp+=4; + ap+=4; rp+=4; num-=4; + } + if (num) + { + mul_add(rp[0],ap[0],w,c1); if (--num==0) return c1; + mul_add(rp[1],ap[1],w,c1); if (--num==0) return c1; + mul_add(rp[2],ap[2],w,c1); return c1; } return(c1); @@ -93,19 +94,19 @@ BN_ULONG bn_mul_words(BN_ULONG *rp, BN_ULONG *ap, int num, BN_ULONG w) bn_check_num(num); if (num <= 0) return(c1); - /* for (;;) */ - while (1) /* circumvent egcs-1.1.2 bug */ + while (num&~3) { mul(rp[0],ap[0],w,c1); - if (--num == 0) break; mul(rp[1],ap[1],w,c1); - if (--num == 0) break; mul(rp[2],ap[2],w,c1); - if (--num == 0) break; mul(rp[3],ap[3],w,c1); - if (--num == 0) break; - ap+=4; - rp+=4; + ap+=4; rp+=4; num-=4; + } + if (num) + { + mul(rp[0],ap[0],w,c1); if (--num == 0) return c1; + mul(rp[1],ap[1],w,c1); if (--num == 0) return c1; + mul(rp[2],ap[2],w,c1); } return(c1); } @@ -114,28 +115,19 @@ void bn_sqr_words(BN_ULONG *r, BN_ULONG *a, int n) { bn_check_num(n); if (n <= 0) return; - for (;;) + while (n&~3) { - BN_ULLONG t; - - t=(BN_ULLONG)(a[0])*(a[0]); - r[0]=Lw(t); r[1]=Hw(t); - if (--n == 0) break; - - t=(BN_ULLONG)(a[1])*(a[1]); - r[2]=Lw(t); r[3]=Hw(t); - if (--n == 0) break; - - t=(BN_ULLONG)(a[2])*(a[2]); - r[4]=Lw(t); r[5]=Hw(t); - if (--n == 0) break; - - t=(BN_ULLONG)(a[3])*(a[3]); - r[6]=Lw(t); r[7]=Hw(t); - if (--n == 0) break; - - a+=4; - r+=8; + sqr(r[0],r[1],a[0]); + sqr(r[2],r[3],a[1]); + sqr(r[4],r[5],a[2]); + sqr(r[6],r[7],a[3]); + a+=4; r+=8; n-=4; + } + if (n) + { + sqr(r[0],r[1],a[0]); if (--n == 0) return; + sqr(r[2],r[3],a[1]); if (--n == 0) return; + sqr(r[4],r[5],a[2]); } } @@ -460,6 +452,38 @@ BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) #define sqr_add_c2(a,i,j,c0,c1,c2) \ mul_add_c2((a)[i],(a)[j],c0,c1,c2) + +#elif defined(BN_UMULT_HIGH) + +#define mul_add_c(a,b,c0,c1,c2) { \ + BN_ULONG ta=(a),tb=(b); \ + t1 = ta * tb; \ + t2 = BN_UMULT_HIGH(ta,tb); \ + c0 += t1; t2 += (c0 + */ +# if defined(__alpha) && (defined(SIXTY_FOUR_BIT_LONG) || defined(SIXTY_FOUR_BIT)) +# if defined(__DECC) +# include +# define BN_UMULT_HIGH(a,b) (BN_ULONG)asm("umulh %a0,%a1,%v0",(a),(b)) +# elif defined(__GNUC__) +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret; \ + asm ("umulh %1,%2,%0" \ + : "=r"(ret) \ + : "r"(a), "r"(b)); \ + ret; }) +# endif /* compiler */ +# elif defined(_ARCH_PPC) && defined(__64BIT__) && defined(SIXTY_FOUR_BIT_LONG) +# if defined(__GNUC__) +# define BN_UMULT_HIGH(a,b) ({ \ + register BN_ULONG ret; \ + asm ("mulhdu %0,%1,%2" \ + : "=r"(ret) \ + : "r"(a), "r"(b)); \ + ret; }) +# endif /* compiler */ +# endif /* cpu */ +#endif /* NO_ASM */ + /************************************************************* * Using the long long type */ @@ -151,6 +199,43 @@ extern "C" { (c)= Hw(t); \ } +#define sqr(r0,r1,a) { \ + BN_ULLONG t; \ + t=(BN_ULLONG)(a)*(a); \ + (r0)=Lw(t); \ + (r1)=Hw(t); ] + } + +#elif defined(BN_UMULT_HIGH) +#define mul_add(r,a,w,c) { \ + BN_ULONG high,low,ret,tmp=(a); \ + ret = (r); \ + high= BN_UMULT_HIGH(w,tmp); \ + ret += (c); \ + low = (w) * tmp; \ + (c) = (ret<(c))?1:0; \ + (c) += high; \ + ret += low; \ + (c) += (ret