From: Andy Polyakov Date: Thu, 7 Oct 1999 12:03:59 +0000 (+0000) Subject: RC4 tune-up featuring 30-40% performance improvement on most RISC X-Git-Tag: OpenSSL_0_9_5beta1~482 X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=2dae04d03880cc88a48700e423ee94f9b94242bd;ds=sidebyside RC4 tune-up featuring 30-40% performance improvement on most RISC platforms. See crypto/rc4/rc4_enc.c for further details. --- diff --git a/Configure b/Configure index 5df1aee351..4107d8a303 100755 --- a/Configure +++ b/Configure @@ -51,6 +51,10 @@ my $usage="Usage: Configure [-Dxxx] [-lxxx] [-Lxxx] [-fxxx] [-Kxxx] [rsaref] [no # RC4_LONG use 'long' instead of 'int' for RC4_INT in crypto/rc4/rc4.h # RC4_INDEX define RC4_INDEX in crypto/rc4/rc4_locl.h. This turns on # array lookups instead of pointer use. +# RC4_CHUNK enables code that handles data aligned at long (natural CPU +# word) boundary. +# RC4_CHUNK_LL enables code that handles data aligned at long long boundary +# (intended for 64-bit CPUs running 32-bit OS). # BF_PTR use 'pointer arithmatic' for Blowfish (unsafe on Alpha). # BF_PTR2 intel specific version (generic version is more efficient). # MD5_ASM use some extra md5 assember, @@ -109,59 +113,59 @@ my %table=( "solaris-x86-gcc","gcc:-O3 -fomit-frame-pointer -m486 -Wall -DL_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG $x86_gcc_des $x86_gcc_opts:$x86_sol_asm", #### SPARC Solaris with GNU C setups -"solaris-sparcv7-gcc","gcc:-O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:::", -"solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::", -"solaris-sparcv9-gcc","gcc:-mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"solaris-sparcv7-gcc","gcc:-O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:::", +"solaris-sparcv8-gcc","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8.o::", +"solaris-sparcv9-gcc","gcc:-mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", # gcc pre-2.8 doesn't understand -mcpu=ultrasparc, so fall down to -mv8 # but keep the assembler modules. -"solaris-sparcv9-gcc27","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus-gcc27.o:::asm/md5-sparcv8plus-gcc27.o:", +"solaris-sparcv9-gcc27","gcc:-mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8plus-gcc27.o:::asm/md5-sparcv8plus-gcc27.o:", #### -"debug-solaris-sparcv8-gcc","gcc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::", -"debug-solaris-sparcv9-gcc","gcc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mcpu=ultrasparc -Wall -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus.o::", +"debug-solaris-sparcv8-gcc","gcc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8.o::", +"debug-solaris-sparcv9-gcc","gcc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mcpu=ultrasparc -Wall -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8plus.o::", #### SPARC Solaris with Sun C setups # DO NOT use /xO[34] on sparc with SC3.0. It is broken, and will not pass the tests -"solaris-sparc-sc3","cc:-fast -O -Xa -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR:::", +"solaris-sparc-sc3","cc:-fast -O -Xa -DB_ENDIAN::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL BF_PTR:::", # SC4.0 doesn't pass 'make test', upgrade to SC5.0 or SC4.2. # SC4.2 is ok, better than gcc even on bn as long as you tell it -xarch=v8 # SC5.0 note: Compiler common patch 107357-01 or later is required! -"solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::", -"solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", -"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plus -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", -"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", +"solaris-sparcv7-cc","cc:-xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:::", +"solaris-sparcv8-cc","cc:-xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", +"solaris-sparcv9-cc","cc:-xtarget=ultra -xarch=v8plus -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"solaris64-sparcv9-cc","cc:-xtarget=ultra -xarch=v9 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", #### -"debug-solaris-sparcv8-cc","cc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", -"debug-solaris-sparcv9-cc","cc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"debug-solaris-sparcv8-cc","cc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8.o::", +"debug-solaris-sparcv9-cc","cc:-DREF_CHECK -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g -O -xstrconst -Xa -DB_ENDIAN -DBN_DIV2W -DULTRASPARC::-D_REENTRANT:-lsocket -lnsl:BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", #### SPARC Linux setups -"linux-sparcv7","gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::BN_LLONG RC4_CHAR DES_UNROLL BF_PTR::", +"linux-sparcv7","gcc:-DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::", # Ray Miller has patiently # assisted with debugging of following two configs. -"linux-sparcv8","gcc:-mv8 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8.o::::", +"linux-sparcv8","gcc:-mv8 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8.o::::", # it's a real mess with -mcpu=ultrasparc option under Linux, but # -Wa,-Av8plus should do the trick no matter what. -"linux-sparcv9","gcc:-mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plus -DULTRASPARC -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", +"linux-sparcv9","gcc:-mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -Wa,-Av8plus -DULTRASPARC -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR:asm/sparcv8plus.o:::asm/md5-sparcv8plus.o:", # !!!Folowing can't be even tested yet!!! # We have to wait till 64-bit glibc for SPARC is operational!!! -#"linux64-sparcv9","sparc64-linux-gcc:-m64 -mcpu=v9 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DULTRASPARC -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", +#"linux64-sparcv9","sparc64-linux-gcc:-m64 -mcpu=v9 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DULTRASPARC -DBN_DIV2W::-D_REENTRANT::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR::::asm/md5-sparcv9.o:", # Sunos configs, assuming sparc for the gcc one. ##"sunos-cc", "cc:-O4 -DNOPROTO -DNOCONST::(unknown)::DES_UNROLL:::", -"sunos-gcc","gcc:-O3 -mv8::(unknown)::BN_LLONG RC4_CHAR DES_UNROLL DES_PTR DES_RISC1:::", +"sunos-gcc","gcc:-O3 -mv8::(unknown)::BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL DES_PTR DES_RISC1:::", #### IRIX 5.x configs # -mips2 flag is added by ./config when appropriate. -"irix-gcc","gcc:-O3 -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR:::", -"irix-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG DES_PTR DES_RISC2 DES_UNROLL BF_PTR:::", +"irix-gcc","gcc:-O3 -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK DES_UNROLL DES_RISC2 DES_PTR BF_PTR:::", +"irix-cc", "cc:-O2 -use_readonly_const -DTERMIOS -DB_ENDIAN::(unknown)::BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC2 DES_UNROLL BF_PTR:::", #### IRIX 6.x configs # Only N32 and N64 ABIs are supported. If you need O32 ABI build, invoke # './Configure irix-[g]cc' manually. # -mips4 flag is added by ./config when appropriate. -"irix-mips3-gcc","gcc:-mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::", -"irix-mips3-cc", "cc:-n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::", +"irix-mips3-gcc","gcc:-mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK_LL DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::", +"irix-mips3-cc", "cc:-n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_PTR RC4_CHAR RC4_CHUNK_LL DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT:asm/mips3.o::", # N64 ABI builds. -"irix64-mips4-gcc","gcc:-mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::", -"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::", +"irix64-mips4-gcc","gcc:-mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::", +"irix64-mips4-cc", "cc:-64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W::(unknown)::RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG:asm/mips3.o::", # HPUX 9.X config. # Don't use the bundled cc. It is broken. Use HP ANSI C if possible, or @@ -186,17 +190,18 @@ my %table=( # If hpux10-gcc fails, try this one: "hpux10-brokengcc", "gcc:-DB_ENDIAN -DBN_DIV2W -O3::-D_REENTRANT::DES_PTR DES_UNROLL DES_RISC1:::", -# HPUX 11.X -"hpux11-32bit-cc","cc:+DAportable -DB_ENDIAN -D_HPUX_SOURCE -Aa -Ae +ESlit::-D_REENTRANT::DES_PTR DES_UNROLL DES_RISC1:::", +# HPUX 11.X from www.globus.org. +# Only works on PA-RISC 2.0 cpus, and not optimized. Why? +"hpux11-32bit-cc","cc:+DA2.0 -DB_ENDIAN -D_HPUX_SOURCE -Aa -Ae +ESlit::-D_REENTRANT::DES_PTR DES_UNROLL DES_RISC1:::", "hpux11-64bit-cc","cc:+DA2.0W -g -D_HPUX_SOURCE -Aa -Ae +ESlit::-D_REENTRANT::SIXTY_FOUR_BIT_LONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC1 DES_INT :::", # Dec Alpha, OSF/1 - the alpha164-cc is the flags for a 21164A with # the new compiler # For gcc, the following gave a %50 speedup on a 164 over the 'DES_INT' version -"alpha-gcc","gcc:-O3::(unknown)::SIXTY_FOUR_BIT_LONG DES_UNROLL DES_RISC1:::", -"alpha-cc", "cc:-std1 -tune host -O4 -readonly_strings::(unknown)::SIXTY_FOUR_BIT_LONG:::", -"alpha164-cc", "cc:-std1 -tune host -fast -readonly_strings::(unknown)::SIXTY_FOUR_BIT_LONG:::", -"FreeBSD-alpha","gcc:-DTERMIOS -O3 -fomit-frame-pointer::(unknown)::SIXTY_FOUR_BIT_LONG DES_INT DES_PTR DES_RISC2:::", +"alpha-gcc","gcc:-O3::(unknown)::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_RISC1:::", +"alpha-cc", "cc:-std1 -tune host -O4 -readonly_strings::(unknown)::SIXTY_FOUR_BIT_LONG RC4_CHUNK:::", +"alpha164-cc", "cc:-std1 -tune host -fast -readonly_strings::(unknown)::SIXTY_FOUR_BIT_LONG RC4_CHUNK:::", +"FreeBSD-alpha","gcc:-DTERMIOS -O3 -fomit-frame-pointer::(unknown)::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC2:::", # assembler versions -- currently defunct: ##"alpha-gcc","gcc:-O3::(unknown)::SIXTY_FOUR_BIT_LONG DES_UNROLL DES_RISC1:asm/alpha.o::", @@ -252,7 +257,7 @@ my %table=( # for some st_addr stuff, and then sizeof and address-of fails # I could not use the ams/alpha.o option because the Cray assembler, 'cam' # did not like it. -"cray-t3e", "cc: -DBIT_FIELD_LIMITS -DTERMIOS::(unknown)::SIXTY_FOUR_BIT_LONG DES_INT:::", +"cray-t3e", "cc: -DBIT_FIELD_LIMITS -DTERMIOS::(unknown)::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:::", # DGUX, 88100. "dgux-R3-gcc", "gcc:-O3 -fomit-frame-pointer::(unknown)::RC4_INDEX DES_UNROLL:::", @@ -602,6 +607,7 @@ my $md2_int=$def_int; my $idea_int=$def_int; my $rc2_int=$def_int; my $rc4_idx=0; +my $rc4_chunk=0; my $bf_ptr=0; my @type=("char","short","int","long"); my ($b64l,$b64,$b32,$b16,$b8)=(0,0,1,0,0); @@ -619,6 +625,8 @@ foreach (sort split(/\s+/,$bn_ops)) $rc4_int=0 if /RC4_CHAR/; $rc4_int=3 if /RC4_LONG/; $rc4_idx=1 if /RC4_INDEX/; + $rc4_chunk=1 if /RC4_CHUNK/; + $rc4_chunk=2 if /RC4_CHUNK_LL/; $md2_int=0 if /MD2_CHAR/; $md2_int=3 if /MD2_LONG/; $idea_int=1 if /IDEA_SHORT/; @@ -670,6 +678,12 @@ while () { printf OUT "#%s DES_UNROLL\n",($des_unroll)?'define':'undef'; } elsif (/^#define\s+RC4_INT\s/) { printf OUT "#define RC4_INT unsigned %s\n",$type[$rc4_int]; } + elsif (/^#undef\s+RC4_CHUNK/) + { + printf OUT "#undef RC4_CHUNK\n" if $rc4_chunk==0; + printf OUT "#define RC4_CHUNK unsigned long\n" if $rc4_chunk==1; + printf OUT "#define RC4_CHUNK unsigned long long\n" if $rc4_chunk==2; + } elsif (/^#((define)|(undef))\s+RC4_INDEX/) { printf OUT "#%s RC4_INDEX\n",($rc4_idx)?"define":"undef"; } elsif (/^#(define|undef)\s+I386_ONLY/) @@ -709,6 +723,9 @@ print "DES_INT used\n" if $des_int; print "BN_LLONG mode\n" if $bn_ll; print "RC4 uses u$type[$rc4_int]\n" if $rc4_int != $def_int; print "RC4_INDEX mode\n" if $rc4_idx; +print "RC4_CHUNK is undefined\n" if $rc4_chunk==0; +print "RC4_CHUNK is unsigned long\n" if $rc4_chunk==1; +print "RC4_CHUNK is unsigned long long\n" if $rc4_chunk==2; print "MD2 uses u$type[$md2_int]\n" if $md2_int != $def_int; print "IDEA uses u$type[$idea_int]\n" if $idea_int != $def_int; print "RC2 uses u$type[$rc2_int]\n" if $rc2_int != $def_int; diff --git a/TABLE b/TABLE index ba55be71d0..4a53987d2f 100644 --- a/TABLE +++ b/TABLE @@ -91,7 +91,7 @@ $cflags = -DTERMIOS -O3 -fomit-frame-pointer $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG DES_INT DES_PTR DES_RISC2 +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC2 $bn_obj = $des_obj = $bf_obj = @@ -448,7 +448,7 @@ $cflags = -std1 -tune host -O4 -readonly_strings $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK $bn_obj = $des_obj = $bf_obj = @@ -465,7 +465,7 @@ $cflags = -O3 $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG DES_UNROLL DES_RISC1 +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_UNROLL DES_RISC1 $bn_obj = $des_obj = $bf_obj = @@ -482,7 +482,7 @@ $cflags = -std1 -tune host -fast -readonly_strings $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK $bn_obj = $des_obj = $bf_obj = @@ -550,7 +550,7 @@ $cflags = -DBIT_FIELD_LIMITS -DTERMIOS $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = SIXTY_FOUR_BIT_LONG DES_INT +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT $bn_obj = $des_obj = $bf_obj = @@ -703,7 +703,7 @@ $cflags = -DREF_CHECK -DCRYPTO_MDEBUG_ALL -xarch=v8 -g -O -xstrconst -Xa - $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = asm/sparcv8.o $des_obj = $bf_obj = @@ -720,7 +720,7 @@ $cflags = -DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mv8 -Wall -DB_ENDIAN $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8.o $des_obj = $bf_obj = @@ -737,7 +737,7 @@ $cflags = -DREF_CHECK -DCRYPTO_MDEBUG_ALL -xtarget=ultra -xarch=v8plus -g $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus.o $des_obj = $bf_obj = @@ -754,7 +754,7 @@ $cflags = -DREF_CHECK -DCRYPTO_MDEBUG_ALL -O -g -mcpu=ultrasparc -Wall -DB $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus.o $des_obj = $bf_obj = @@ -1043,7 +1043,7 @@ $cflags = -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = BN_LLONG DES_PTR DES_RISC2 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC2 DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1060,7 +1060,7 @@ $cflags = -O3 -DTERMIOS -DB_ENDIAN $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR +$bn_ops = BN_LLONG MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK DES_UNROLL DES_RISC2 DES_PTR BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1077,7 +1077,7 @@ $cflags = -n32 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = DES_PTR DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT +$bn_ops = DES_PTR RC4_CHAR RC4_CHUNK_LL DES_RISC2 DES_UNROLL BF_PTR SIXTY_FOUR_BIT $bn_obj = asm/mips3.o $des_obj = $bf_obj = @@ -1094,7 +1094,7 @@ $cflags = -mabi=n32 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = MD2_CHAR RC4_INDEX RC4_CHAR DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT +$bn_ops = MD2_CHAR RC4_INDEX RC4_CHAR RC4_CHUNK_LL DES_UNROLL DES_RISC2 DES_PTR BF_PTR SIXTY_FOUR_BIT $bn_obj = asm/mips3.o $des_obj = $bf_obj = @@ -1111,7 +1111,7 @@ $cflags = -64 -mips4 -O2 -use_readonly_const -DTERMIOS -DB_ENDIAN -DBN_DIV $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG +$bn_ops = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG $bn_obj = asm/mips3.o $des_obj = $bf_obj = @@ -1128,7 +1128,7 @@ $cflags = -mabi=64 -mips4 -mmips-as -O3 -DTERMIOS -DB_ENDIAN -DBN_DIV3W $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG +$bn_ops = RC4_CHAR RC4_CHUNK DES_RISC2 DES_UNROLL SIXTY_FOUR_BIT_LONG $bn_obj = asm/mips3.o $des_obj = $bf_obj = @@ -1213,7 +1213,7 @@ $cflags = -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall $unistd = $thread_cflag = -D_REENTRANT $lflags = -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1230,7 +1230,7 @@ $cflags = -mv8 -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -Wall -DBN_DIV $unistd = $thread_cflag = -D_REENTRANT $lflags = -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8.o $des_obj = $bf_obj = @@ -1247,7 +1247,7 @@ $cflags = -mcpu=ultrasparc -DB_ENDIAN -DTERMIO -O3 -fomit-frame-pointer -W $unistd = $thread_cflag = -D_REENTRANT $lflags = -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus.o $des_obj = $bf_obj = @@ -1366,7 +1366,7 @@ $cflags = -fast -O -Xa -DB_ENDIAN $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1383,7 +1383,7 @@ $cflags = -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1400,7 +1400,7 @@ $cflags = -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1417,7 +1417,7 @@ $cflags = -xarch=v8 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN -DBN_DIV2W $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = asm/sparcv8.o $des_obj = $bf_obj = @@ -1434,7 +1434,7 @@ $cflags = -mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8.o $des_obj = $bf_obj = @@ -1451,7 +1451,7 @@ $cflags = -xtarget=ultra -xarch=v8plus -xO5 -xstrconst -xdepend -Xa -DB_EN $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK_LL DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus.o $des_obj = $bf_obj = @@ -1468,7 +1468,7 @@ $cflags = -mcpu=ultrasparc -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_ $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus.o $des_obj = $bf_obj = @@ -1485,7 +1485,7 @@ $cflags = -mv8 -O3 -fomit-frame-pointer -Wall -DB_ENDIAN -DBN_DIV2W -DULTR $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL BF_PTR +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL BF_PTR $bn_obj = asm/sparcv8plus-gcc27.o $des_obj = $bf_obj = @@ -1519,7 +1519,7 @@ $cflags = -xtarget=ultra -xarch=v9 -xO5 -xstrconst -xdepend -Xa -DB_ENDIAN $unistd = $thread_cflag = -D_REENTRANT $lflags = -lsocket -lnsl -$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR +$bn_ops = SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL BF_PTR $bn_obj = $des_obj = $bf_obj = @@ -1536,7 +1536,7 @@ $cflags = -O3 -mv8 $unistd = $thread_cflag = (unknown) $lflags = -$bn_ops = BN_LLONG RC4_CHAR DES_UNROLL DES_PTR DES_RISC1 +$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_UNROLL DES_PTR DES_RISC1 $bn_obj = $des_obj = $bf_obj = diff --git a/crypto/opensslconf.h.in b/crypto/opensslconf.h.in index e4a8f8ad54..522545e0aa 100644 --- a/crypto/opensslconf.h.in +++ b/crypto/opensslconf.h.in @@ -25,11 +25,25 @@ #define RC2_INT unsigned int #endif -#if defined(HEADER_RC4_H) && !defined(RC4_INT) +#if defined(HEADER_RC4_H) +#if !defined(RC4_INT) /* using int types make the structure larger but make the code faster * on most boxes I have tested - up to %20 faster. */ +/* + * I don't know what does "most" mean, but declaring "int" is a must on: + * - Intel P6 because partial register stalls are very expensive; + * - elder Alpha because it lacks byte load/store instructions; + */ #define RC4_INT unsigned int #endif +#if !defined(RC4_CHUNK) +/* + * This enables code handling data aligned at natural CPU word + * boundary. See crypto/rc4/rc4_enc.c for further details. + */ +#undef RC4_CHUNK +#endif +#endif #if defined(HEADER_DES_H) && !defined(DES_LONG) /* If this is set to 'unsigned int' on a DEC Alpha, this gives about a diff --git a/crypto/rc4/rc4_enc.c b/crypto/rc4/rc4_enc.c index 93a75cd8f9..35dbc7c47e 100644 --- a/crypto/rc4/rc4_enc.c +++ b/crypto/rc4/rc4_enc.c @@ -78,7 +78,7 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, y=key->y; d=key->data; -#if defined(RC4_CHUNK) && (defined(L_ENDIAN) || defined(B_ENDIAN)) +#if defined(RC4_CHUNK) /* * The original reason for implementing this(*) was the fact that * pre-21164a Alpha CPUs don't have byte load/store instructions @@ -87,21 +87,30 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, * at natural word size made it possible to reduce amount of * instructions as well as to perform early read-ahead without * suffering from RAW (read-after-write) hazard. This resulted - * in >40%(**) performance improvement (on 21064 box with gcc). + * in ~40%(**) performance improvement on 21064 box with gcc. * But it's not only Alpha users who win here:-) Thanks to the * early-n-wide read-ahead this implementation also exhibits - * >40% speed-up on SPARC and almost 20% on MIPS. + * >40% speed-up on SPARC and 20-30% on 64-bit MIPS (depending + * on sizeof(RC4_INT)). * * (*) "this" means code which recognizes the case when input * and output pointers appear to be aligned at natural CPU - * word boundary. + * word boundary * (**) i.e. according to 'apps/openssl speed rc4' benchmark, - * crypto/rc4/rc4speed.c exhibits almost 70% speed-up. + * crypto/rc4/rc4speed.c exhibits almost 70% speed-up... + * + * Cavets. + * + * - RC4_CHUNK="unsigned long long" should be a #1 choice for + * UltraSPARC. Unfortunately gcc generates very slow code + * (2.5-3 times slower than one generated by Sun's WorkShop + * C) and therefore gcc (at least 2.95 and earlier) should + * always be told that RC4_CHUNK="unsigned long". * * */ -#define RC4_STEP ( \ +# define RC4_STEP ( \ x=(x+1) &0xff, \ tx=d[x], \ y=(tx+y)&0xff, \ @@ -111,70 +120,148 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, (RC4_CHUNK)d[(tx+ty)&0xff]\ ) -#if defined(L_ENDIAN) -# define SHFT(c) ((c)*8) -# define MASK(i) (((RC4_CHUNK)-1)>>((sizeof(RC4_CHUNK)-(i))<<3)) -# define SHINC 8 -#elif defined(B_ENDIAN) -# define SHFT(c) ((sizeof(RC4_CHUNK)-(c)-1)*8) -# define MASK(i) (((RC4_CHUNK)-1)<<((sizeof(RC4_CHUNK)-(i))<<3)) -# define SHINC -8 -#else -# error "L_ENDIAN or B_ENDIAN *must* be defined!" -#endif - if ( ( ((unsigned long)indata & (sizeof(RC4_CHUNK)-1)) | - ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 - ) { - RC4_CHUNK ichunk,cipher; + ((unsigned long)outdata & (sizeof(RC4_CHUNK)-1)) ) == 0 ) + { + RC4_CHUNK ichunk,otp; + const union { long one; char little; } is_endian = {1}; - for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) { - ichunk = *(RC4_CHUNK *)indata; - cipher = RC4_STEP< + * who also stands for the is_endian union:-) + * + * Special notes. + * + * - is_endian is declared automatic as doing otherwise + * (declaring static) prevents gcc from eliminating + * the redundant code; + * - compilers (those I've tried) don't seem to have + * problems eliminating either the operators guarded + * by "if (sizeof(RC4_CHUNK)==8)" or the condition + * expressions themselves so I've got 'em to replace + * corresponding #ifdefs from the previous version; + * - I chose to let the redundant switch cases when + * sizeof(RC4_CHUNK)!=8 be (were also #ifdefed + * before); + * - in case you wonder "&(sizeof(RC4_CHUNK)*8-1)" in + * [LB]ESHFT guards against "shift is out of range" + * warnings when sizeof(RC4_CHUNK)!=8 + * + * + */ + if (!is_endian.little) + { /* BIG-ENDIAN CASE */ +# define BESHFT(c) (((sizeof(RC4_CHUNK)-(c)-1)*8)&(sizeof(RC4_CHUNK)*8-1)) + for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) + { + ichunk = *(RC4_CHUNK *)indata; + otp = RC4_STEP<x=x; + key->y=y; + return; + } + else + { /* LITTLE-ENDIAN CASE */ +# define LESHFT(c) (((c)*8)&(sizeof(RC4_CHUNK)*8-1)) + for (;len&-sizeof(RC4_CHUNK);len-=sizeof(RC4_CHUNK)) + { + ichunk = *(RC4_CHUNK *)indata; + otp = RC4_STEP; + otp |= RC4_STEP<<8; + otp |= RC4_STEP<<16; + otp |= RC4_STEP<<24; + if (sizeof(RC4_CHUNK)==8) + { + otp |= RC4_STEP<>= (sizeof(RC4_CHUNK)-len)<<3; + switch (len&(sizeof(RC4_CHUNK)-1)) + { + case 7: otp = RC4_STEP, i+=8; + case 6: otp |= RC4_STEP<x=x; + key->y=y; + return; } - ochunk &= ~mask; - ochunk |= (cipher^ichunk) & mask; - *(RC4_CHUNK *)outdata = ochunk; } - } - else #endif - { #define LOOP(in,out) \ x=((x+1)&0xff); \ tx=d[x]; \ @@ -223,7 +310,6 @@ void RC4(RC4_KEY *key, unsigned long len, unsigned char *indata, RC4_LOOP(indata,outdata,6); if (--i == 0) break; } } - } key->x=x; key->y=y; }