From 4b60f4b175158b426c1f1a2f30fac616ded28d21 Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Sun, 11 Nov 2007 16:25:46 +0000 Subject: [PATCH] rc4-x86_64.pl update [from HEAD]. --- Configure | 10 +-- TABLE | 68 +++++++++--------- crypto/rc4/Makefile | 4 +- crypto/rc4/asm/rc4-x86_64.pl | 130 ++++++++++++++++++++++++++++++++++- 4 files changed, 168 insertions(+), 44 deletions(-) diff --git a/Configure b/Configure index d260005487..089e4eb76f 100755 --- a/Configure +++ b/Configure @@ -114,12 +114,12 @@ my $tlib="-lnsl -lsocket"; my $bits1="THIRTY_TWO_BIT "; my $bits2="SIXTY_FOUR_BIT "; -my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o:rm86-elf.o:r586-elf.o"; -my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o:rm86-cof.o:r586-cof.o"; -my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o:rm86-out.o:r586-out.o"; +my $x86_elf_asm="x86cpuid-elf.o:bn86-elf.o co86-elf.o:dx86-elf.o yx86-elf.o:ax86-elf.o:bx86-elf.o:mx86-elf.o:sx86-elf.o s512sse2-elf.o:cx86-elf.o:rx86-elf.o rc4_skey.o:rm86-elf.o:r586-elf.o"; +my $x86_coff_asm="x86cpuid-cof.o:bn86-cof.o co86-cof.o:dx86-cof.o yx86-cof.o:ax86-cof.o:bx86-cof.o:mx86-cof.o:sx86-cof.o s512sse2-cof.o:cx86-cof.o:rx86-cof.o rc4_skey.o:rm86-cof.o:r586-cof.o"; +my $x86_out_asm="x86cpuid-out.o:bn86-out.o co86-out.o:dx86-out.o yx86-out.o:ax86-out.o:bx86-out.o:mx86-out.o:sx86-out.o s512sse2-out.o:cx86-out.o:rx86-out.o rc4_skey.o:rm86-out.o:r586-out.o"; my $x86_64_asm="x86_64cpuid.o:x86_64-gcc.o::aes-x86_64.o::md5-x86_64.o:sha1-x86_64.o sha256-x86_64.o sha512-x86_64.o::rc4-x86_64.o::"; -my $ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o::"; +my 
$ia64_asm=":bn-ia64.o::aes_core.o aes_cbc.o aes-ia64.o:::sha1-ia64.o sha256-ia64.o sha512-ia64.o::rc4-ia64.o rc4_skey.o::"; my $no_asm="::::::::::"; @@ -589,7 +589,7 @@ my $des_enc="des_enc.o fcrypt_b.o"; my $aes_enc="aes_core.o aes_cbc.o"; my $bf_enc ="bf_enc.o"; my $cast_enc="c_enc.o"; -my $rc4_enc="rc4_enc.o"; +my $rc4_enc="rc4_enc.o rc4_skey.o"; my $rc5_enc="rc5_enc.o"; my $md5_obj=""; my $sha1_obj=""; diff --git a/TABLE b/TABLE index 7fd0d4dcf1..a4e0277615 100644 --- a/TABLE +++ b/TABLE @@ -124,7 +124,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -207,7 +207,7 @@ $sha1_obj = sx86-out.o s512sse2-out.o $cast_obj = cx86-out.o -$rc4_obj = rx86-out.o +$rc4_obj = rx86-out.o rc4_skey.o $rmd160_obj = rm86-out.o $rc5_obj = r586-out.o $dso_scheme = dlfcn $shared_target= bsd-shared $shared_cflag = -fPIC @@ -234,7 +234,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= bsd-shared $shared_cflag = -fPIC @@ -288,7 +288,7 @@ $sha1_obj = sx86-cof.o s512sse2-cof.o $cast_obj = cx86-cof.o -$rc4_obj = rx86-cof.o +$rc4_obj = rx86-cof.o rc4_skey.o $rmd160_obj = rm86-cof.o $rc5_obj = r586-cof.o $dso_scheme = dlfcn $shared_target= cygwin-shared $shared_cflag = -D_WINDLL @@ -342,7 +342,7 @@ $sha1_obj = sx86-out.o s512sse2-out.o $cast_obj = cx86-out.o -$rc4_obj = rx86-out.o +$rc4_obj = rx86-out.o rc4_skey.o $rmd160_obj = rm86-out.o $rc5_obj = r586-out.o $dso_scheme = $shared_target= $shared_cflag = @@ -855,7 +855,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= bsd-gcc-shared $shared_cflag = -fPIC @@ -1098,7 +1098,7 @@ $sha1_obj = 
sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= bsd-shared $shared_cflag = -fPIC @@ -1287,7 +1287,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = $shared_target= $shared_cflag = @@ -1368,7 +1368,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -1395,7 +1395,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -1476,7 +1476,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -1503,7 +1503,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -1530,7 +1530,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= $shared_cflag = @@ -1557,7 +1557,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= $shared_cflag = @@ -1584,7 +1584,7 @@ $sha1_obj = 
sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = $shared_target= $shared_cflag = @@ -1719,7 +1719,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = @@ -1881,7 +1881,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = $shared_target= $shared_cflag = @@ -2014,7 +2014,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2041,7 +2041,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2230,7 +2230,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2257,7 +2257,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2340,7 +2340,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -2637,7 +2637,7 @@ $sha1_obj = sx86-out.o s512sse2-out.o $cast_obj = cx86-out.o -$rc4_obj = rx86-out.o +$rc4_obj = rx86-out.o rc4_skey.o $rmd160_obj = rm86-out.o $rc5_obj = r586-out.o $dso_scheme = $shared_target= 
$shared_cflag = @@ -2664,7 +2664,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -fPIC @@ -2745,7 +2745,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= linux-shared $shared_cflag = -KPIC @@ -2770,7 +2770,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2797,7 +2797,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -2824,7 +2824,7 @@ $bf_obj = $md5_obj = $sha1_obj = sha1-ia64.o sha256-ia64.o sha512-ia64.o $cast_obj = -$rc4_obj = rc4-ia64.o +$rc4_obj = rc4-ia64.o rc4_skey.o $rmd160_obj = $rc5_obj = $dso_scheme = dlfcn @@ -3015,7 +3015,7 @@ $sha1_obj = sx86-cof.o s512sse2-cof.o $cast_obj = cx86-cof.o -$rc4_obj = rx86-cof.o +$rc4_obj = rx86-cof.o rc4_skey.o $rmd160_obj = rm86-cof.o $rc5_obj = r586-cof.o $dso_scheme = win32 $shared_target= cygwin-shared $shared_cflag = -D_WINDLL -DOPENSSL_USE_APPLINK @@ -3420,7 +3420,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= svr3-shared $shared_cflag = -Kpic @@ -3447,7 +3447,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= svr3-shared $shared_cflag = -fPIC @@ -3663,7 +3663,7 @@ $sha1_obj = 
sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= solaris-shared $shared_cflag = -fPIC @@ -3987,7 +3987,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= svr5-shared $shared_cflag = -Kpic @@ -4014,7 +4014,7 @@ $sha1_obj = sx86-elf.o s512sse2-elf.o $cast_obj = cx86-elf.o -$rc4_obj = rx86-elf.o +$rc4_obj = rx86-elf.o rc4_skey.o $rmd160_obj = rm86-elf.o $rc5_obj = r586-elf.o $dso_scheme = dlfcn $shared_target= gnu-shared $shared_cflag = -fPIC diff --git a/crypto/rc4/Makefile b/crypto/rc4/Makefile index 7857c95fbf..187ed5c668 100644 --- a/crypto/rc4/Makefile +++ b/crypto/rc4/Makefile @@ -10,7 +10,7 @@ INCLUDES= CFLAG=-g AR= ar r -RC4_ENC=rc4_enc.o +RC4_ENC=rc4_enc.o rc4_skey.o CFLAGS= $(INCLUDES) $(CFLAG) ASFLAGS= $(INCLUDES) $(ASFLAG) @@ -22,7 +22,7 @@ APPS= LIB=$(TOP)/libcrypto.a LIBSRC=rc4_skey.c rc4_enc.c -LIBOBJ=rc4_skey.o $(RC4_ENC) +LIBOBJ=$(RC4_ENC) SRC= $(LIBSRC) diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl index 4b990cba07..2d47320485 100755 --- a/crypto/rc4/asm/rc4-x86_64.pl +++ b/crypto/rc4/asm/rc4-x86_64.pl @@ -2,8 +2,9 @@ # # ==================================================================== # Written by Andy Polyakov for the OpenSSL -# project. Rights for redistribution and usage in source and binary -# forms are granted according to the OpenSSL license. +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/.
# ==================================================================== # # 2.22x RC4 tune-up:-) It should be noted though that my hand [as in @@ -49,8 +50,22 @@ # is not implemented, then this final RC4_CHAR code-path should be # preferred, as it provides better *all-round* performance]. +# Intel Core2 was observed to perform poorly on both code paths:-( It +# apparently suffers from some kind of partial register stall, which +# occurs in 64-bit mode only [as virtually identical 32-bit loop was +# observed to outperform 64-bit one by almost 50%]. Adding two movzb to +# cloop1 boosts its performance by 80%! This loop appears to be optimal +# fit for Core2 and therefore the code was modified to skip cloop8 on +# this CPU. + $output=shift; -open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output"; + +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; +( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or +( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or +die "can't locate x86_64-xlate.pl"; + +open STDOUT,"| $^X $xlate $output" or die "can't call $xlate: $!"; $dat="%rdi"; # arg1 $len="%rsi"; # arg2 @@ -152,6 +167,8 @@ $code.=<<___; movzb ($dat,$XX[0]),$TX[0]#d test \$-8,$len jz .Lcloop1 + cmp \$0,260($dat) + jnz .Lcloop1 push %rbx jmp .Lcloop8 .align 16 @@ -221,6 +238,8 @@ $code.=<<___; movb $TY#b,($dat,$XX[0]) add $TX[0]#b,$TY#b add \$1,$XX[0]#b + movzb $TY#b,$TY#d + movzb $XX[0]#b,$XX[0]#d movzb ($dat,$TY),$TY#d movzb ($dat,$XX[0]),$TX[0]#d xorb ($inp),$TY#b @@ -233,6 +252,111 @@ $code.=<<___; .size RC4,.-RC4 ___ +$idx="%r8"; +$ido="%r9"; + +$code.=<<___; +.extern OPENSSL_ia32cap_P +.globl RC4_set_key +.type RC4_set_key,\@function,3 +.align 16 +RC4_set_key: + lea 8($dat),$dat + lea ($inp,$len),$inp + neg $len + mov $len,%rcx + xor %eax,%eax + xor $ido,$ido + xor %r10,%r10 + xor %r11,%r11 + + mov OPENSSL_ia32cap_P(%rip),$idx#d + bt \$20,$idx#d + jnc .Lw1stloop + bt \$30,$idx#d + setc $ido#b + mov $ido#d,260($dat) + jmp .Lc1stloop + +.align 16 +.Lw1stloop: + mov %eax,($dat,%rax,4) + add \$1,%al + 
jnc .Lw1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lw2ndloop: + mov ($dat,$ido,4),%r10d + add ($inp,$len,1),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx,4),%r11d + cmovz %rcx,$len + mov %r10d,($dat,$idx,4) + mov %r11d,($dat,$ido,4) + add \$1,$ido#b + jnc .Lw2ndloop + jmp .Lexit_key + +.align 16 +.Lc1stloop: + mov %al,($dat,%rax) + add \$1,%al + jnc .Lc1stloop + + xor $ido,$ido + xor $idx,$idx +.align 16 +.Lc2ndloop: + mov ($dat,$ido),%r10b + add ($inp,$len),$idx#b + add %r10b,$idx#b + add \$1,$len + mov ($dat,$idx),%r11b + jnz .Lcnowrap + mov %rcx,$len +.Lcnowrap: + mov %r10b,($dat,$idx) + mov %r11b,($dat,$ido) + add \$1,$ido#b + jnc .Lc2ndloop + movl \$-1,256($dat) + +.align 16 +.Lexit_key: + xor %eax,%eax + mov %eax,-8($dat) + mov %eax,-4($dat) + ret +.size RC4_set_key,.-RC4_set_key + +.globl RC4_options +.type RC4_options,\@function,0 +.align 16 +RC4_options: + .picmeup %rax + lea .Lopts-.(%rax),%rax + mov OPENSSL_ia32cap_P(%rip),%edx + bt \$20,%edx + jnc .Ldone + add \$12,%rax + bt \$30,%edx + jnc .Ldone + add \$13,%rax +.Ldone: + ret +.align 64 +.Lopts: +.asciz "rc4(8x,int)" +.asciz "rc4(8x,char)" +.asciz "rc4(1x,char)" +.asciz "RC4 for x86_64, CRYPTOGAMS by " +.align 64 +.size RC4_options,.-RC4_options +___ + $code =~ s/#([bwd])/$1/gm; print $code; -- 2.34.1