# May 2011
#
-# The only code path that was not modified is P4-specific one. New
-# AMD code path is inspired by and Intel optimization is heavily
-# based on submission from Maxim Locktyukhin of Intel. Current
-# performance in cycles per processed byte (less is better) and
-# improvement coefficients relative to previous version of this
-# module are:
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in an attempt to optimize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
#
-# Opteron 5.3/+0%
+# Opteron 5.3/+0%(*)
# P4 6.5
-# Core2 6.2/+15%(*)
+# Core2 6.2/+15%(**)
# Westmere 4.2/+60%
# Sandy Bridge 4.2/+120%
# Atom 9.3/+80%
+# VIA Nano 6.4/+4%
+# Ivy Bridge 4.1/+30%
+# Bulldozer 4.5/+30%(*)
#
-# (*) Note that this result is ~15% lower than result for 32-bit
-# code, meaning that it's possible to improve it, but it's
-# more than likely at the cost of the others...
+# (*) But corresponding loop has fewer instructions, which should have
+# positive effect on upcoming Bulldozer, which has one less ALU.
+# For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
+# for 32-bit code, meaning that it's possible to improve it,
+# but more than likely at the cost of the others (see rc4-586.pl
+# to get the idea)...
$flavour = shift;
$output = shift;
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
{
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl RC4
.type RC4,\@function,4
movl ($dat,$XX[0],4),$TX[0]#d
test \$-16,$len
jz .Lloop1
- bt \$30,%r8d # Intel CPU Family 6
- jc .L16x
+ bt \$30,%r8d # Intel CPU?
+ jc .Lintel
and \$7,$TX[1]
lea 1($XX[0]),$XX[1]
jz .Loop8
jmp .Lexit
.align 16
-.L16x:
+.Lintel:
test \$-32,$len
jz .Lloop1
and \$15,$TX[1]
$ido="%r9";
$code.=<<___;
-.extern OPENSSL_ia32cap_P
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16
xor %r11,%r11
mov OPENSSL_ia32cap_P(%rip),$idx#d
- bt \$20,$idx#d # Intel CPU
- jnc .Lw1stloop
- bt \$30,$idx#d # Intel CPU Family 6
- jnc .Lc1stloop
+ bt \$20,$idx#d # RC4_CHAR?
+ jc .Lc1stloop
jmp .Lw1stloop
.align 16
lea .Lopts(%rip),%rax
mov OPENSSL_ia32cap_P(%rip),%edx
bt \$20,%edx
- jnc .Ldone
- add \$12,%rax
+ jc .L8xchar
bt \$30,%edx
jnc .Ldone
- add \$13,%rax
+ add \$25,%rax
+ ret
+.L8xchar:
+ add \$12,%rax
.Ldone:
ret
.align 64