x86_64 assembly pack: make Windows build more robust.

[openssl.git] / crypto / rc4 / asm / rc4-x86_64.pl
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl

index b08cc256566c05d39f35b0e84f8d7b3d53ea3298..fa227631870354f78eee9f23b1f54a008d81c110 100755 (executable)
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -78,23 +78,31 @@
  
  # May 2011
  #
-# The only code path that was not modified is P4-specific one. New
-# AMD code path is inspired by and Intel optimization is heavily
-# based on submission from Maxim Locktyukhin of Intel. Current
-# performance in cycles per processed byte (less is better) and
-# improvement coefficients relative to previous version of this
-# module are:
+# The only code path that was not modified is P4-specific one. Non-P4
+# Intel code path optimization is heavily based on submission by Maxim
+# Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
+# some of the ideas even in attempt to optimize the original RC4_INT
+# code path... Current performance in cycles per processed byte (less
+# is better) and improvement coefficients relative to previous
+# version of this module are:
  #
-# Opteron      5.3/+0%
+# Opteron      5.3/+0%(*)
  # P4           6.5
-# Core2                6.2/+15%(*)
+# Core2                6.2/+15%(**)
  # Westmere     4.2/+60%
  # Sandy Bridge 4.2/+120%
  # Atom         9.3/+80%
+# VIA Nano     6.4/+4%
+# Ivy Bridge   4.1/+30%
+# Bulldozer    4.5/+30%(*)
  #
-# (*)  Note that this result is ~15% lower than result for 32-bit
-#      code, meaning that it's possible to improve it, but it's
-#      more than likely at the cost of the others...
+# (*)  But corresponding loop has fewer instructions, which should have
+#      positive effect on upcoming Bulldozer, which has one less ALU.
+#      For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
+#      for 32-bit code, meaning that it's possible to improve it,
+#      but more than likely at the cost of the others (see rc4-586.pl
+#      to get the idea)...
  
  $flavour = shift;
  $output  = shift;
@@ -107,7 +115,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";
  
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
  
  $dat="%rdi";       # arg1
  $len="%rsi";       # arg2
@@ -117,6 +126,7 @@ $out="%rcx";            # arg4
  {
  $code=<<___;
  .text
+.extern        OPENSSL_ia32cap_P
  
  .globl RC4
  .type  RC4,\@function,4
@@ -159,8 +169,8 @@ $code.=<<___;
         movl    ($dat,$XX[0],4),$TX[0]#d
         test    \$-16,$len
         jz      .Lloop1
-       bt      \$30,%r8d       # Intel CPU Family 6
-       jc      .L16x
+       bt      \$30,%r8d       # Intel CPU?
+       jc      .Lintel
         and     \$7,$TX[1]
         lea     1($XX[0]),$XX[1]
         jz      .Loop8
@@ -217,7 +227,7 @@ $code.=<<___;
         jmp     .Lexit
  
  .align 16
-.L16x:
+.Lintel:
         test    \$-32,$len
         jz      .Lloop1
         and     \$15,$TX[1]
@@ -423,7 +433,6 @@ $idx="%r8";
  $ido="%r9";
  
  $code.=<<___;
-.extern        OPENSSL_ia32cap_P
  .globl RC4_set_key
  .type  RC4_set_key,\@function,3
  .align 16
@@ -438,10 +447,8 @@ RC4_set_key:
         xor     %r11,%r11
  
         mov     OPENSSL_ia32cap_P(%rip),$idx#d
-       bt      \$20,$idx#d     # Intel CPU
-       jnc     .Lw1stloop
-       bt      \$30,$idx#d     # Intel CPU Family 6
-       jnc     .Lc1stloop
+       bt      \$20,$idx#d     # RC4_CHAR?
+       jc      .Lc1stloop
         jmp     .Lw1stloop
  
  .align 16
@@ -505,11 +512,13 @@ RC4_options:
         lea     .Lopts(%rip),%rax
         mov     OPENSSL_ia32cap_P(%rip),%edx
         bt      \$20,%edx
-       jnc     .Ldone
-       add     \$12,%rax
+       jc      .L8xchar
         bt      \$30,%edx
         jnc     .Ldone
-       add     \$13,%rax
+       add     \$25,%rax
+       ret
+.L8xchar:
+       add     \$12,%rax
  .Ldone:
         ret
  .align 64