rc4-x86_64.pl: "Westmere" optimization.

[openssl.git] / crypto / rc4 / asm / rc4-x86_64.pl
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index a134127e8c654adeb676eef37b9ea07ba5b6fe77..23fe4d99963b73a8c4f0016b680471b82f30999a 100755 (executable)
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
  # fit for Core2 and therefore the code was modified to skip cloop8 on
  # this CPU.
  
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
  $flavour = shift;
  $output  = shift;
  if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -82,7 +86,6 @@ $YY="%r12";
  $TY="%r13";
  
  $code=<<___;
-.section .note.GNU-stack,"",\@progbits
  .text
  
  .globl RC4
@@ -230,6 +233,7 @@ $code.=<<___;
  .align 16
  .Lcloop1:
         add     $TX[0]#b,$YY#b
+       movzb   $YY#b,$YY#d
         movzb   ($dat,$YY),$TY#d
         movb    $TX[0]#b,($dat,$YY)
         movb    $TY#b,($dat,$XX[0])