From: Andy Polyakov
Date: Thu, 13 May 2010 21:01:24 +0000 (+0000)
Subject: rc4-x86_64.pl: "Westmere" optimization.
X-Git-Tag: OpenSSL-fips-2_0-rc1~1100
X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=629fd3aa913f547f6228740d5068193f283abe94;ds=sidebyside

rc4-x86_64.pl: "Westmere" optimization.
---

diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe25..23fe4d9996 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
 # fit for Core2 and therefore the code was modified to skip cloop8 on
 # this CPU.
 
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
 $flavour = shift;
 $output = shift;
 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@ $code.=<<___;
 .align 16
 .Lcloop1:
  add $TX[0]#b,$YY#b
+ movzb $YY#b,$YY#d
  movzb ($dat,$YY),$TY#d
  movb $TX[0]#b,($dat,$YY)
  movb $TY#b,($dat,$XX[0])