projects
/
openssl.git
/ commitdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
| commitdiff |
tree
raw
|
patch
|
inline
| side by side (parent:
1aa8a62
)
rc4-x86_64.pl: "Westmere" optimization.
author
Andy Polyakov
<appro@openssl.org>
Thu, 13 May 2010 21:01:24 +0000
(21:01 +0000)
committer
Andy Polyakov
<appro@openssl.org>
Thu, 13 May 2010 21:01:24 +0000
(21:01 +0000)
crypto/rc4/asm/rc4-x86_64.pl
patch
|
blob
|
history
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 677be5fe25badfe44a315855a827c65d71543e72..23fe4d99963b73a8c4f0016b680471b82f30999a 100755
(executable)
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -58,6 +58,10 @@
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
# fit for Core2 and therefore the code was modified to skip cloop8 on
# this CPU.
+# Intel Westmere was observed to perform suboptimally. Adding yet
+# another movzb to cloop1 improved performance by almost 50%! Core2
+# performance is improved too, but nominally...
+
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
@@ -229,6 +233,7 @@
$code.=<<___;
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
.align 16
.Lcloop1:
add $TX[0]#b,$YY#b
+ movzb $YY#b,$YY#d
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])
movzb ($dat,$YY),$TY#d
movb $TX[0]#b,($dat,$YY)
movb $TY#b,($dat,$XX[0])