Two extra instructions in RC4 character loop give 80% performance
author Andy Polyakov <appro@openssl.org>
Tue, 20 Mar 2007 09:13:07 +0000 (09:13 +0000)
committer Andy Polyakov <appro@openssl.org>
Tue, 20 Mar 2007 09:13:07 +0000 (09:13 +0000)
improvement on Core2. I still need to detect Core2 and choose this
path...

crypto/rc4/asm/rc4-x86_64.pl

index 4b990cb..5236afe 100755 (executable)
@@ -221,6 +221,8 @@ $code.=<<___;
        movb    $TY#b,($dat,$XX[0])
        add     $TX[0]#b,$TY#b
        add     \$1,$XX[0]#b
+       movzb   $TY#b,$TY#d
+       movzb   $XX[0]#b,$XX[0]#d
        movzb   ($dat,$TY),$TY#d
        movzb   ($dat,$XX[0]),$TX[0]#d
        xorb    ($inp),$TY#b