rc4-586.pl: optimize even further...

author Andy Polyakov <appro@openssl.org>
Fri, 27 May 2011 09:46:19 +0000 (09:46 +0000)
committer Andy Polyakov <appro@openssl.org>
Fri, 27 May 2011 09:46:19 +0000 (09:46 +0000)

diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 0c4cac4e8986e24970ca04fe381526a1b2df1444..4b8bc78b33b294960cc1bdf56354a0ad55ce0a54 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -37,10 +37,10 @@
  # Pentium III  7.8(*)
  # Intel P4     7.5
  #
-# Opteron      6.4/+14%                # new MMX numbers
-# Core2        5.8/+50%(**)
-# Westmere     5.5/+80%(**)
-# Sandy Bridge 5.4/0%
+# Opteron      6.1/+20%                # new MMX numbers
+# Core2        5.3/+67%(**)
+# Westmere     5.1/+94%(**)
+# Sandy Bridge 5.0/+8%
  #
  # (*)  PIII can actually deliver 6.6 cycles per byte with MMX code,
  #      but this specific code performs poorly on Core2. And vice
@@ -126,8 +126,7 @@ if ($alt=0) {
    $RC4_loop_mmx = sub {
      my $i=shift;
  
    $RC4_loop_mmx = sub {
      my $i=shift;
  
-       &add    ($yy,$tx);
-       &movz   ($yy,&LB($yy));                         # (*)
+       &add    (&LB($yy),&LB($tx));
         &psllq  ("mm1",8*(($i-1)&7))                    if (abs($i)!=1);
         &mov    ($ty,&DWP(0,$dat,$yy,4));
         &mov    (&DWP(0,$dat,$yy,4),$tx);
         &psllq  ("mm1",8*(($i-1)&7))                    if (abs($i)!=1);
         &mov    ($ty,&DWP(0,$dat,$yy,4));
         &mov    (&DWP(0,$dat,$yy,4),$tx);
@@ -204,6 +203,9 @@ if ($alt=0) {
                 &$RC4_loop_mmx(0);
         &set_label("loop_mmx_enter");
                 for     ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
                 &$RC4_loop_mmx(0);
         &set_label("loop_mmx_enter");
                 for     ($i=1;$i<8;$i++) { &$RC4_loop_mmx($i); }
+               &mov    ($ty,$yy);
+               &xor    ($yy,$yy);              # this is second key to Core2
+               &mov    (&LB($yy),&LB($ty));    # and Westmere performance...
                 &cmp    ($inp,&DWP(-4,$dat));
                 &lea    ($inp,&DWP(8,$inp));
         &jb     (&label("loop_mmx"));
                 &cmp    ($inp,&DWP(-4,$dat));
                 &lea    ($inp,&DWP(8,$inp));
         &jb     (&label("loop_mmx"));
author	Andy Polyakov <appro@openssl.org>
	Fri, 27 May 2011 09:46:19 +0000 (09:46 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Fri, 27 May 2011 09:46:19 +0000 (09:46 +0000)