+20% performance improvement of P4-specific RC4_CHAR loop.
author Andy Polyakov <appro@openssl.org>
Sun, 15 May 2005 22:43:00 +0000 (22:43 +0000)
committer Andy Polyakov <appro@openssl.org>
Sun, 15 May 2005 22:43:00 +0000 (22:43 +0000)
crypto/rc4/asm/rc4-586.pl

index d6e98f0..22bda4b 100644 (file)
@@ -200,22 +200,23 @@ sub RC4
 
        &lea    ($ty,&DWP(0,$in,$ty));
        &mov    (&swtmp(2),$ty);
+       &movz   ($tx,&BP(0,$d,$x));
 
        # strangely enough unrolled loop performs over 20% slower...
        &set_label("RC4_CHAR_loop");
-               &movz   ($tx,&BP(0,$d,$x));
                &add    (&LB($y),&LB($tx));
                &movz   ($ty,&BP(0,$d,$y));
                &movb   (&BP(0,$d,$y),&LB($tx));
                &movb   (&BP(0,$d,$x),&LB($ty));
                &add    (&LB($ty),&LB($tx));
                &movz   ($ty,&BP(0,$d,$ty));
+               &add    (&LB($x),1);
                &xorb   (&LB($ty),&BP(0,$in));
-               &movb   (&BP(0,$out),&LB($ty));
-               &inc    (&LB($x));
-               &inc    ($in);
-               &inc    ($out);
+               &lea    ($in,&BP(1,$in));
+               &movz   ($tx,&BP(0,$d,$x));
                &cmp    ($in,&swtmp(2));
+               &movb   (&BP(0,$out),&LB($ty));
+               &lea    ($out,&BP(1,$out));
        &jb     (&label("RC4_CHAR_loop"));
 
        &set_label("finished");