+ &jmp(&label("finished"));
+
+ &align(16);
+ # this is essentially Intel P4 specific codepath, see rc4_skey.c,
+ # and is engaged in 0.9.8 and later context...
+ &set_label("RC4_CHAR");
+
+ &lea ($ty,&DWP(0,$in,$ty));
+ &mov (&swtmp(2),$ty);
+
+ # strangely enough unrolled loop performs over 20% slower...
+ &set_label("RC4_CHAR_loop");
+ &movz ($tx,&BP(0,$d,$x));
+ &add (&LB($y),&LB($tx));
+ &movz ($ty,&BP(0,$d,$y));
+ &movb (&BP(0,$d,$y),&LB($tx));
+ &movb (&BP(0,$d,$x),&LB($ty));
+ &add (&LB($ty),&LB($tx));
+ &movz ($ty,&BP(0,$d,$ty));
+ &xorb (&LB($ty),&BP(0,$in));
+ &movb (&BP(0,$out),&LB($ty));
+ &inc (&LB($x));
+ &inc ($in);
+ &inc ($out);
+ &cmp ($in,&swtmp(2));
+ &jb (&label("RC4_CHAR_loop"));
+