3 # At some point it became apparent that the original SSLeay RC4
4 # assembler implementation performs suboptimally on latest IA-32
5 # microarchitectures. After re-tuning performance has changed as
13 # (*) This number is actually a trade-off:-) It's possible to
14 # achieve +72%, but at the cost of -48% off PIII performance.
15 # In other words code performing further 13% faster on AMD
16 # would perform almost 2 times slower on Intel PIII...
17 # For reference! This code delivers ~80% of rc4-amd64.pl
18 # performance on same Opteron machine.
19 # (**) This number requires compressed key schedule set up by
20 # RC4_set_key, see commentary section in rc4_skey.c for
22 # <appro@fy.chalmers.se>
# Add the relative directories "perlasm" and "../../perlasm" to Perl's
# module search path.
24 push(@INC,"perlasm","../../perlasm");
# Initialise the generator with the first command-line argument and this
# script's own file name.
27 &asm_init($ARGV[0],"rc4-586.pl");
# Body fragment of the per-round code emitter (the enclosing sub header and
# several intermediate lines are not visible in this excerpt).
# Arguments, as used by the visible lines:
#   $n    - byte displacement used for the stack temp and output stores below
#   $p    - position selector; the "NEXT ROUND" preload is only emitted
#           when $p < 1
#   $char - not referenced by any line visible here
# NOTE(review): the pseudo-code in the comments below assumes &DWP/&BP build
# dword/byte memory operands as (disp, base, index, scale) and that &LB/&HB
# select the low/high byte of a register - confirm against the perlasm helpers.
43 local($n,$p,$char)=@_;
# Branch to "finished" on unsigned below-or-equal; the comparison that sets
# the flags is on a line not shown here.
53 &jbe(&label("finished"));
# Branch to "finished" on unsigned below; comparison likewise not shown.
61 &jb(&label("finished"));
66 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
# y.lo += tx.lo (byte-wide add)
68 &add( &LB($y), &LB($tx));
69 &inc( &LB($x)); # NEXT ROUND
# ty = d[y]
70 &mov( $ty, &DWP(0,$d,$y,4));
# Store ty one dword below d[x]; x has already been incremented above,
# so this targets the slot x indexed before the increment.
72 &mov( &DWP(-4,$d,$x,4),$ty); # AGI
# d[y] = tx
74 &mov( &DWP(0,$d,$y,4),$tx);
76 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
# ty = d[ty]
77 &mov( $ty, &DWP(0,$d,$ty,4));
81 #moved up into last round
# Store the low byte of ty at byte offset $n from esp.
86 &movb( &BP($n,"esp","",0), &LB($ty));
90 # Note in+=8 has occurred
# Load the byte at in[-1] into the high byte of ty ...
91 &movb( &HB($ty), &BP(-1,$in,"",0));
# ... xor it into the low byte of ty ...
93 &xorb(&LB($ty), &HB($ty));
# ... and store the result byte at out[$n].
95 &movb(&BP($n,$out,"",0),&LB($ty));
# Body fragment of the emitter for the function named by $name (the enclosing
# sub header and a number of intermediate lines are not visible in this
# excerpt). Word parameters read below: 0 = key, 1 = len, 2 -> $in, 3 -> $out.
# NOTE(review): the pseudo-code in the comments below assumes &DWP/&BP build
# dword/byte memory operands as (disp, base, index, scale), &LB selects the
# low byte of a register and &swtmp(i) names the i-th stack temp - confirm
# against the perlasm helpers.
104 &function_begin_B($name,"");
106 &mov($ty,&wparam(1)); # len
# Branch to "proceed" when the preceding test (on a line not shown) leaves
# the not-equal condition set.
108 &jne(&label("proceed"));
110 &set_label("proceed");
117 &xor( $x, $x); # avoid partial register stalls
119 &xor( $y, $y); # avoid partial register stalls
120 &mov( $d, &wparam(0)); # key
121 &mov( $in, &wparam(2));
# Load the low bytes of x and y from offsets 0 and 4 of the key structure.
123 &movb( &LB($x), &BP(0,$d,"",1));
124 &movb( &LB($y), &BP(4,$d,"",1));
126 &mov( $out, &wparam(3));
129 &stack_push(3); # 3 temp variables
132 # detect compressed schedule, see commentary section in rc4_skey.c...
# Compare the dword at d+256 with -1 and take the RC4_CHAR path on equality.
133 &cmp(&DWP(256,$d),-1);
134 &je(&label("RC4_CHAR"));
# ty = in + ty - 8
136 &lea( $ty, &DWP(-8,$ty,$in));
138 # check for 0 length input
140 &mov( &swtmp(2), $ty); # this is now address to exit at
# Preload tx = d[x].
141 &mov( $tx, &DWP(0,$d,$x,4));
144 &jb( &label("end")); # less than 8 bytes
160 &comment("apply the cipher text");
161 # xor the cipher data with input
163 #&add( $out, 8); #moved up into last round
# Two dwords are handled here: a value is taken from stack temp 0 / 1, the
# input dword at in-8 / in-4 is loaded into ty, and tx is stored to
# out-8 / out-4. The instructions combining tx with ty are on lines not shown.
165 &mov( $tx, &swtmp(0));
166 &mov( $ty, &DWP(-8,$in,"",0));
168 &mov( $ty, &DWP(-4,$in,"",0));
169 &mov( &DWP(-8,$out,"",0), $tx);
170 &mov( $tx, &swtmp(1));
172 &mov( $ty, &swtmp(2)); # load end ptr;
173 &mov( &DWP(-4,$out,"",0), $tx);
# Preload tx = d[x].
174 &mov( $tx, &DWP(0,$d,$x,4));
# Loop back to "start" on unsigned below-or-equal; the comparison that sets
# the flags is on a line not shown here.
176 &jbe(&label("start"));
180 # There is quite a bit of extra crap in RC4_loop() for this
190 &jmp(&label("finished"));
193 # this is essentially Intel P4 specific codepath, see rc4_skey.c...
194 &set_label("RC4_CHAR");
# ty = in + ty, saved in stack temp 2; the loop below runs while in is
# below this value.
196 &lea ($ty,&DWP(0,$in,$ty));
197 &mov (&swtmp(2),$ty);
199 # strangely enough unrolled loop performs over 20% slower...
# Byte-at-a-time loop over a byte-indexed table at d. No update of x, in or
# out appears among the visible lines; presumably they are on lines not shown.
200 &set_label("RC4_CHAR_loop");
201 &movz ($tx,&BP(0,$d,$x)); # tx = d[x]
202 &add (&LB($y),&LB($tx)); # y.lo += tx.lo
203 &movz ($ty,&BP(0,$d,$y)); # ty = d[y]
204 &movb (&BP(0,$d,$y),&LB($tx)); # d[y] = tx
205 &movb (&BP(0,$d,$x),&LB($ty)); # d[x] = ty
206 &add (&LB($ty),&LB($tx)); # ty.lo += tx.lo
207 &movz ($ty,&BP(0,$d,$ty)); # ty = d[ty]
208 &xorb (&LB($ty),&BP(0,$in)); # ty.lo ^= in[0]
209 &movb (&BP(0,$out),&LB($ty)); # out[0] = ty.lo
# Repeat while in is (unsigned) below the saved end value.
213 &cmp ($in,&swtmp(2));
214 &jb (&label("RC4_CHAR_loop"));
# Common exit: write the low bytes of y and x back at d-4 and d-8.
# NOTE(review): the loads at the top used offsets +4 and +0 from d, so these
# negative displacements imply d was advanced by 8 on a line not shown -
# confirm.
216 &set_label("finished");
219 &movb( &BP(-4,$d,"",0),&LB($y));
220 &movb( &BP(-8,$d,"",0),&LB($x));
222 &function_end($name);