3 # At some point it became apparent that the original SSLeay RC4
4 # assembler implementation performs suboptimally on latest IA-32
5 # microarchitectures. After re-tuning performance has changed as
13 # (*) This number is actually a trade-off:-) It's possible to
14 # achieve +72%, but at the cost of -48% off PIII performance.
15 # In other words code performing further 13% faster on AMD
16 # would perform almost 2 times slower on Intel PIII...
17 # For reference! This code delivers ~80% of rc4-amd64.pl
18 # performance on the same Opteron machine.
19 # (**) This number requires compressed key schedule set up by
20 # RC4_set_key and therefore doesn't apply to 0.9.7 [option for
21 # compressed key schedule is implemented in 0.9.8 and later,
22 # see commentary section in rc4_skey.c for further details].
24 # <appro@fy.chalmers.se>
# Add the two candidate perlasm directories to Perl's module search path.
26 push(@INC,"perlasm","../../perlasm");
# Start the perlasm session, passing the first command-line argument and
# this script's name to asm_init (presumably the argument selects the
# output flavour -- confirm against the perlasm module).
29 &asm_init($ARGV[0],"rc4-586.pl");
# Body of the per-byte round generator. It receives three arguments:
# $n (used below as a byte displacement into the stack and output buffers),
# $p (selects which optional instructions are emitted) and $char.
# NOTE(review): the enclosing sub header and several instructions are not
# visible in this excerpt; the comments describe only the lines shown.
41 local($n,$p,$char)=@_;
# Two early exits to the "finished" label; the comparisons that set the
# flags for these branches are not visible here.
51 &jbe(&label("finished"));
59 &jb(&label("finished"));
64 # &mov( $tx, &DWP(0,$d,$x,4)) if $p < 0;
# y = (y + tx) with only the low byte updated, so it wraps at 256.
66 &add( &LB($y), &LB($tx));
# ty = d[y]  (dword table indexed by y, scale 4)
67 &mov( $ty, &DWP(0,$d,$y,4));
# Swap the two table entries: d[x] = ty, d[y] = tx.
69 &mov( &DWP(0,$d,$x,4),$ty);
71 &mov( &DWP(0,$d,$y,4),$tx);
73 &inc( &LB($x)); # NEXT ROUND
74 &mov( $tx, &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
# ty = d[ty]: table lookup indexed by the current value of ty.
75 &mov( $ty, &DWP(0,$d,$ty,4));
79 #moved up into last round
# Store the low byte of ty at byte offset $n from esp.
84 &movb( &BP($n,"esp","",0), &LB($ty));
88 # Note in+=8 has occurred
# Load the input byte at in[-1] into the high byte of ty, XOR it into the
# low byte, and write the result to out[$n].
89 &movb( &HB($ty), &BP(-1,$in,"",0));
91 &xorb(&LB($ty), &HB($ty));
93 &movb(&BP($n,$out,"",0),&LB($ty));
# Emit the RC4 routine. As read below, the word parameters are:
# wparam(0) = key, wparam(1) = len, wparam(2) = input pointer,
# wparam(3) = output pointer.
# NOTE(review): this excerpt omits a number of original lines (flag-setting
# instructions, some labels, register save/restore); the comments describe
# only what is visible.
98 &function_begin_B("RC4");
103 &mov($ty,&wparam(1)); # len
# The test that sets the flags for this jne is not visible in this excerpt.
105 &jne(&label("proceed"));
107 &set_label("proceed");
114 &xor( $x, $x); # avoid partial register stalls
116 &xor( $y, $y); # avoid partial register stalls
117 &mov( $d, &wparam(0)); # key
118 &mov( $in, &wparam(2));
# Load the low bytes of x and y from byte offsets 0 and 4 of the key.
120 &movb( &LB($x), &BP(0,$d,"",1));
121 &movb( &LB($y), &BP(4,$d,"",1));
123 &mov( $out, &wparam(3));
126 &stack_push(3); # 3 temp variables
129 # detect compressed schedule, see commentary section in rc4_skey.c...
130 # in 0.9.7 context ~50 bytes below RC4_CHAR label remain redundant,
131 # as compressed key schedule is set up in 0.9.8 and later.
# A dword equal to -1 at offset 256 from $d selects the byte-wide
# (RC4_CHAR) code path.
132 &cmp(&DWP(256,$d),-1);
133 &je(&label("RC4_CHAR"));
# ty = in + len - 8; saved below in temp slot 2 as the exit address.
135 &lea( $ty, &DWP(-8,$ty,$in));
137 # check for 0 length input
139 &mov( &swtmp(2), $ty); # this is now address to exit at
# Preload tx = d[x] (dword table).
140 &mov( $tx, &DWP(0,$d,$x,4));
143 &jb( &label("end")); # less than 8 bytes
159 &comment("apply the cipher text");
160 # xor the cipher data with input
162 #&add( $out, 8); #moved up into last round
# Temp slots 0 and 1 are loaded into tx and, after lines not shown here
# (presumably the XOR with the input words loaded into ty), stored to
# out[-8] and out[-4].
164 &mov( $tx, &swtmp(0));
165 &mov( $ty, &DWP(-8,$in,"",0));
167 &mov( $ty, &DWP(-4,$in,"",0));
168 &mov( &DWP(-8,$out,"",0), $tx);
169 &mov( $tx, &swtmp(1));
171 &mov( $ty, &swtmp(2)); # load end ptr;
172 &mov( &DWP(-4,$out,"",0), $tx);
# Reload tx = d[x] for the next pass.
173 &mov( $tx, &DWP(0,$d,$x,4));
# Branch back to "start"; the compare feeding this jbe is not visible here.
175 &jbe(&label("start"));
179 # There is quite a bit of extra crap in RC4_loop() for this
189 &jmp(&label("finished"));
192 # this is essentially Intel P4 specific codepath, see rc4_skey.c,
193 # and is engaged in 0.9.8 and later context...
194 &set_label("RC4_CHAR");
# ty = in + len, the end-of-input pointer, kept in temp slot 2.
196 &lea ($ty,&DWP(0,$in,$ty));
197 &mov (&swtmp(2),$ty);
# tx = d[x], zero-extended from a byte (byte-wide table on this path).
198 &movz ($tx,&BP(0,$d,$x));
200 # strangely enough unrolled loop performs over 20% slower...
201 &set_label("RC4_CHAR_loop");
# y = (y + tx) mod 256; ty = d[y]
202 &add (&LB($y),&LB($tx));
203 &movz ($ty,&BP(0,$d,$y));
# Swap: d[y] = tx, d[x] = ty
204 &movb (&BP(0,$d,$y),&LB($tx));
205 &movb (&BP(0,$d,$x),&LB($ty));
# ty = d[(ty + tx) mod 256], the keystream byte
206 &add (&LB($ty),&LB($tx));
207 &movz ($ty,&BP(0,$d,$ty));
# XOR with the current input byte and advance the input pointer.
209 &xorb (&LB($ty),&BP(0,$in));
210 &lea ($in,&BP(1,$in));
# Preload tx = d[x] for the next iteration (the increment of x is not
# visible in this excerpt).
211 &movz ($tx,&BP(0,$d,$x));
# Compare against the saved end pointer; the two instructions between the
# cmp and the jb store the output byte and advance $out.
212 &cmp ($in,&swtmp(2));
213 &movb (&BP(0,$out),&LB($ty));
214 &lea ($out,&BP(1,$out));
215 &jb (&label("RC4_CHAR_loop"));
217 &set_label("finished");
# Write y and x back at offsets -4 and -8 from $d. They were loaded from
# offsets 4 and 0, so $d is presumably advanced by 8 in a line not shown
# here -- confirm against the full file.
220 &movb( &BP(-4,$d,"",0),&LB($y));
221 &movb( &BP(-8,$d,"",0),&LB($x));
223 &function_end("RC4");
225 ########################################################################
# Declare OPENSSL_ia32cap_P as an external symbol; it is consulted below.
233 &external_label("OPENSSL_ia32cap_P");
235 # void RC4_set_key(RC4_KEY *key,int len,const unsigned char *data);
# Emit the key-schedule routine. Two variants are generated: a dword-wide
# table ("w" loops) and a byte-wide table ("c" loops). Bit 20 of the dword
# at OPENSSL_ia32cap_P selects the byte-wide variant.
# NOTE(review): several original lines (index increments, register
# zeroing, the "exit" label) are not visible in this excerpt.
236 &function_begin("RC4_set_key");
237 &mov ($out,&wparam(0)); # load key
238 &mov ($idi,&wparam(1)); # load len
239 &mov ($inp,&wparam(2)); # load data
# Presumably loads the address of OPENSSL_ia32cap_P into $idx
# (position-independent where needed) -- confirm against perlasm.
240 &picmeup($idx,"OPENSSL_ia32cap_P");
242 &lea ($out,&DWP(2*4,$out)); # &key->data
243 &lea ($inp,&DWP(0,$inp,$idi)); # $inp to point at the end
# Save $idi in the slot at -4($out); it is reloaded from there in the
# second-stage loops whenever the key index has to restart.
246 &mov (&DWP(-4,$out),$idi); # borrow key->y
248 &bt (&DWP(0,$idx),20); # check for bit#20
249 &jc (&label("c1stloop"));
# Dword variant, stage 1: fill the table with the identity permutation.
# The loop ends when the byte-sized add on al carries out.
251 &set_label("w1stloop",16);
252 &mov (&DWP(0,$out,"eax",4),"eax"); # key->data[i]=i;
253 &add (&LB("eax"),1); # i++;
254 &jnc (&label("w1stloop"));
# Dword variant, stage 2: mix the key bytes into the table.
259 &set_label("w2ndloop",16);
# eax = data[ido]
260 &mov ("eax",&DWP(0,$out,$ido,4));
# idx = (idx + key byte at inp[idi] + data[ido]) mod 256
261 &add (&LB($idx),&BP(0,$inp,$idi));
262 &add (&LB($idx),&LB("eax"));
# ebx = data[idx]
264 &mov ("ebx",&DWP(0,$out,$idx,4));
# The instruction that sets the flags for this jnz is not visible here
# (presumably the increment of $idi). On zero, restart the key index from
# the value saved at -4($out).
265 &jnz (&label("wnowrap"));
266 &mov ($idi,&DWP(-4,$out));
267 &set_label("wnowrap");
# Swap data[idx] and data[ido].
268 &mov (&DWP(0,$out,$idx,4),"eax");
269 &mov (&DWP(0,$out,$ido,4),"ebx");
271 &jnc (&label("w2ndloop"));
272 &jmp (&label("exit"));
# Byte variant, stage 1: same identity fill, one byte per entry.
274 &set_label("c1stloop",16);
275 &mov (&BP(0,$out,"eax"),&LB("eax")); # key->data[i]=i;
276 &add (&LB("eax"),1); # i++;
277 &jnc (&label("c1stloop"));
# Byte variant, stage 2: same mixing as w2ndloop on byte-sized entries.
283 &set_label("c2ndloop",16);
284 &mov (&LB("eax"),&BP(0,$out,$ido));
285 &add (&LB($idx),&BP(0,$inp,$idi));
286 &add (&LB($idx),&LB("eax"));
288 &mov (&LB("ebx"),&BP(0,$out,$idx));
289 &jnz (&label("cnowrap"));
290 &mov ($idi,&DWP(-4,$out));
291 &set_label("cnowrap");
292 &mov (&BP(0,$out,$idx),&LB("eax"));
293 &mov (&BP(0,$out,$ido),&LB("ebx"));
295 &jnc (&label("c2ndloop"));
# Write -1 into the dword at offset 256 from the table start; RC4 tests
# for this marker to pick its byte-wide code path.
297 &mov (&DWP(256,$out),-1); # mark schedule as compressed
# Reset the stream state; eax is expected to hold 0 here (its zeroing is
# not visible in this excerpt).
301 &mov (&DWP(-8,$out),"eax"); # key->x=0;
302 &mov (&DWP(-4,$out),"eax"); # key->y=0;
303 &function_end("RC4_set_key");
305 # const char *RC4_options(void);
# Emit RC4_options, which returns a pointer into the string table at
# "opts". The call/label pair is the usual way to obtain the current
# instruction address for position-independent addressing.
# NOTE(review): the instruction that pops that address into eax, and the
# code between the jnc and the "opts" label, are not visible in this excerpt.
306 &function_begin_B("RC4_options");
307 &call (&label("pic_point"));
308 &set_label("pic_point");
# eax = eax + (opts - pic_point), i.e. the address of the first string.
310 &lea ("eax",&DWP(&label("opts")."-".&label("pic_point"),"eax"));
# Test bit 20 of the dword at OPENSSL_ia32cap_P; when clear, branch to
# "skip" (presumably leaving eax on the first string -- confirm).
311 &picmeup("edx","OPENSSL_ia32cap_P");
312 &bt (&DWP(0,"edx"),20);
313 &jnc (&label("skip"));
# String table, aligned to 64: the two option strings, then the version
# string.
317 &set_label("opts",64);
318 &asciz ("rc4(8x,int)");
319 &asciz ("rc4(1x,char)");
320 &asciz ("RC4 for x86, OpenSSL project"); # RC4_version
322 &function_end_B("RC4_options");