RC4 tune-up for Intel P4 core, both 32- and 64-bit ones. As it's
[openssl.git] / crypto / rc4 / asm / rc4-586.pl
#!/usr/local/bin/perl

# Generator script: emits an x86 assembler implementation of RC4 through
# the perlasm helper layer loaded from x86asm.pl.  The first command-line
# argument is handed to asm_init() unchanged.

# define for pentium pro friendly version
# NOTE(review): no variable is actually defined after the comment above in
# the visible code -- looks like a leftover; confirm before removing.

push(@INC,"perlasm","../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],"rc4-586.pl");

# Register assignment shared by RC4() and RC4_loop() below.
$x="eax";       # RC4 index x
$y="ebx";       # RC4 index y
$tx="ecx";      # scratch: value loaded from the table at index x
$ty="edx";      # scratch: value loaded from the table at index y, also len/end pointer
$in="esi";      # input pointer  (function argument 2)
$out="edi";     # output pointer (function argument 3)
$d="ebp";       # pointer into the key structure (function argument 0)

# Emit the function body for the symbol "RC4".
&RC4("RC4");

&asm_finish();
21
# RC4_loop($n, $p, $char) -- emit the instructions for one RC4 round
# (one keystream byte).
#
#   $n     round number inside the current group.  Used in the emitted
#          "Round $n" comment and as the byte displacement of the store
#          that ends the round (esp+$n in word mode, out+$n in tail mode).
#   $p     position of the round inside its group:
#            < 0   first round of the group
#            == 0  middle round
#            >= 1  last round of the group (no preload of d[x] for a
#                  following round; in word mode also advances $out by 8)
#   $char  false: "word" mode, used by the 8-bytes-per-iteration loop.
#                 The keystream byte is only parked at esp+$n; the caller
#                 XORs it with the input afterwards.
#          true:  "tail" mode, used for the trailing bytes.  Each round
#                 first checks whether input is exhausted (jumping to the
#                 "finished" label if so) and XORs/stores its byte itself.
#
# Expects on entry: $tx already holds d[x] for this round, and $x is
# already incremented and masked.  In tail mode with $p < 0, $ty must hold
# the value RC4() saved in swtmp(2) (end address minus 8).
#
# A line indented by one extra space appears to mark the second
# instruction of an intended pair -- presumably a Pentium U/V pipe pairing
# hint from the original author; it has no effect on the generated code.
sub RC4_loop
        {
        local($n,$p,$char)=@_;

        &comment("Round $n");

        if ($char)
                {
                if ($p >= 0)
                        {
                        # Later tail rounds: reload the end address and
                        # stop when it is <= the input pointer, otherwise
                        # consume one input byte.
                         &mov($ty,      &swtmp(2));
                        &cmp($ty,       $in);
                         &jbe(&label("finished"));
                        &inc($in);
                        }
                else
                        {
                        # First tail round: $ty holds (end - 8); turn it
                        # into the real end address, consume one input
                        # byte, stop if we ran past the end, and save the
                        # end address for the later rounds.
                        &add($ty,       8);
                         &inc($in);
                        &cmp($ty,       $in);
                         &jb(&label("finished"));
                        &mov(&swtmp(2), $ty);
                        }
                }
        # Moved out
        # &mov( $tx,            &DWP(0,$d,$x,4)) if $p < 0;

        # Core round.  Statement order is significant: it is the order of
        # the emitted instructions.
         &add(  $y,             $tx);                   # y += d[x]
        &and(   $y,             0xff);                  # y &= 0xff
         &inc(  $x);                    # NEXT ROUND
        &mov(   $ty,            &DWP(0,$d,$y,4));       # ty = d[y]
         # XXX
        # x was already incremented above, hence the -4 displacement:
        # this stores to the slot of the *current* round's x.
        &mov(   &DWP(-4,$d,$x,4),$ty);                  # AGI
         &add(  $ty,            $tx);                   # ty = d[x] + d[y]
        &and(   $x,             0xff);  # NEXT ROUND
         &and(  $ty,            0xff);
        &mov(   &DWP(0,$d,$y,4),$tx);                   # d[y] = old d[x]
         &nop();
        &mov(   $ty,            &DWP(0,$d,$ty,4));      # keystream value
        # Preload d[x] for the following round unless this is the last one.
         &mov(  $tx,            &DWP(0,$d,$x,4)) if $p < 1; # NEXT ROUND
         # XXX

        if (!$char)
                {
                #moved up into last round
                if ($p >= 1)
                        {
                        &add(   $out,   8);
                        }
                # Park the keystream byte at esp+$n; RC4() later reads
                # swtmp(0)/swtmp(1) -- presumably these same stack bytes --
                # and XORs them with the input.
                &movb(  &BP($n,"esp","",0),     &LB($ty));
                }
        else
                {
                # $in was already advanced past this byte by the tail-mode
                # prologue above, so the current input byte is at in-1.
                &movb(  &HB($ty),       &BP(-1,$in,"",0));
                 # XXX
                &xorb(&LB($ty),         &HB($ty));
                 # XXX
                # $out is not advanced in tail mode; $n selects the byte.
                &movb(&BP($n,$out,"",0),&LB($ty));
                }
        }
83
84
# RC4($name) -- emit the complete assembler function called $name.
#
# Arguments of the generated function, as read through wparam():
#   0  key  pointer to the key structure: x at offset 0, y at offset 4,
#           state table starting at offset 8
#   1  len  number of bytes to process (returns immediately when 0)
#   2       input pointer  (loaded into $in)
#   3       output pointer (loaded into $out)
#
# Three code paths are generated:
#   "start" loop   8 bytes per iteration, table accessed as 32-bit entries;
#   "end" tail     up to 7 remaining bytes, one RC4_loop(...,1) per byte;
#   "RC4_CHAR"     byte-at-a-time loop with the table accessed as bytes,
#                  taken when the dword at table offset 256 equals -1.
# All paths meet at "finished", which writes x and y back into the key.
#
# Statement order below is the order of the emitted instructions and must
# not be rearranged.
sub RC4
        {
        local($name)=@_;

        &function_begin_B($name,"");

        # Nothing to do for a zero length: return before touching any
        # callee-saved register.
        &mov($ty,&wparam(1));           # len
        &cmp($ty,0);
        &jne(&label("proceed"));
        &ret();
        &set_label("proceed");

        &comment("");

        &push("ebp");
         &push("ebx");
        &push("esi");
         &push("edi");
        &mov(   $d,     &wparam(0));    # key
         &mov(  $in,    &wparam(2));

        &mov(   $x,     &DWP(0,$d,"",1));       # x = key[0]
         &mov(  $y,     &DWP(4,$d,"",1));       # y = key[4]

        &mov(   $out,   &wparam(3));
         &inc(  $x);                            # x is kept one ahead;
                                                # undone at "finished"

        &stack_push(3); # 3 temp variables
         &add(  $d,     8);                     # $d now points at the table
        &and(   $x,             0xff);

        # detect compressed schedule, see commentary section in rc4_skey.c...
        &cmp(&DWP(256,$d),-1);
        &je(&label("RC4_CHAR"));

        # ty = in + len - 8: the last input address from which a full
        # 8-byte group can still be processed.
         &lea(  $ty,    &DWP(-8,$ty,$in));

        # (zero-length input was already rejected at function entry)

         &mov(  &swtmp(2),      $ty);   # this is now address to exit at
        &mov(   $tx,    &DWP(0,$d,$x,4));       # preload d[x] for round 0

         &cmp(  $ty,    $in);
        &jb(    &label("end")); # less than 8 bytes

        &set_label("start");

        # filling DELAY SLOT
        &add(   $in,    8);

        # Eight word-mode rounds; each parks one keystream byte on the stack.
        &RC4_loop(0,-1,0);
        &RC4_loop(1,0,0);
        &RC4_loop(2,0,0);
        &RC4_loop(3,0,0);
        &RC4_loop(4,0,0);
        &RC4_loop(5,0,0);
        &RC4_loop(6,0,0);
        &RC4_loop(7,1,0);

        &comment("apply the cipher text");
        # xor the cipher data with input
        # $in and $out were both already advanced by 8, hence the -8/-4
        # displacements.

        #&add(  $out,   8); #moved up into last round

        &mov(   $tx,    &swtmp(0));
         &mov(  $ty,    &DWP(-8,$in,"",0));
        &xor(   $tx,    $ty);
         &mov(  $ty,    &DWP(-4,$in,"",0));
        &mov(   &DWP(-8,$out,"",0),     $tx);
         &mov(  $tx,    &swtmp(1));
        &xor(   $tx,    $ty);
         &mov(  $ty,    &swtmp(2));     # load end ptr;
        &mov(   &DWP(-4,$out,"",0),     $tx);
         &mov(  $tx,            &DWP(0,$d,$x,4));       # preload d[x]
        # Loop while another full 8-byte group fits (in <= end - 8).
        &cmp($in,       $ty);
         &jbe(&label("start"));

        &set_label("end");

        # There is quite a bit of extra crap in RC4_loop() for this
        # first round
        # At most 7 bytes remain here, so 7 tail-mode rounds suffice; each
        # one jumps to "finished" by itself once the input is exhausted.
        &RC4_loop(0,-1,1);
        &RC4_loop(1,0,1);
        &RC4_loop(2,0,1);
        &RC4_loop(3,0,1);
        &RC4_loop(4,0,1);
        &RC4_loop(5,0,1);
        &RC4_loop(6,1,1);

        &jmp(&label("finished"));

        &align(16);
        # this is essentially Intel P4 specific codepath, see rc4_skey.c...
        &set_label("RC4_CHAR");

        # $ty still holds len on this path; end address = in + len.
        &lea    ($ty,&DWP(0,$in,$ty));
        &mov    (&swtmp(2),$ty);

        # strangely enough unrolled loop performs over 20% slower...
        &set_label("RC4_CHAR_loop");
                &movz   ($tx,&BP(0,$d,$x));             # tx = d[x]
                &add    (&LB($y),&LB($tx));             # y = (y + tx) mod 256
                &movz   ($ty,&BP(0,$d,$y));             # ty = d[y]
                &movb   (&BP(0,$d,$y),&LB($tx));        # swap d[x] and d[y]
                &movb   (&BP(0,$d,$x),&LB($ty));
                &add    (&LB($ty),&LB($tx));            # index of keystream byte
                &movz   ($ty,&BP(0,$d,$ty));
                &xorb   (&LB($ty),&BP(0,$in));          # xor with input byte
                &movb   (&BP(0,$out),&LB($ty));
                &inc    (&LB($x));                      # byte-wide, wraps at 256
                &inc    ($in);
                &inc    ($out);
                &cmp    ($in,&swtmp(2));
        &jb     (&label("RC4_CHAR_loop"));

        &set_label("finished");
        # Undo the "one ahead" increment of x, release the temporaries and
        # write y (dword) and x (low byte) back to key offsets 4 and 0;
        # $d points 8 bytes past the start of the key.
        &dec(   $x);
         &stack_pop(3);
        &mov(   &DWP(-4,$d,"",0),$y);
         &movb( &BP(-8,$d,"",0),&LB($x));

        &function_end($name);
        }
208