crypto/cast/asm/cast-586.pl: +5% on PIII and remove obsolete readme.
[openssl.git] / crypto / cast / asm / cast-586.pl
#!/usr/local/bin/perl

# Generator script: drives the helpers loaded from x86asm.pl and cbc.pl to
# produce the routines CAST_encrypt, CAST_decrypt and CAST_cbc_encrypt.
# The output flavour is selected by the command-line arguments handed to
# asm_init below.

# This flag makes the inner loop one cycle longer, but generates
# code that runs 30% faster on the Pentium Pro/II and 44% faster
# on the PIII, while being only 7% slower on the Pentium.
# By default, this flag is on.
$ppro=1;

# Directory part of the path this script was started with (including the
# trailing separator).  When $0 has no directory component the match fails;
# fall back to "./" explicitly rather than reading an unset $1, which would
# push an empty entry onto @INC.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "./";
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
require "cbc.pl";

# First argument is passed through as-is; the third argument is true when
# the last command-line argument is the literal string "386".
&asm_init($ARGV[0],"cast-586.pl",$ARGV[$#ARGV] eq "386");

$CAST_ROUNDS=16;

# Register assignment used by the generator subs below.
$L="edi";
$R="esi";
$K="ebp";
$tmp1="ecx";
$tmp2="ebx";
$tmp3="eax";
$tmp4="edx";

# Names of the four external lookup tables.
$S1="CAST_S_table0";
$S2="CAST_S_table1";
$S3="CAST_S_table2";
$S4="CAST_S_table3";

# Operation triples ($OP1,$OP2,$OP3) handed to E_CAST; the rounds cycle
# through @F1, @F2, @F3.
@F1=("add","xor","sub");
@F2=("xor","sub","add");
@F3=("sub","add","xor");

&CAST_encrypt("CAST_encrypt",1);
&CAST_encrypt("CAST_decrypt",0);
&cbc("CAST_cbc_encrypt","CAST_encrypt","CAST_decrypt",1,4,5,3,-1,-1);

&asm_finish();
38
# CAST_encrypt($name, $enc)
#
# Generates one complete block routine named $name: the encrypting variant
# when $enc is true, the decrypting variant otherwise.
#
# As driven by the calls below, the generated routine
#   - takes the address of two 32-bit words as parameter 0 (loaded into $L
#     and $R) and the address of the key data as parameter 1 (kept in $K);
#   - reads the dword at offset 128 of the key data as a "short key" flag;
#     when that flag is non-zero, rounds 12..15 are jumped over;
#   - runs rounds 0..15 (encrypt) or 15..0 (decrypt) through E_CAST, swapping
#     the roles of $L and $R every round and cycling through @F1/@F2/@F3;
#   - stores $L at offset 4 and $R at offset 0 of the parameter-0 buffer.
sub CAST_encrypt {
    local($name,$enc)=@_;

    # Extra declarations passed to function_begin_B along with the name.
    local($win_ex)=<<"EOF";
EXTERN  _CAST_S_table0:DWORD
EXTERN  _CAST_S_table1:DWORD
EXTERN  _CAST_S_table2:DWORD
EXTERN  _CAST_S_table3:DWORD
EOF
    &main::external_label(qw(CAST_S_table0 CAST_S_table1 CAST_S_table2 CAST_S_table3));

    &function_begin_B($name,$win_ex);
    &comment("");

    # Prologue: save registers and pick up both parameters.
    &push("ebp");
    &push("ebx");
    my $data_arg = &wparam(0);
    &mov($tmp2, $data_arg);
    my $key_arg = &wparam(1);
    &mov($K, $key_arg);
    &push("esi");
    &push("edi");

    &comment("Load the 2 words");
    my $word0 = &DWP(0, $tmp2, "", 0);
    &mov($L, $word0);
    my $word1 = &DWP(4, $tmp2, "", 0);
    &mov($R, $word1);

    &comment('Get short key flag');
    my $flag_slot = &DWP(128, $K, "", 0);
    &mov($tmp3, $flag_slot);
    if (!$enc) {
        # Decrypt: a non-zero flag skips straight past rounds 15..12.
        &or($tmp3, $tmp3);
        &jnz(&label('cast_dec_skip'));
    } else {
        # Encrypt: keep the flag on the stack until round 11 is done.
        &push($tmp3);
    }

    &xor($tmp3, $tmp3);

    # Round N always uses operation set number N mod 3.
    my @ops_for = (\@F1, \@F2, \@F3);

    if ($enc) {
        # Rounds 0..15; even rounds update $L from $R, odd rounds the reverse.
        foreach my $round (0 .. 15) {
            if ($round == 12) {
                &comment('test short key flag');
                &pop($tmp4);
                &or($tmp4, $tmp4);
                &jnz(&label('cast_enc_done'));
            }
            my ($dst, $src) = ($round % 2) ? ($R, $L) : ($L, $R);
            &E_CAST($round, $S, $dst, $src, $K, @{ $ops_for[$round % 3] },
                    $tmp1, $tmp2, $tmp3, $tmp4);
        }
    } else {
        # Rounds 15..0; odd rounds update $L from $R, even rounds the reverse.
        foreach my $round (reverse(0 .. 15)) {
            if ($round == 11) {
                &set_label('cast_dec_skip');
            }
            my ($dst, $src) = ($round % 2) ? ($L, $R) : ($R, $L);
            &E_CAST($round, $S, $dst, $src, $K, @{ $ops_for[$round % 3] },
                    $tmp1, $tmp2, $tmp3, $tmp4);
        }
    }

    if ($enc) {
        &set_label('cast_enc_done');
    }
    # Why the nop? - Ben 17/1/99
    &nop();

    # Epilogue: write both halves back to the caller's buffer, $L into the
    # second word and $R into the first.
    my $out_arg = &wparam(0);
    &mov($tmp3, $out_arg);
    &mov(&DWP(4, $tmp3, "", 0), $L);
    &mov(&DWP(0, $tmp3, "", 0), $R);
    &function_end($name);
}
132
# E_CAST($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)
#
# Generates the instruction sequence for round $i.
#
#   $i            round number; selects the 8-byte slot at $i*8 off $K
#   $S            not used by this sub
#   $L            register updated by the round (xor-ed with the result)
#   $R            register read as the round input
#   $K            register used as base for the two per-round key words
#   $OP1..$OP3    names of the emitter subs used to combine values
#   $tmp1..$tmp4  scratch registers
#
# The order of the emitter calls is the generated instruction order and is
# kept exactly as scheduled by the original author.
sub E_CAST {
    local($i,$S,$L,$R,$K,$OP1,$OP2,$OP3,$tmp1,$tmp2,$tmp3,$tmp4)=@_;
    # Ri needs to have 16 pre added.

    my $slot = $i * 8;

    &comment("round $i");

    # First key word goes to $tmp4, second to $tmp1; $tmp4 is combined
    # with $R through $OP1 and rotated left by the low byte of $tmp1.
    my $key_word0 = &DWP($slot, $K, "", 1);
    &mov($tmp4, $key_word0);
    my $key_word1 = &DWP($slot + 4, $K, "", 1);
    &mov($tmp1, $key_word1);
    &$OP1($tmp4, $R);
    &rotl($tmp4, &LB($tmp1));

    # Split $tmp4 into byte-sized table indices.  Two schedules exist; the
    # $ppro one spends extra instructions clearing $tmp1/$tmp3 before their
    # low bytes are written.
    if (!$ppro) {
        &mov($tmp2, $tmp4);                     # B
        &movb(&LB($tmp1), &HB($tmp4));          # A     # BAD BAD BAD

        &shr($tmp4, 16);
        &and($tmp2, 0xff);
    } else {
        &xor($tmp1, $tmp1);
        &mov($tmp2, 0xff);

        &movb(&LB($tmp1), &HB($tmp4));          # A
        &and($tmp2, $tmp4);

        &shr($tmp4, 16);
        &xor($tmp3, $tmp3);
    }

    &movb(&LB($tmp3), &HB($tmp4));              # C     # BAD BAD BAD
    &and($tmp4, 0xff);                          # D

    # $tmp1 starts as the $S1 entry; the $S2, $S3 and $S4 entries are then
    # fetched into $tmp2 one at a time and folded in with $OP2, $OP3, $OP1.
    &mov($tmp1, &DWP($S1, "", $tmp1, 4));
    foreach my $step ([$OP2, $S2, $tmp2], [$OP3, $S3, $tmp3], [$OP1, $S4, $tmp4]) {
        my ($combine, $table, $index) = @$step;
        &mov($tmp2, &DWP($table, "", $index, 4));
        &$combine($tmp1, $tmp2);
    }

    # Fold the round result into $L.
    &xor($L, $tmp1);
}
180