AES-NI engine jumbo update.
[openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
14
$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)

# Locate the directory this script lives in, make it and the perlasm
# directory two levels up searchable, and load the x86 assembler back-end.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Instruction used for every key-schedule load and store. This used to be
# written as a conditional on $RREFIX eq "aseni"; both the variable name
# and the string were misspelled, so the test was always false and movups
# was always selected. The selection is now spelled out. movups is kept
# on purpose: it has no alignment requirement, whereas movaps faults on
# operands that are not 16-byte aligned.
# NOTE(review): switch to movaps for $PREFIX eq "aesni" only after
# confirming that every caller passes a 16-byte aligned AES_KEY.
$movekey = *movups;
26
# General-purpose registers used throughout the module.
($len,$rounds,$key)=("eax","ecx","edx");
($inp,$out)=("esi","edi");
($rounds_,$key_)=("ebx","ebp");         # backup copies for $rounds and $key

# XMM registers: data blocks, the two alternating round-key registers,
# the CBC chaining value and saved ciphertext blocks.
($inout0,$inout1,$inout2)=("xmm0","xmm1","xmm2");
($rndkey0,$rndkey1)=("xmm3","xmm4");
$ivec="xmm5";
$in0="xmm6";
# xmm7 has two names: $in1 and $inout3 refer to the same register, so a
# value kept in $in1 is lost once a fourth data block is loaded.
$in1="xmm7";
$inout3="xmm7";
43
# Inline version of internal aesni_[en|de]crypt1.
#
# Emits the single-block cipher directly at the point of the call instead
# of emitting a callable function. The argument is "enc" or "dec" and is
# used both to pick the aes<dir>/aes<dir>last instructions and to name the
# loop label. The emitted code transforms $inout0 in place, advances $key
# through the schedule and counts $rounds down to zero, so callers that
# need $key/$rounds afterwards must restore them.
sub aesni_inline_generate1
{ my $dir=shift;
  my $loop="${dir}1_loop";
  my $aes="&aes${dir}";

    # Round 0: load the first two round keys and whiten the block.
    &$movekey($rndkey0,&QWP(0,$key));
    &$movekey($rndkey1,&QWP(16,$key));
    &lea($key,&DWP(32,$key));
    &pxor($inout0,$rndkey0);

    # One round per iteration. The key load and lea sit between dec and
    # jnz; neither touches the flags, so jnz still tests the dec result.
    &set_label($loop);
        eval "${aes}($inout0,$rndkey1)";
        &dec($rounds);
        &$movekey($rndkey1,&QWP(0,$key));
        &lea($key,&DWP(16,$key));
    &jnz(&label($loop));

    # Final round uses the key fetched by the last loop iteration.
    eval "${aes}last($inout0,$rndkey1)";
}
60
# Emits the private function _aesni_<dir>rypt1: a single-block cipher with
# the round loop fully unrolled. The argument is "enc" or "dec".
#
# The emitted code compares $rounds with 11 and enters the unrolled
# sequence at one of three points: below 11 jumps to the "<dir>128" label,
# exactly 11 to "<dir>192", anything else falls through and runs all
# rounds. $key is advanced so that the tail shared by all three paths
# always addresses its round keys at offsets 0..0x70.
sub aesni_generate1     # fully unrolled loop
{ my $dir=shift;
  my $aes="&aes${dir}";
  my @rk=($rndkey1,$rndkey0);           # registers alternate, $rndkey1 first
  my %entry=(-0x20=>"${dir}192", 0=>"${dir}128");

    &function_begin_B("_aesni_${dir}rypt1");
        &$movekey($rndkey0,&QWP(0,$key));
        &$movekey($rndkey1,&QWP(0x10,$key));
        &cmp($rounds,11);               # flags are consumed by jb/je below
        &pxor($inout0,$rndkey0);
        &$movekey($rndkey0,&QWP(0x20,$key));
        &lea($key,&DWP(0x30,$key));
        &jb(&label("${dir}128"));
        &lea($key,&DWP(0x20,$key));
        &je(&label("${dir}192"));
        &lea($key,&DWP(0x20,$key));

        # Each step applies one round with the key already in a register
        # and then refills that register from the next unread slot.
        my $step=0;
        for (my $off=-0x40; $off<=0x70; $off+=0x10) {
            my $reg=$rk[$step&1];
            &set_label($entry{$off}) if (exists $entry{$off});
            eval "${aes}($inout0,$reg)";
            &$movekey($reg,&QWP($off,$key));
            $step++;
        }

        # The last two round keys are already loaded.
        eval "${aes}($inout0,$rndkey1)";
    eval "${aes}last($inout0,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${dir}rypt1");
}
106
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry points. Both have the same shape and differ only in
# the direction handed to the inline generator, so they are emitted from
# one loop, encrypt first. "eax" is used for the input pointer and then
# reused for the output pointer; the round counter is read from offset
# 240 of the key structure.
foreach my $dir ("enc","dec") {
    my $name="${PREFIX}_${dir}rypt";

    &function_begin_B($name);
        &mov    ("eax",&wparam(0));             # inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # load the block
        &mov    ($rounds,&DWP(240,$key));
        &mov    ("eax",&wparam(1));             # out
        &aesni_inline_generate1($dir);          # inlined single-block cipher
        &movups (&QWP(0,"eax"),$inout0);        # store the result
        &ret    ();
    &function_end_B($name);
}
132 \f
# _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
# factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
# latency is 6, it turned out that it can be scheduled only every
# *second* cycle. Thus 3x interleave is the one providing optimal
# utilization, i.e. when the subroutine's throughput is virtually the same
# as that of the non-interleaved subroutine [for up to 3 input blocks].
# This is why it makes no sense to implement a 2x subroutine. As soon
# as/if Intel improves throughput by making it possible to schedule
# the instructions in question *every* cycle, a 6x interleave would have
# to be implemented and used in the loop...
#
# Emits _aesni_<dir>rypt3, which transforms $inout0..$inout2 in place.
# $rounds is halved on entry because each loop iteration applies two
# rounds; the emitted code consumes both $key and $rounds.
sub aesni_generate3
{ my $dir=shift;
  my $loop="${dir}3_loop";
  my $aes="&aes${dir}";
  my @blocks=($inout0,$inout1,$inout2);

    &function_begin_B("_aesni_${dir}rypt3");
        &$movekey($rndkey0,&QWP(0,$key));
        &shr($rounds,1);
        &$movekey($rndkey1,&QWP(16,$key));
        &lea($key,&DWP(32,$key));
        foreach my $blk (@blocks) {
            &pxor($blk,$rndkey0);               # round 0 whitening
        }
        &jmp(&label($loop));                    # hop over alignment padding

    &set_label($loop,16);
        # Key loads, dec and lea are interleaved with the aes instructions;
        # nothing after dec modifies the flags before jnz reads them.
        eval "${aes}($inout0,$rndkey1)";
        &$movekey($rndkey0,&QWP(0,$key));
        eval "${aes}($inout1,$rndkey1)";
        &dec($rounds);
        eval "${aes}($inout2,$rndkey1)";
        &$movekey($rndkey1,&QWP(16,$key));
        eval "${aes}($inout0,$rndkey0)";
        &lea($key,&DWP(32,$key));
        eval "${aes}($inout1,$rndkey0)";
        eval "${aes}($inout2,$rndkey0)";
        &jnz(&label($loop));

    # Last regular round, then the final round with the last key.
    eval "${aes}($inout0,$rndkey1)";
    &$movekey($rndkey0,&QWP(0,$key));
    eval "${aes}($inout1,$rndkey1)";
    eval "${aes}($inout2,$rndkey1)";
    foreach my $blk (@blocks) {
        eval "${aes}last($blk,$rndkey0)";
    }
    &ret();
    &function_end_B("_aesni_${dir}rypt3");
}
177
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# Emits _aesni_<p>rypt4, which transforms $inout0..$inout3 in place. $p is
# "enc" or "dec". $rounds is halved on entry because each loop iteration
# applies two rounds; the emitted code consumes both $key and $rounds.
# $inout3 is the same register as $in1, so callers cannot keep a value in
# $in1 across a call to this routine.
sub aesni_generate4
{ my $p=shift;

    &function_begin_B("_aesni_${p}rypt4");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shr            ($rounds,1);
        &lea            ($key,&DWP(32,$key));
        &pxor           ($inout0,$rndkey0);     # round 0 whitening
        &pxor           ($inout1,$rndkey0);
        &pxor           ($inout2,$rndkey0);
        &pxor           ($inout3,$rndkey0);
        &jmp            (&label("${p}4_loop")); # hop over alignment padding
    # The label carries this routine's own interleave factor; it used to
    # be a verbatim copy of the 3x routine's "3_loop" name.
    &set_label("${p}4_loop",16);
        eval"&aes${p}   ($inout0,$rndkey1)";
        &$movekey       ($rndkey0,&QWP(0,$key));
        eval"&aes${p}   ($inout1,$rndkey1)";
        &dec            ($rounds);              # flags consumed by jnz below
        eval"&aes${p}   ($inout2,$rndkey1)";
        eval"&aes${p}   ($inout3,$rndkey1)";
        &$movekey       ($rndkey1,&QWP(16,$key));
        eval"&aes${p}   ($inout0,$rndkey0)";
        &lea            ($key,&DWP(32,$key));
        eval"&aes${p}   ($inout1,$rndkey0)";
        eval"&aes${p}   ($inout2,$rndkey0)";
        eval"&aes${p}   ($inout3,$rndkey0)";
        &jnz            (&label("${p}4_loop"));
    # Last regular round, then the final round with the last key.
    eval"&aes${p}       ($inout0,$rndkey1)";
    &$movekey           ($rndkey0,&QWP(0,$key));
    eval"&aes${p}       ($inout1,$rndkey1)";
    eval"&aes${p}       ($inout2,$rndkey1)";
    eval"&aes${p}       ($inout3,$rndkey1)";
    eval"&aes${p}last   ($inout0,$rndkey0)";
    eval"&aes${p}last   ($inout1,$rndkey0)";
    eval"&aes${p}last   ($inout2,$rndkey0)";
    eval"&aes${p}last   ($inout3,$rndkey0)";
    &ret();
    &function_end_B("_aesni_${p}rypt4");
}
# Instantiate the interleaved helpers, 3x first and then 4x. The decrypt
# flavours are always emitted; the encrypt flavours are emitted only when
# $PREFIX is "aesni".
foreach my $generate (\&aesni_generate3,\&aesni_generate4) {
    &$generate("enc") if ($PREFIX eq "aesni");
    &$generate("dec");
}
225
if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# ECB mode over whole 16-byte blocks. A length below 16 returns at once;
# otherwise length is rounded down to a multiple of 16. A zero enc selects
# decryption, anything else encryption. The bulk of the data goes through
# the 3x interleaved helper; the remaining one to four blocks are handled
# by dedicated tails. The helpers consume $key and $rounds, so both are
# kept in $key_/$rounds_ and restored after every call.
&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds,&wparam(4));           # enc flag, tested below
        &cmp    ($len,16);
        &jb     (&label("ecb_ret"));
        &and    ($len,-16);
        &test   ($rounds,$rounds);              # flags survive the movs below
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &jz     (&label("ecb_decrypt"));

        # $len is biased by -0x40 so that the loop stops while one to
        # four blocks are still left for the tail.
        &sub    ($len,0x40);
        &jbe    (&label("ecb_enc_tail"));
        &jmp    (&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_encrypt3");
        &sub    ($len,0x30);                    # flags consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("ecb_enc_loop3"));

&set_label("ecb_enc_tail");
        &add    ($len,0x40);                    # undo the bias
        &jz     (&label("ecb_ret"));

        # Blocks are loaded speculatively between the compares; movups
        # does not disturb the flags.
        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_enc_one"));
        &cmp    ($len,0x20);
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_enc_two"));
        &cmp    ($len,0x30);
        &movups ($inout2,&QWP(0x20,$inp));
        &je     (&label("ecb_enc_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &call   ("_aesni_encrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
        &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_two",16);
        # The 3x helper also processes $inout2; that result is discarded.
        &call   ("_aesni_encrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_three",16);
        &call   ("_aesni_encrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_decrypt",16);
        &sub    ($len,0x40);
        &jbe    (&label("ecb_dec_tail"));
        &jmp    (&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_decrypt3");
        &sub    ($len,0x30);                    # flags consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("ecb_dec_loop3"));

&set_label("ecb_dec_tail");
        &add    ($len,0x40);                    # undo the bias
        &jz     (&label("ecb_ret"));

        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_dec_one"));
        &cmp    ($len,0x20);
        &movups ($inout1,&QWP(0x10,$inp));
        &je     (&label("ecb_dec_two"));
        &cmp    ($len,0x30);
        &movups ($inout2,&QWP(0x20,$inp));
        &je     (&label("ecb_dec_three"));
        &movups ($inout3,&QWP(0x30,$inp));
        &call   ("_aesni_decrypt4");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movups (&QWP(0x30,$out),$inout3);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
        &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_two",16);
        # The 3x helper also processes $inout2; that result is discarded.
        &call   ("_aesni_decrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_three",16);
        &call   ("_aesni_decrypt3");
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}
364
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# CBC mode. A zero enc selects decryption, anything else encryption. The
# chaining value is read from *ivp on entry and written back on exit.
#
# Encryption is inherently serial and runs the inlined single-block
# cipher once per block; a trailing partial block is copied to the output
# buffer, zero-padded to 16 bytes and encrypted there. Decryption runs
# three blocks at a time through the interleaved helper, with dedicated
# tails for the last one to four blocks; when length is not a multiple of
# 16 the last decrypted block is staged on the stack and only length%16
# bytes of it are copied out.
#
# A zero length returns without touching *ivp. The early exit used to
# share the "cbc_ret" label, which stores $ivec before that register has
# been loaded.
&function_begin("${PREFIX}_cbc_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &test   ($len,$len);
        &mov    ($key_,&wparam(4));
        &jz     (&label("cbc_abort"));  # nothing to do, leave IV alone

        &cmp    (&wparam(5),0);         # flags survive until je below
        &movups ($ivec,&QWP(0,$key_));  # load IV
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &je     (&label("cbc_decrypt"));

        &movaps ($inout0,$ivec);
        &cmp    ($len,16);
        &jb     (&label("cbc_enc_tail"));
        &sub    ($len,16);
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
        # $inout0 holds the previous ciphertext block (or the IV).
        &movups ($ivec,&QWP(0,$inp));
        &lea    ($inp,&DWP(16,$inp));
        &pxor   ($inout0,$ivec);
        &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
        &sub    ($len,16);              # borrow ends the loop via jnc
        &lea    ($out,&DWP(16,$out));
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-16,$out),$inout0);
        &jnc    (&label("cbc_enc_loop"));
        &add    ($len,16);
        &jnz    (&label("cbc_enc_tail"));
        &movaps ($ivec,$inout0);        # last ciphertext block is the new IV
        &jmp    (&label("cbc_ret"));

&set_label("cbc_enc_tail");
        &mov    ("ecx",$len);           # zaps $rounds
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("ecx",16);             # zero tail
        &sub    ("ecx",$len);
        &xor    ("eax","eax");          # zaps $len
        &data_word(0xAAF3F689);         # rep stosb
        &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($inp,$out);            # $inp and $out are the same
        &mov    ($key,$key_);           # restore $key
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_decrypt",16);
        # $len is biased by -0x40 so that the loop stops while one to
        # four blocks are still left for the tail.
        &sub    ($len,0x40);
        &jbe    (&label("cbc_dec_tail"));
        &jmp    (&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &movaps ($in0,$inout0);         # keep ciphertext for chaining
        &movaps ($in1,$inout1);
        &call   ("_aesni_decrypt3");
        &sub    ($len,0x30);            # flags consumed by ja below
        &lea    ($inp,&DWP(0x30,$inp));
        &lea    ($out,&DWP(0x30,$out));
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups ($ivec,&QWP(-0x10,$inp));       # third ciphertext block
        &pxor   ($inout2,$in1);
        &movups (&QWP(-0x30,$out),$inout0);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(-0x20,$out),$inout1);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(-0x10,$out),$inout2);
        &ja     (&label("cbc_dec_loop3"));

&set_label("cbc_dec_tail");
        &add    ($len,0x40);            # undo the bias
        &jz     (&label("cbc_ret"));

        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x10);
        &movaps ($in0,$inout0);
        &jbe    (&label("cbc_dec_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &cmp    ($len,0x20);
        &movaps ($in1,$inout1);
        &jbe    (&label("cbc_dec_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &cmp    ($len,0x30);
        &jbe    (&label("cbc_dec_three"));
        &movups ($inout3,&QWP(0x30,$inp));      # overwrites $in1 (same register)
        &call   ("_aesni_decrypt4");
        # $in1 is gone and the round-key registers are free after the
        # call, so ciphertext blocks 2 and 3 are re-read into them.
        &movups ($rndkey0,&QWP(0x10,$inp));
        &movups ($rndkey1,&QWP(0x20,$inp));
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups ($ivec,&QWP(0x30,$inp));
        &movups (&QWP(0,$out),$inout0);
        &pxor   ($inout2,$rndkey0);
        &pxor   ($inout3,$rndkey1);
        &movups (&QWP(0x10,$out),$inout1);
        &movups (&QWP(0x20,$out),$inout2);
        &movaps ($inout0,$inout3);      # last block is stored by the common tail
        &lea    ($out,&DWP(0x30,$out));
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one");
        &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
        &pxor   ($inout0,$ivec);
        &movaps ($ivec,$in0);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two");
        &call   ("_aesni_decrypt3");
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups (&QWP(0,$out),$inout0);
        &movaps ($inout0,$inout1);
        &movaps ($ivec,$in1);
        &lea    ($out,&DWP(0x10,$out));
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_three");
        &call   ("_aesni_decrypt3");
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &pxor   ($inout2,$in1);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movaps ($inout0,$inout2);
        &movups ($ivec,&QWP(0x20,$inp));
        &lea    ($out,&DWP(0x20,$out));

&set_label("cbc_dec_tail_collected");
        # $inout0 holds the last plaintext block, $out points at its slot.
        &and    ($len,15);
        &jnz    (&label("cbc_dec_tail_partial"));
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("cbc_ret"));

&set_label("cbc_dec_tail_partial");
        &mov    ($key_,"esp");          # $key_ doubles as saved stack pointer
        &sub    ("esp",16);
        &and    ("esp",-16);            # movaps needs an aligned slot
        &movaps (&QWP(0,"esp"),$inout0);
        &mov    ($inp,"esp");
        &mov    ("ecx",$len);
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("esp",$key_);

&set_label("cbc_ret");
        &mov    ($key_,&wparam(4));
        &movups (&QWP(0,$key_),$ivec);  # output IV
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
523
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#       "eax"   const unsigned char *userKey
#       $rounds int bits
#       $key    AES_KEY *key
# output:
#       "eax"   return code: 0 on success, -1 if either pointer is
#               NULL, -2 if bits is not 128, 192 or 256
#       $rounds number of rounds minus one (9, 11 or 13); the same
#               value is stored at offset 240 of the key structure
#
# The round keys are produced by small local subroutines reached with
# call/ret inside this function. The "_cold" entry points skip the store
# of the previous round key and are used for the first expansion step
# only. $key is advanced while the schedule is written, which is why the
# final stores of $rounds use offsets 80, 48 and 16 rather than 240.

&function_begin_B("_aesni_set_encrypt_key");
        &test   ("eax","eax");
        &jz     (&label("bad_pointer"));
        &test   ($key,$key);
        &jz     (&label("bad_pointer"));

        &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
        &pxor   ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
        &lea    ($key,&DWP(16,$key));
        &cmp    ($rounds,256);
        &je     (&label("14rounds"));
        &cmp    ($rounds,192);
        &je     (&label("12rounds"));
        &cmp    ($rounds,128);
        &jne    (&label("bad_keybits"));

&set_label("10rounds",16);
        &mov            ($rounds,9);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
        &call           (&label("key_128_cold"));
        &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
        &call           (&label("key_128"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(80,$key),$rounds);        # key+160+80 = offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_128",16);
        &$movekey       (&QWP(0,$key),"xmm0");          # store previous round key
        &lea            ($key,&DWP(16,$key));
&set_label("key_128_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("12rounds",16);
        &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
        &mov            ($rounds,11);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
        &call           (&label("key_192a_cold"));
        &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
        &call           (&label("key_192b"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(48,$key),$rounds);        # key+192+48 = offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_192a",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
        &movaps         ("xmm5","xmm2");                # saved for key_192b
&set_label("key_192b_warm");
        &shufps         ("xmm4","xmm0",0b00010000);
        &movaps         ("xmm3","xmm2");
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pslldq         ("xmm3",4);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
        &pxor           ("xmm2","xmm3");
        &pxor           ("xmm0","xmm1");
        &pshufd         ("xmm3","xmm0",0b11111111);
        &pxor           ("xmm2","xmm3");
        &ret();

&set_label("key_192b",16);
        # Stores two round keys assembled from xmm5, xmm0 and xmm2, then
        # continues with the common expansion step.
        &movaps         ("xmm3","xmm0");
        &shufps         ("xmm5","xmm0",0b01000100);
        &$movekey       (&QWP(0,$key),"xmm5");
        &shufps         ("xmm3","xmm2",0b01001110);
        &$movekey       (&QWP(16,$key),"xmm3");
        &lea            ($key,&DWP(32,$key));
        &jmp            (&label("key_192b_warm"));

&set_label("14rounds",16);
        &movups         ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
        &mov            ($rounds,13);
        &lea            ($key,&DWP(16,$key));
        &$movekey       (&QWP(-32,$key),"xmm0");        # round 0
        &$movekey       (&QWP(-16,$key),"xmm2");        # round 1
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 2
        &call           (&label("key_256a_cold"));
        &aeskeygenassist("xmm1","xmm0",0x01);           # round 3
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x02);           # round 4
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x02);           # round 5
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x04);           # round 6
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x04);           # round 7
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x08);           # round 8
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x08);           # round 9
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x10);           # round 10
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x10);           # round 11
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x20);           # round 12
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x20);           # round 13
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x40);           # round 14
        &call           (&label("key_256a"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(16,$key),$rounds);        # key+224+16 = offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_256a",16);
        &$movekey       (&QWP(0,$key),"xmm2");
        &lea            ($key,&DWP(16,$key));
&set_label("key_256a_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("key_256b",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));

        &shufps         ("xmm4","xmm2",0b00010000);
        &pxor           ("xmm2","xmm4");
        &shufps         ("xmm4","xmm2",0b10001100);
        &pxor           ("xmm2","xmm4");
        &pshufd         ("xmm1","xmm1",0b10101010);     # critical path
        &pxor           ("xmm2","xmm1");
        &ret();

&set_label("bad_pointer",4);
        &mov    ("eax",-1);
        &ret    ();
&set_label("bad_keybits",4);
        &mov    ("eax",-2);
        &ret    ();
&function_end_B("_aesni_set_encrypt_key");
712
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: moves the three stack arguments into the registers the
# private _aesni_set_encrypt_key routine expects and returns whatever
# that routine leaves in "eax".
{
    my $entry="${PREFIX}_set_encrypt_key";
    my @arg_reg=("eax",$rounds,$key);   # userKey, bits, key - in argument order

    &function_begin_B($entry);
        foreach my $n (0..$#arg_reg) {
            &mov($arg_reg[$n],&wparam($n));
        }
        &call("_aesni_set_encrypt_key");
        &ret();
    &function_end_B($entry);
}
722
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and converts
# it in place into a decryption schedule: the round keys are reversed in
# order, and every key except the first and the last is passed through
# aesimc. A non-zero return code from the key setup is passed through
# unchanged and the schedule is left as that routine produced it.
&function_begin_B("${PREFIX}_set_decrypt_key");
        &mov    ("eax",&wparam(0));
        &mov    ($rounds,&wparam(1));
        &mov    ($key,&wparam(2));
        &call   ("_aesni_set_encrypt_key");
        &mov    ($key,&wparam(2));      # the call advanced $key, reload it
        &shl    ($rounds,4);    # rounds-1 after _aesni_set_encrypt_key
        &test   ("eax","eax");
        &jnz    (&label("dec_key_ret"));
        &lea    ("eax",&DWP(16,$key,$rounds));  # end of key schedule

        &$movekey       ("xmm0",&QWP(0,$key));  # just swap
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &$movekey       (&QWP(0,"eax"),"xmm0");
        &$movekey       (&QWP(0,$key),"xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));

&set_label("dec_key_inverse");
        # $key walks up and "eax" walks down until they meet in the middle.
        &$movekey       ("xmm0",&QWP(0,$key));  # swap and inverse
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &aesimc         ("xmm0","xmm0");
        &aesimc         ("xmm1","xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));
        &cmp            ("eax",$key);           # flags survive the stores below
        &$movekey       (&QWP(16,"eax"),"xmm0");
        &$movekey       (&QWP(-16,$key),"xmm1");
        &ja             (&label("dec_key_inverse"));

        &$movekey       ("xmm0",&QWP(0,$key));  # inverse middle
        &aesimc         ("xmm0","xmm0");
        &$movekey       (&QWP(0,$key),"xmm0");

        &xor            ("eax","eax");          # return success
&set_label("dec_key_ret");
        &ret    ();
&function_end_B("${PREFIX}_set_decrypt_key");
# Append the identification string to the generated module, then let the
# perlasm back-end finish the output.
my $banner="AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>";
&asciz($banner);

&asm_finish();