fe0cbe0b5e6950d1b06ac9532bd62d3c44e1a60f
[openssl.git] / crypto / aes / asm / aesni-x86.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
13 # details].
14
$PREFIX="aesni";        # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-586.pl:-)

# Make x86asm.pl reachable both from the script's own directory and
# from the perlasm directory two levels up, then load it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";

&asm_init($ARGV[0],$0);

# Instruction used for every round-key load and store.  The selector
# that used to live here compared an undefined variable ($RREFIX) with
# a misspelled prefix ("aseni"), so its movaps branch could never be
# taken and movups was always emitted.  That effective behaviour is
# kept and spelled out: movups places no alignment requirement on the
# key schedule.  Switching to movaps is only safe if every caller is
# known to pass a 16-byte aligned key schedule.
$movekey = *movups;

# General-purpose register roles.
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx";         # holds a backup copy of $rounds
$key_="ebp";            # holds a backup copy of $key

# SSE register roles.
$inout0="xmm0";
$inout1="xmm1";
$inout2="xmm2";
$rndkey0="xmm3";
$rndkey1="xmm4";
$ivec="xmm5";
$in0="xmm6";
$in1="xmm7";
43
# Emits the private routine _aesni_${p}rypt1 in its compact, looping
# form.  $p is spliced into the routine name and into the instruction
# mnemonics (aes${p}, aes${p}last); the generators in this file are
# called with "enc" or "dec".
#
# Generated code: takes the block in $inout0, the key schedule pointer
# in $key and the round count in $rounds; returns the result in
# $inout0 and modifies $key, $rounds, $rndkey0 and $rndkey1.
#
# The mnemonics are invoked by name instead of through string eval, so
# a failure inside the emitter is no longer silently discarded.
#
# NOTE(review): nothing in the visible part of this file calls this
# generator; aesni_generate1 emits the same symbol in unrolled form.
sub _aesni_generate1    # folded loop
{ my $p=shift;
  my $round="aes${p}";          # per-round instruction
  my $final="aes${p}last";      # last-round instruction
  my $top="${p}1_loop";

    &function_begin_B("_aesni_${p}rypt1");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &lea            ($key,&DWP(16,$key));
        &pxor           ($inout0,$rndkey0);     # round 0: xor with first key
        &dec            ($rounds);              # last round is done after the loop
    &set_label($top,16);
        &$round         ($inout0,$rndkey1);
        &dec            ($rounds);
        &lea            ($key,&DWP(16,$key));
        &$movekey       ($rndkey1,&QWP(0,$key));        # fetch next round key
        &jnz            (&label($top));         # branches on the dec above
    &$final             ($inout0,$rndkey1);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
63
# Emits the private routine _aesni_${p}rypt1 with all rounds unrolled.
# $p is spliced into the routine name, the labels and the instruction
# mnemonics (aes${p}, aes${p}last); this file calls it with "enc" and
# "dec".
#
# Generated code: takes the block in $inout0, the key schedule pointer
# in $key and the round count in $rounds (compared against 12 to pick
# the entry point: below -> "${p}128", equal -> "${p}192", above ->
# fall through).  The result is left in $inout0; $key, $rndkey0 and
# $rndkey1 are modified.
#
# The mnemonics are invoked by name instead of through string eval, so
# a failure inside the emitter is no longer silently discarded.
sub aesni_generate1     # fully unrolled loop
{ my $p=shift;
  my $round="aes${p}";
  my $final="aes${p}last";
  my @rk=($rndkey1,$rndkey0);   # the two key registers are used alternately
  my $n=0;                      # number of steps emitted so far
  my $off=-0x40;                # displacement of the next round key to fetch

  # One unrolled step: apply the round key already sitting in the
  # current register, then refill that register from the schedule.
  my $step=sub {
        my $reg=$rk[$n++ & 1];
        &$round         ($inout0,$reg);
        &$movekey       ($reg,&QWP($off,$key));
        $off+=0x10;
  };

    &function_begin_B("_aesni_${p}rypt1");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(0x10,$key));
        &cmp            ($rounds,12);
        &pxor           ($inout0,$rndkey0);
        &$movekey       ($rndkey0,&QWP(0x20,$key));
        &lea            ($key,&DWP(0x30,$key));
        &jb             (&label("${p}128"));
        &lea            ($key,&DWP(0x20,$key));
        &je             (&label("${p}192"));
        &lea            ($key,&DWP(0x20,$key));
        &$step() for (1..2);            # displacements -0x40, -0x30
    &set_label("${p}192");
        &$step() for (1..2);            # displacements -0x20, -0x10
    &set_label("${p}128");
        &$step() for (1..8);            # displacements 0 .. 0x70
        &$round         ($inout0,$rndkey1);
    &$final             ($inout0,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt1");
}
109
# Single-block entry points.  For each direction the private
# _aesni_{en,de}crypt1 core is emitted first, followed by the public
# wrapper that calls it:
#
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
foreach my $way ("enc","dec") {
    my $public="${PREFIX}_${way}rypt";

    &aesni_generate1($way);

    &function_begin_B($public);
        &mov    ("eax",&wparam(0));             # inp
        &mov    ($key,&wparam(2));
        &movups ($inout0,&QWP(0,"eax"));        # fetch the input block
        &mov    ($rounds,&DWP(240,$key));       # round count at offset 240 of *key
        &mov    ("eax",&wparam(1));             # out
        &call   ("_aesni_${way}rypt1");
        &movups (&QWP(0,"eax"),$inout0);        # store the result block
        &ret    ();
    &function_end_B($public);
}
135
136 # _aesni_[en|de]crypt3 are private interfaces, 3 denotes interleave
137 # factor. Why 3x? Even though aes[enc|dec] latency is 6, it turned
138 # out that it can be scheduled only every *second* cycle. Thus 3x
139 # interleave is the one providing optimal utilization, i.e. when
140 # subroutine's throughput is virtually same as of non-interleaved
141 # subroutine for number of input blocks up to 3. This is why it
142 # handles even double-block inputs. Larger interleave factor would
143 # perform suboptimally on shorter inputs... 
144
# Emits the private routine _aesni_${p}rypt3, which pushes three blocks
# ($inout0, $inout1, $inout2) through the cipher side by side.  $p is
# spliced into the routine name, the loop label and the instruction
# mnemonics (aes${p}, aes${p}last).
#
# Generated code: takes the key schedule pointer in $key and the round
# count in $rounds; the round count is halved because every loop
# iteration applies two round keys.  Results stay in $inout0..2; $key,
# $rounds, $rndkey0 and $rndkey1 are modified.
#
# The mnemonics are invoked by name instead of through string eval, so
# a failure inside the emitter is no longer silently discarded.
sub aesni_generate3
{ my $p=shift;
  my $round="aes${p}";
  my $final="aes${p}last";
  my $top="${p}3_loop";

    &function_begin_B("_aesni_${p}rypt3");
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$movekey       ($rndkey1,&QWP(16,$key));
        &shr            ($rounds,1);
        &lea            ($key,&DWP(32,$key));
        &pxor           ($inout0,$rndkey0);
        &pxor           ($inout1,$rndkey0);
        &dec            ($rounds);
        &pxor           ($inout2,$rndkey0);
        &jmp            (&label($top));
    &set_label($top,16);
        &$round         ($inout0,$rndkey1);
        &$movekey       ($rndkey0,&QWP(0,$key));
        &$round         ($inout1,$rndkey1);
        &dec            ($rounds);
        &$round         ($inout2,$rndkey1);
        &$movekey       ($rndkey1,&QWP(16,$key));
        &$round         ($inout0,$rndkey0);
        &lea            ($key,&DWP(32,$key));
        &$round         ($inout1,$rndkey0);
        &$round         ($inout2,$rndkey0);
        &jnz            (&label($top));         # branches on the dec above
    &$round             ($inout0,$rndkey1);
    &$movekey           ($rndkey0,&QWP(0,$key));        # key for the final round
    &$round             ($inout1,$rndkey1);
    &$round             ($inout2,$rndkey1);
    &$final             ($inout0,$rndkey0);
    &$final             ($inout1,$rndkey0);
    &$final             ($inout2,$rndkey0);
    &ret();
    &function_end_B("_aesni_${p}rypt3");
}
# The encrypting variant is only emitted for the "aesni" prefix; the
# decrypting variant is always emitted.
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
182
if ($PREFIX eq "aesni") {
# void aesni_ecb_encrypt (const void *in, void *out,
#                         size_t length, const AES_KEY *key,
#                         int enc);
#
# ECB bulk routine, emitted only for the "aesni" prefix.
#   in, out  source and destination buffers
#   length   byte count; less than 16 returns at once, otherwise it is
#            rounded down to a multiple of 16
#   key      key schedule, round count read from offset 240
#   enc      zero selects decryption, anything else encryption
#
# Blocks are processed three at a time through _aesni_{en,de}crypt3;
# a remainder of one block goes through _aesni_{en,de}crypt1 and a
# remainder of two through the 3x routine with the third lane unused.
# $key_ and $rounds_ keep copies of $key and $rounds because the helper
# routines modify both.
#
# Every emitter call below is an explicit, ';'-terminated '&' call.
# Earlier revisions omitted one ';' and several '&' sigils and only
# produced the right output because of how Perl happened to parse them.

&function_begin("aesni_ecb_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &mov    ($rounds,&wparam(4));   # enc flag for now
        &cmp    ($len,16);
        &jb     (&label("ecb_ret"));    # less than one block
        &and    ($len,-16);             # drop any partial trailing block
        &test   ($rounds,$rounds);      # enc == 0 ?
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &jz     (&label("ecb_decrypt"));        # outcome of the test above

        &sub    ($len,0x30);
        &jc     (&label("ecb_enc_tail"));
        &jmp    (&label("ecb_enc_loop3"));

&set_label("ecb_enc_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &lea    ($inp,&DWP(0x30,$inp));
        &call   ("_aesni_encrypt3");
        &movups (&QWP(0,$out),$inout0);
        &sub    ($len,0x30);
        &movups (&QWP(0x10,$out),$inout1);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(0x20,$out),$inout2);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &lea    ($out,&DWP(0x30,$out));
        &jnc    (&label("ecb_enc_loop3"));      # carry comes from the sub above

&set_label("ecb_enc_tail");
        &add    ($len,0x30);            # undo the look-ahead subtraction
        &jz     (&label("ecb_ret"));

        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_enc_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &call   ("_aesni_encrypt3");    # two blocks, third lane is ignored
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_enc_one",16);
        &call   ("_aesni_encrypt1");
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_decrypt",16);
        &sub    ($len,0x30);
        &jc     (&label("ecb_dec_tail"));
        &jmp    (&label("ecb_dec_loop3"));

&set_label("ecb_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_decrypt3");
        &movups (&QWP(0,$out),$inout0);
        &sub    ($len,0x30);
        &lea    ($inp,&DWP(0x30,$inp));
        &movups (&QWP(0x10,$out),$inout1);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(0x20,$out),$inout2);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &lea    ($out,&DWP(0x30,$out));
        &jnc    (&label("ecb_dec_loop3"));      # carry comes from the sub above

&set_label("ecb_dec_tail");
        &add    ($len,0x30);            # undo the look-ahead subtraction
        &jz     (&label("ecb_ret"));

        &cmp    ($len,0x10);
        &movups ($inout0,&QWP(0,$inp));
        &je     (&label("ecb_dec_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &call   ("_aesni_decrypt3");    # two blocks, third lane is ignored
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &jmp    (&label("ecb_ret"));

&set_label("ecb_dec_one",16);
        &call   ("_aesni_decrypt1");
        &movups (&QWP(0,$out),$inout0);

&set_label("ecb_ret");
&function_end("aesni_ecb_encrypt");
}
280
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
#                           size_t length, const AES_KEY *key,
#                           unsigned char *ivp,const int enc);
#
# CBC bulk routine.
#   inp, out  source and destination buffers
#   length    byte count; zero returns immediately
#   key       key schedule, round count read from offset 240
#   ivp       16-byte IV, read on entry and overwritten on exit
#   enc       zero selects decryption, anything else encryption
#
# Encryption runs one block at a time through _aesni_encrypt1; a
# trailing partial block is copied to the output, zero-padded to 16
# bytes there and encrypted in place.  Decryption runs three blocks at
# a time through _aesni_decrypt3 with a tail for the remainder; a
# trailing partial block is decrypted whole and only `length & 15`
# bytes of it are copied out via a scratch slot on the stack.
# $key_ and $rounds_ keep copies of $key and $rounds because the helper
# routines modify both.
#
# A missing ';' in the decrypt loop (after restoring $rounds) is
# supplied here; the emitted instruction stream is unchanged.
&function_begin("${PREFIX}_cbc_encrypt");
        &mov    ($inp,&wparam(0));
        &mov    ($out,&wparam(1));
        &mov    ($len,&wparam(2));
        &mov    ($key,&wparam(3));
        &test   ($len,$len);
        &mov    ($key_,&wparam(4));     # ivp, $key_ is repurposed below
        &je     (&label("cbc_ret"));

        &cmp    (&wparam(5),0);         # enc == 0 ?
        &movups ($ivec,&QWP(0,$key_));  # load IV
        &mov    ($rounds,&DWP(240,$key));
        &mov    ($key_,$key);           # backup $key
        &mov    ($rounds_,$rounds);     # backup $rounds
        &je     (&label("cbc_decrypt"));        # outcome of the cmp above

        &movaps ($inout0,$ivec);
        &cmp    ($len,16);
        &jb     (&label("cbc_enc_tail"));
        &sub    ($len,16);
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_enc_loop",16);
        &movups ($ivec,&QWP(0,$inp));   # $ivec doubles as input scratch here
        &lea    ($inp,&DWP(16,$inp));
        &pxor   ($inout0,$ivec);        # chain with previous ciphertext/IV
        &call   ("_aesni_encrypt1");
        &sub    ($len,16);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(0,$out),$inout0);
        &lea    ($out,&DWP(16,$out));
        &jnc    (&label("cbc_enc_loop"));       # carry comes from the sub above
        &add    ($len,16);
        &jnz    (&label("cbc_enc_tail"));
        &movaps ($ivec,$inout0);        # last ciphertext block is the new IV
        &jmp    (&label("cbc_ret"));

&set_label("cbc_enc_tail");
        &mov    ("ecx",$len);           # zaps $rounds
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("ecx",16);             # zero tail
        &sub    ("ecx",$len);
        &xor    ("eax","eax");          # zaps $len
        &data_word(0xAAF3F689);         # rep stosb
        &lea    ($out,&DWP(-16,$out));  # rewind $out by 1 block
        &mov    ($rounds,$rounds_);     # restore $rounds
        &mov    ($inp,$out);            # $inp and $out are the same
        &mov    ($key,$key_);           # restore $key
        &jmp    (&label("cbc_enc_loop"));

&set_label("cbc_decrypt",16);
        &sub    ($len,0x30);
        &jc     (&label("cbc_dec_tail"));
        &jmp    (&label("cbc_dec_loop3"));

&set_label("cbc_dec_loop3",16);
        &movups ($inout0,&QWP(0,$inp));
        &movups ($inout1,&QWP(0x10,$inp));
        &movups ($inout2,&QWP(0x20,$inp));
        &movaps ($in0,$inout0);         # keep ciphertext for chaining
        &movaps ($in1,$inout1);
        &call   ("_aesni_decrypt3");
        &sub    ($len,0x30);
        &lea    ($inp,&DWP(0x30,$inp));
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups ($ivec,&QWP(-0x10,$inp));       # third ciphertext block -> next IV
        &pxor   ($inout2,$in1);
        &movups (&QWP(0,$out),$inout0);
        &mov    ($rounds,$rounds_);     # restore $rounds
        &movups (&QWP(0x10,$out),$inout1);
        &mov    ($key,$key_);           # restore $key
        &movups (&QWP(0x20,$out),$inout2);
        &lea    ($out,&DWP(0x30,$out));
        &jnc    (&label("cbc_dec_loop3"));      # carry comes from the sub above

&set_label("cbc_dec_tail");
        &add    ($len,0x30);            # undo the look-ahead subtraction
        &jz     (&label("cbc_ret"));

        &movups ($inout0,&QWP(0,$inp));
        &cmp    ($len,0x10);
        &movaps ($in0,$inout0);
        &jbe    (&label("cbc_dec_one"));
        &movups ($inout1,&QWP(0x10,$inp));
        &cmp    ($len,0x20);
        &movaps ($in1,$inout1);
        &jbe    (&label("cbc_dec_two"));
        &movups ($inout2,&QWP(0x20,$inp));
        &call   ("_aesni_decrypt3");
        &pxor   ($inout0,$ivec);
        &movups ($ivec,&QWP(0x20,$inp));
        &pxor   ($inout1,$in0);
        &pxor   ($inout2,$in1);
        &movups (&QWP(0,$out),$inout0);
        &movups (&QWP(0x10,$out),$inout1);
        &movaps ($inout0,$inout2);      # last block is written by the common tail
        &lea    ($out,&DWP(0x20,$out));
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_one");
        &call   ("_aesni_decrypt1");
        &pxor   ($inout0,$ivec);
        &movaps ($ivec,$in0);
        &jmp    (&label("cbc_dec_tail_collected"));

&set_label("cbc_dec_two");
        &call   ("_aesni_decrypt3");    # two blocks, third lane is ignored
        &pxor   ($inout0,$ivec);
        &pxor   ($inout1,$in0);
        &movups (&QWP(0,$out),$inout0);
        &movaps ($inout0,$inout1);      # last block is written by the common tail
        &movaps ($ivec,$in1);
        &lea    ($out,&DWP(0x10,$out));

&set_label("cbc_dec_tail_collected");
        &and    ($len,15);
        &jnz    (&label("cbc_dec_tail_partial"));
        &movups (&QWP(0,$out),$inout0);
        &jmp    (&label("cbc_ret"));

&set_label("cbc_dec_tail_partial");
        &mov    ($key_,"esp");          # remember the stack pointer
        &sub    ("esp",16);
        &and    ("esp",-16);            # 16-byte aligned scratch slot
        &movaps (&QWP(0,"esp"),$inout0);
        &mov    ($inp,"esp");
        &mov    ("ecx",$len);
        &data_word(0xA4F3F689);         # rep movsb
        &mov    ("esp",$key_);          # release the scratch slot

&set_label("cbc_ret");
        &mov    ($key_,&wparam(4));
        &movups (&QWP(0,$key_),$ivec);  # output IV
&function_end("${PREFIX}_cbc_encrypt");
420
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
#       "eax"   const unsigned char *userKey
#       $rounds int bits
#       $key    AES_KEY *key
# output:
#       "eax"   return code
#       $rounds rounds
#
# Return code is 0 on success, -1 if either pointer is NULL and -2 if
# bits is not 128, 192 or 256.  The round count (10, 12 or 14) is left
# in $rounds and also stored at offset 240 from the start of the key
# schedule.  $key is advanced while the schedule is written, so callers
# must reload it if they need the original pointer.
#
# The key_* labels are small local subroutines reached with "call":
# the plain variants first store the round key(s) produced so far and
# advance $key, the "_cold" variants are the entry points used for the
# very first expansion step, when there is nothing to store yet.
#
# A missing ';' (12rounds, round 0 store) and a stray trailing comma in
# one shufps argument list are corrected here; the emitted instruction
# stream is unchanged.

&function_begin_B("_aesni_set_encrypt_key");
        &test   ("eax","eax");
        &jz     (&label("bad_pointer"));
        &test   ($key,$key);
        &jz     (&label("bad_pointer"));

        &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
        &pxor   ("xmm4","xmm4");        # low dword of xmm4 is assumed 0
        &lea    ($key,&DWP(16,$key));
        &cmp    ($rounds,256);
        &je     (&label("14rounds"));
        &cmp    ($rounds,192);
        &je     (&label("12rounds"));
        &cmp    ($rounds,128);
        &jne    (&label("bad_keybits"));

&set_label("10rounds",16);
        &mov            ($rounds,10);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm0",0x01);           # round 1
        &call           (&label("key_128_cold"));
        &aeskeygenassist("xmm1","xmm0",0x2);            # round 2
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x04);           # round 3
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x08);           # round 4
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x10);           # round 5
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x20);           # round 6
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x40);           # round 7
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x80);           # round 8
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x1b);           # round 9
        &call           (&label("key_128"));
        &aeskeygenassist("xmm1","xmm0",0x36);           # round 10
        &call           (&label("key_128"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(80,$key),$rounds);        # schedule offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_128",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));
&set_label("key_128_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("12rounds",16);
        &movq           ("xmm2",&QWP(16,"eax"));        # remaining 1/3 of *userKey
        &mov            ($rounds,12);
        &$movekey       (&QWP(-16,$key),"xmm0");        # round 0
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 1,2
        &call           (&label("key_192a_cold"));
        &aeskeygenassist("xmm1","xmm2",0x02);           # round 2,3
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x04);           # round 4,5
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x08);           # round 5,6
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x10);           # round 7,8
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x20);           # round 8,9
        &call           (&label("key_192b"));
        &aeskeygenassist("xmm1","xmm2",0x40);           # round 10,11
        &call           (&label("key_192a"));
        &aeskeygenassist("xmm1","xmm2",0x80);           # round 11,12
        &call           (&label("key_192b"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(48,$key),$rounds);        # schedule offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_192a",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
        &movaps         ("xmm5","xmm2");
&set_label("key_192b_warm");
        &shufps         ("xmm4","xmm0",0b00010000);
        &movaps         ("xmm3","xmm2");
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pslldq         ("xmm3",4);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b01010101);     # critical path
        &pxor           ("xmm2","xmm3");
        &pxor           ("xmm0","xmm1");
        &pshufd         ("xmm3","xmm0",0b11111111);
        &pxor           ("xmm2","xmm3");
        &ret();

&set_label("key_192b",16);
        &movaps         ("xmm3","xmm0");
        &shufps         ("xmm5","xmm0",0b01000100);
        &$movekey       (&QWP(0,$key),"xmm5");
        &shufps         ("xmm3","xmm2",0b01001110);
        &$movekey       (&QWP(16,$key),"xmm3");
        &lea            ($key,&DWP(32,$key));
        &jmp            (&label("key_192b_warm"));

&set_label("14rounds",16);
        &movups         ("xmm2",&QWP(16,"eax"));        # remaining half of *userKey
        &mov            ($rounds,14);
        &lea            ($key,&DWP(16,$key));
        &$movekey       (&QWP(-32,$key),"xmm0");        # round 0
        &$movekey       (&QWP(-16,$key),"xmm2");        # round 1
        &aeskeygenassist("xmm1","xmm2",0x01);           # round 2
        &call           (&label("key_256a_cold"));
        &aeskeygenassist("xmm1","xmm0",0x01);           # round 3
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x02);           # round 4
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x02);           # round 5
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x04);           # round 6
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x04);           # round 7
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x08);           # round 8
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x08);           # round 9
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x10);           # round 10
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x10);           # round 11
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x20);           # round 12
        &call           (&label("key_256a"));
        &aeskeygenassist("xmm1","xmm0",0x20);           # round 13
        &call           (&label("key_256b"));
        &aeskeygenassist("xmm1","xmm2",0x40);           # round 14
        &call           (&label("key_256a"));
        &$movekey       (&QWP(0,$key),"xmm0");
        &mov            (&DWP(16,$key),$rounds);        # schedule offset 240
        &xor            ("eax","eax");
        &ret();

&set_label("key_256a",16);
        &$movekey       (&QWP(0,$key),"xmm2");
        &lea            ($key,&DWP(16,$key));
&set_label("key_256a_cold");
        &shufps         ("xmm4","xmm0",0b00010000);
        &pxor           ("xmm0","xmm4");
        &shufps         ("xmm4","xmm0",0b10001100);
        &pxor           ("xmm0","xmm4");
        &pshufd         ("xmm1","xmm1",0b11111111);     # critical path
        &pxor           ("xmm0","xmm1");
        &ret();

&set_label("key_256b",16);
        &$movekey       (&QWP(0,$key),"xmm0");
        &lea            ($key,&DWP(16,$key));

        &shufps         ("xmm4","xmm2",0b00010000);
        &pxor           ("xmm2","xmm4");
        &shufps         ("xmm4","xmm2",0b10001100);
        &pxor           ("xmm2","xmm4");
        &pshufd         ("xmm1","xmm1",0b10101010);     # critical path
        &pxor           ("xmm2","xmm1");
        &ret();

&set_label("bad_pointer",4);
        &mov    ("eax",-1);
        &ret    ();
&set_label("bad_keybits",4);
        &mov    ("eax",-2);
        &ret    ();
&function_end_B("_aesni_set_encrypt_key");
609
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public wrapper: moves the three stack arguments into the registers
# that _aesni_set_encrypt_key takes its inputs in and passes that
# routine's return code in "eax" straight back to the caller.
{
    my $entry="${PREFIX}_set_encrypt_key";
    my @arg_reg=("eax",$rounds,$key);   # userKey, bits, key

    &function_begin_B($entry);
    foreach my $i (0..$#arg_reg) {
        &mov    ($arg_reg[$i],&wparam($i));
    }
        &call   ("_aesni_set_encrypt_key");
        &ret    ();
    &function_end_B($entry);
}
619
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Builds the encryption schedule with _aesni_set_encrypt_key and, if
# that succeeded, converts it in place into a decryption schedule:
# the first and last round keys are swapped as they are, the round
# keys in between are swapped pairwise and passed through aesimc, and
# the middle round key is passed through aesimc where it stands.
# Returns 0 on success, otherwise the error code reported by
# _aesni_set_encrypt_key.
#
# A missing ';' after the shl is supplied here; the emitted instruction
# stream is unchanged.
&function_begin_B("${PREFIX}_set_decrypt_key");
        &mov    ("eax",&wparam(0));
        &mov    ($rounds,&wparam(1));
        &mov    ($key,&wparam(2));
        &call   ("_aesni_set_encrypt_key");
        &mov    ($key,&wparam(2));      # reload, the call advanced $key
        &shl    ($rounds,4);    # actually rounds after _aesni_set_encrypt_key
        &test   ("eax","eax");
        &jnz    (&label("dec_key_ret"));        # pass the error code through
        &lea    ("eax",&DWP(0,$key,$rounds));   # end of key schedule

        &$movekey       ("xmm0",&QWP(0,$key));  # just swap
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &$movekey       (&QWP(0,"eax"),"xmm0");
        &$movekey       (&QWP(0,$key),"xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));
        &jmp            (&label("dec_key_inverse"));

&set_label("dec_key_inverse",16);
        &$movekey       ("xmm0",&QWP(0,$key));  # swap and inverse
        &$movekey       ("xmm1",&QWP(0,"eax"));
        &aesimc         ("xmm0","xmm0");
        &aesimc         ("xmm1","xmm1");
        &lea            ($key,&DWP(16,$key));
        &lea            ("eax",&DWP(-16,"eax"));
        &cmp            ("eax",$key);
        &$movekey       (&QWP(16,"eax"),"xmm0");
        &$movekey       (&QWP(-16,$key),"xmm1");
        &ja             (&label("dec_key_inverse"));    # until the pointers meet

        &$movekey       ("xmm0",&QWP(0,$key));  # inverse middle
        &aesimc         ("xmm0","xmm0");
        &$movekey       (&QWP(0,$key),"xmm0");

        &xor            ("eax","eax");          # return success
&set_label("dec_key_ret");
        &ret    ();
&function_end_B("${PREFIX}_set_decrypt_key");
# Emit an identification string, then call asm_finish (not defined in
# this file; presumably supplied by the x86asm.pl framework loaded at
# the top) as the last step of generation.
my $ident="AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>";
&asciz($ident);

&asm_finish();