3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for details].
# Symbol prefix for the generated public entry points (used below as
# "${PREFIX}_encrypt", "${PREFIX}_cbc_encrypt", ...).
15 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
16 # generates drop-in replacement for
17 # crypto/aes/asm/aes-586.pl:-)
# Capture the directory part of this script's path (everything up to and
# including the last '/' or '\') and add it, plus the perlasm directory
# relative to it, to the module search path.
19 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
20 push(@INC,"${dir}","${dir}../../perlasm");
# NOTE(review): the statement that loads the module defining asm_init is
# not visible in this excerpt -- presumably found via the @INC entries above.
23 &asm_init($ARGV[0],$0);
# Select the instruction used for every round-key load/store; $movekey is
# invoked throughout this file as &$movekey(...).
# NOTE(review): the literal "aseni" does not match the "aesni" value that
# $PREFIX is given above, so this always selects movups. The spelling is
# kept on purpose: correcting it would switch key accesses to movaps, which
# faults on memory operands that are not 16-byte aligned -- confirm the
# alignment guarantee for the key schedule before changing it.
25 $movekey = eval($PREFIX eq "aseni" ? "*movaps" : "*movups");
32 $rounds_="ebx"; # backup copy for $rounds
33 $key_="ebp"; # backup copy for $key
# $in1 and $inout3 name the same register (xmm7), so only one of the two
# roles can be live at any point.
42 $in1="xmm7"; $inout3="xmm7";
44 # Inline version of internal aesni_[en|de]crypt1
# Emits a single-block round loop directly at the call site, transforming
# $inout0 in place. ${p} is interpolated into the mnemonics and the loop
# label (aes${p}, aes${p}last, "${p}1_loop"); callers in this file pass
# "enc" or "dec". The emitted code advances $key.
# NOTE(review): the opening of the sub body (presumably where $p is taken
# from @_) and its closing brace are not visible in this excerpt.
45 sub aesni_inline_generate1
48 &$movekey ($rndkey0,&QWP(0,$key)); # key at offset 0
49 &$movekey ($rndkey1,&QWP(16,$key)); # key at offset 16
50 &lea ($key,&DWP(32,$key)); # step $key past the two keys just loaded
51 &pxor ($inout0,$rndkey0); # xor the block with the first key
52 &set_label("${p}1_loop");
53 eval"&aes${p} ($inout0,$rndkey1)"; # string eval because the mnemonic is built from ${p}
# NOTE(review): the instruction that sets the flags tested by the jnz
# below is not visible in this excerpt.
55 &$movekey ($rndkey1,&QWP(0,$key)); # fetch the next key
56 &lea ($key,&DWP(16,$key));
57 &jnz (&label("${p}1_loop"));
58 eval"&aes${p}last ($inout0,$rndkey1)"; # final round with the last key fetched
# Emits the private routine "_aesni_${p}rypt1": a straight-line (loop-free)
# single-block transform of $inout0. The "${p}192" and "${p}128" labels are
# entry points into the unrolled sequence that skip the leading rounds.
# Keys are loaded alternately into $rndkey1 and $rndkey0, each one ahead of
# the aes${p} instruction that consumes it.
# NOTE(review): the sub's opening lines (presumably where $p is taken from
# @_), the compare feeding the jb/je below, and the routine's return are
# not visible in this excerpt.
61 sub aesni_generate1 # fully unrolled loop
64 &function_begin_B("_aesni_${p}rypt1");
65 &$movekey ($rndkey0,&QWP(0,$key));
66 &$movekey ($rndkey1,&QWP(0x10,$key));
68 &pxor ($inout0,$rndkey0); # xor the block with the first key
69 &$movekey ($rndkey0,&QWP(0x20,$key));
70 &lea ($key,&DWP(0x30,$key)); # $key += 0x30; later offsets are relative to the advanced pointer
71 &jb (&label("${p}128")); # "below": enter at the shortest sequence
72 &lea ($key,&DWP(0x20,$key)); # $key += 0x20 for the two extra rounds
73 &je (&label("${p}192")); # "equal": enter at the middle sequence
74 &lea ($key,&DWP(0x20,$key)); # $key += 0x20 again for the longest sequence
75 eval"&aes${p} ($inout0,$rndkey1)";
76 &$movekey ($rndkey1,&QWP(-0x40,$key));
77 eval"&aes${p} ($inout0,$rndkey0)";
78 &$movekey ($rndkey0,&QWP(-0x30,$key));
79 &set_label("${p}192");
80 eval"&aes${p} ($inout0,$rndkey1)";
81 &$movekey ($rndkey1,&QWP(-0x20,$key));
82 eval"&aes${p} ($inout0,$rndkey0)";
83 &$movekey ($rndkey0,&QWP(-0x10,$key));
84 &set_label("${p}128");
85 eval"&aes${p} ($inout0,$rndkey1)";
86 &$movekey ($rndkey1,&QWP(0,$key));
87 eval"&aes${p} ($inout0,$rndkey0)";
88 &$movekey ($rndkey0,&QWP(0x10,$key));
89 eval"&aes${p} ($inout0,$rndkey1)";
90 &$movekey ($rndkey1,&QWP(0x20,$key));
91 eval"&aes${p} ($inout0,$rndkey0)";
92 &$movekey ($rndkey0,&QWP(0x30,$key));
93 eval"&aes${p} ($inout0,$rndkey1)";
94 &$movekey ($rndkey1,&QWP(0x40,$key));
95 eval"&aes${p} ($inout0,$rndkey0)";
96 &$movekey ($rndkey0,&QWP(0x50,$key));
97 eval"&aes${p} ($inout0,$rndkey1)";
98 &$movekey ($rndkey1,&QWP(0x60,$key));
99 eval"&aes${p} ($inout0,$rndkey0)";
100 &$movekey ($rndkey0,&QWP(0x70,$key)); # last key, consumed by aes${p}last
101 eval"&aes${p} ($inout0,$rndkey1)";
102 eval"&aes${p}last ($inout0,$rndkey0)";
104 &function_end_B("_aesni_${p}rypt1");
107 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
108 # &aesni_generate1("enc");
# Public single-block entry point: loads one block from inp, runs the
# inlined "enc" round loop on it and stores the result to out.
109 &function_begin_B("${PREFIX}_encrypt");
110 &mov ("eax",&wparam(0)); # inp
111 &mov ($key,&wparam(2)); # key
112 &movups ($inout0,&QWP(0,"eax")); # load the input block
113 &mov ($rounds,&DWP(240,$key)); # $rounds is read from offset 240 of the key structure
114 &mov ("eax",&wparam(1)); # out (eax is free again once the block is loaded)
115 &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
116 &movups (&QWP(0,"eax"),$inout0); # store the result
# NOTE(review): the return instruction is not visible in this excerpt.
118 &function_end_B("${PREFIX}_encrypt");
120 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
121 # &aesni_generate1("dec");
# Public single-block entry point: loads one block from inp, runs the
# inlined "dec" round loop on it and stores the result to out.
122 &function_begin_B("${PREFIX}_decrypt");
123 &mov ("eax",&wparam(0)); # inp
124 &mov ($key,&wparam(2)); # key
125 &movups ($inout0,&QWP(0,"eax")); # load the input block
126 &mov ($rounds,&DWP(240,$key)); # $rounds is read from offset 240 of the key structure
127 &mov ("eax",&wparam(1)); # out (eax is free again once the block is loaded)
128 &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
129 &movups (&QWP(0,"eax"),$inout0); # store the result
# NOTE(review): the return instruction is not visible in this excerpt.
131 &function_end_B("${PREFIX}_decrypt");
133 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
134 # factor. Why 3x subroutine is used in loops? Even though aes[enc|dec]
135 # latency is 6, it turned out that it can be scheduled only every
136 # *second* cycle. Thus 3x interleave is the one providing optimal
137 # utilization, i.e. when subroutine's throughput is virtually same as
138 # of non-interleaved subroutine [for number of input blocks up to 3].
139 # This is why it makes no sense to implement 2x subroutine. As soon
140 # as/if Intel improves throughput by making it possible to schedule
141 # the instructions in question *every* cycle I would have to
142 # implement 6x interleave and use it in loop...
# Emits the private routine "_aesni_${p}rypt3": three blocks ($inout0,
# $inout1, $inout2) are carried through the rounds together. Each loop
# iteration applies two keys ($rndkey1, then $rndkey0) to all three
# blocks and advances $key by 32.
# NOTE(review): the enclosing sub header, the instructions that set the
# flags for the jnz below, and the routine's return are not visible in
# this excerpt.
146 &function_begin_B("_aesni_${p}rypt3");
147 &$movekey ($rndkey0,&QWP(0,$key));
149 &$movekey ($rndkey1,&QWP(16,$key));
150 &lea ($key,&DWP(32,$key));
151 &pxor ($inout0,$rndkey0); # xor every block with the first key
152 &pxor ($inout1,$rndkey0);
153 &pxor ($inout2,$rndkey0);
154 &jmp (&label("${p}3_loop")); # jump over any padding emitted for the aligned label
155 &set_label("${p}3_loop",16);
156 eval"&aes${p} ($inout0,$rndkey1)";
157 &$movekey ($rndkey0,&QWP(0,$key)); # next key is fetched between the aes instructions
158 eval"&aes${p} ($inout1,$rndkey1)";
160 eval"&aes${p} ($inout2,$rndkey1)";
161 &$movekey ($rndkey1,&QWP(16,$key));
162 eval"&aes${p} ($inout0,$rndkey0)";
163 &lea ($key,&DWP(32,$key));
164 eval"&aes${p} ($inout1,$rndkey0)";
165 eval"&aes${p} ($inout2,$rndkey0)";
166 &jnz (&label("${p}3_loop"));
167 eval"&aes${p} ($inout0,$rndkey1)";
168 &$movekey ($rndkey0,&QWP(0,$key)); # last key, consumed by aes${p}last
169 eval"&aes${p} ($inout1,$rndkey1)";
170 eval"&aes${p} ($inout2,$rndkey1)";
171 eval"&aes${p}last ($inout0,$rndkey0)";
172 eval"&aes${p}last ($inout1,$rndkey0)";
173 eval"&aes${p}last ($inout2,$rndkey0)";
175 &function_end_B("_aesni_${p}rypt3");
178 # 4x interleave is implemented to improve small block performance,
179 # most notably [and naturally] 4 block by ~30%. One can argue that one
180 # should have implemented 5x as well, but improvement would be <20%,
181 # so it's not worth it...
# Emits the private routine "_aesni_${p}rypt4": four blocks ($inout0 ..
# $inout3) are carried through the rounds together. Each loop iteration
# applies two keys ($rndkey1, then $rndkey0) to all four blocks and
# advances $key by 32. $inout3 is xmm7, the register also named $in1.
# The loop label is named after this routine's interleave factor so it
# cannot be confused with the 3x helper's "${p}3_loop".
# NOTE(review): the enclosing sub header, the instructions that set the
# flags for the jnz below, and the routine's return are not visible in
# this excerpt.
185 &function_begin_B("_aesni_${p}rypt4");
186 &$movekey ($rndkey0,&QWP(0,$key));
187 &$movekey ($rndkey1,&QWP(16,$key));
189 &lea ($key,&DWP(32,$key));
190 &pxor ($inout0,$rndkey0); # xor every block with the first key
191 &pxor ($inout1,$rndkey0);
192 &pxor ($inout2,$rndkey0);
193 &pxor ($inout3,$rndkey0);
194 &jmp (&label("${p}4_loop")); # jump over any padding emitted for the aligned label
195 &set_label("${p}4_loop",16);
196 eval"&aes${p} ($inout0,$rndkey1)";
197 &$movekey ($rndkey0,&QWP(0,$key)); # next key is fetched between the aes instructions
198 eval"&aes${p} ($inout1,$rndkey1)";
200 eval"&aes${p} ($inout2,$rndkey1)";
201 eval"&aes${p} ($inout3,$rndkey1)";
202 &$movekey ($rndkey1,&QWP(16,$key));
203 eval"&aes${p} ($inout0,$rndkey0)";
204 &lea ($key,&DWP(32,$key));
205 eval"&aes${p} ($inout1,$rndkey0)";
206 eval"&aes${p} ($inout2,$rndkey0)";
207 eval"&aes${p} ($inout3,$rndkey0)";
208 &jnz (&label("${p}4_loop"));
209 eval"&aes${p} ($inout0,$rndkey1)";
210 &$movekey ($rndkey0,&QWP(0,$key)); # last key, consumed by aes${p}last
211 eval"&aes${p} ($inout1,$rndkey1)";
212 eval"&aes${p} ($inout2,$rndkey1)";
213 eval"&aes${p} ($inout3,$rndkey1)";
214 eval"&aes${p}last ($inout0,$rndkey0)";
215 eval"&aes${p}last ($inout1,$rndkey0)";
216 eval"&aes${p}last ($inout2,$rndkey0)";
217 eval"&aes${p}last ($inout3,$rndkey0)";
219 &function_end_B("_aesni_${p}rypt4");
# Emit the 3x and 4x helpers. The "dec" variants are always generated; the
# "enc" variants only when $PREFIX is "aesni".
221 &aesni_generate3("enc") if ($PREFIX eq "aesni");
222 &aesni_generate3("dec");
223 &aesni_generate4("enc") if ($PREFIX eq "aesni");
224 &aesni_generate4("dec");
226 if ($PREFIX eq "aesni") {
227 # void aesni_ecb_encrypt (const void *in, void *out,
228 # size_t length, const AES_KEY *key,
# Entry of aesni_ecb_encrypt: fetch the five stack arguments, choose the
# encrypt or decrypt path from the fifth one, and keep backup copies of
# $key and $rounds because the block helpers modify both.
230 &function_begin("aesni_ecb_encrypt");
231 &mov ($inp,&wparam(0));
232 &mov ($out,&wparam(1));
233 &mov ($len,&wparam(2));
234 &mov ($key,&wparam(3));
235 &mov ($rounds,&wparam(4)); # fifth argument, parked in $rounds until the test below
# NOTE(review): the instructions that set the flags for this jb are not
# visible in this excerpt.
237 &jb (&label("ecb_ret"));
239 &test ($rounds,$rounds); # a zero fifth argument selects the decrypt path
240 &mov ($rounds,&DWP(240,$key)); # mov does not modify the flags set by the test
241 &mov ($key_,$key); # backup $key
242 &mov ($rounds_,$rounds); # backup $rounds
243 &jz (&label("ecb_decrypt"));
# NOTE(review): the instruction that sets the flags for this jbe is not
# visible in this excerpt.
246 &jbe (&label("ecb_enc_tail"));
247 &jmp (&label("ecb_enc_loop3")); # jump over any padding emitted for the aligned label
# ECB encrypt path: the main loop handles three blocks (0x30 bytes) per
# iteration; the tail dispatches to a one-, two-, three- or four-block
# finish. $key and $rounds are restored from their backups after each
# helper call in the loop.
# NOTE(review): the length arithmetic and compares that feed the ja/jz/je
# branches below are not visible in this excerpt.
249 &set_label("ecb_enc_loop3",16);
250 &movups ($inout0,&QWP(0,$inp));
251 &movups ($inout1,&QWP(0x10,$inp));
252 &movups ($inout2,&QWP(0x20,$inp));
253 &call ("_aesni_encrypt3");
255 &lea ($inp,&DWP(0x30,$inp));
256 &lea ($out,&DWP(0x30,$out)); # $out is advanced first, hence the negative store offsets
257 &movups (&QWP(-0x30,$out),$inout0);
258 &mov ($key,$key_); # restore $key
259 &movups (&QWP(-0x20,$out),$inout1);
260 &mov ($rounds,$rounds_); # restore $rounds
261 &movups (&QWP(-0x10,$out),$inout2);
262 &ja (&label("ecb_enc_loop3"));
264 &set_label("ecb_enc_tail");
266 &jz (&label("ecb_ret"));
269 &movups ($inout0,&QWP(0,$inp));
270 &je (&label("ecb_enc_one"));
272 &movups ($inout1,&QWP(0x10,$inp));
273 &je (&label("ecb_enc_two"));
275 &movups ($inout2,&QWP(0x20,$inp));
276 &je (&label("ecb_enc_three"));
277 &movups ($inout3,&QWP(0x30,$inp));
278 &call ("_aesni_encrypt4");
279 &movups (&QWP(0,$out),$inout0);
280 &movups (&QWP(0x10,$out),$inout1);
281 &movups (&QWP(0x20,$out),$inout2);
282 &movups (&QWP(0x30,$out),$inout3);
283 &jmp (&label("ecb_ret"));
285 &set_label("ecb_enc_one",16);
286 &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
287 &movups (&QWP(0,$out),$inout0);
288 &jmp (&label("ecb_ret"));
290 &set_label("ecb_enc_two",16);
291 &call ("_aesni_encrypt3"); # 3x helper reused; only the first two results are stored
292 &movups (&QWP(0,$out),$inout0);
293 &movups (&QWP(0x10,$out),$inout1);
294 &jmp (&label("ecb_ret"));
296 &set_label("ecb_enc_three",16);
297 &call ("_aesni_encrypt3");
298 &movups (&QWP(0,$out),$inout0);
299 &movups (&QWP(0x10,$out),$inout1);
300 &movups (&QWP(0x20,$out),$inout2);
301 &jmp (&label("ecb_ret"));
# ECB decrypt path: mirrors the encrypt path with the "dec" helpers. The
# main loop handles three blocks (0x30 bytes) per iteration; the tail
# dispatches to a one-, two-, three- or four-block finish.
# NOTE(review): the length arithmetic and compares that feed the
# jbe/ja/jz/je branches below are not visible in this excerpt.
303 &set_label("ecb_decrypt",16);
305 &jbe (&label("ecb_dec_tail"));
306 &jmp (&label("ecb_dec_loop3")); # jump over any padding emitted for the aligned label
308 &set_label("ecb_dec_loop3",16);
309 &movups ($inout0,&QWP(0,$inp));
310 &movups ($inout1,&QWP(0x10,$inp));
311 &movups ($inout2,&QWP(0x20,$inp));
312 &call ("_aesni_decrypt3");
314 &lea ($inp,&DWP(0x30,$inp));
315 &lea ($out,&DWP(0x30,$out)); # $out is advanced first, hence the negative store offsets
316 &movups (&QWP(-0x30,$out),$inout0);
317 &mov ($key,$key_); # restore $key
318 &movups (&QWP(-0x20,$out),$inout1);
319 &mov ($rounds,$rounds_); # restore $rounds
320 &movups (&QWP(-0x10,$out),$inout2);
321 &ja (&label("ecb_dec_loop3"));
323 &set_label("ecb_dec_tail");
325 &jz (&label("ecb_ret"));
328 &movups ($inout0,&QWP(0,$inp));
329 &je (&label("ecb_dec_one"));
331 &movups ($inout1,&QWP(0x10,$inp));
332 &je (&label("ecb_dec_two"));
334 &movups ($inout2,&QWP(0x20,$inp));
335 &je (&label("ecb_dec_three"));
336 &movups ($inout3,&QWP(0x30,$inp));
337 &call ("_aesni_decrypt4");
338 &movups (&QWP(0,$out),$inout0);
339 &movups (&QWP(0x10,$out),$inout1);
340 &movups (&QWP(0x20,$out),$inout2);
341 &movups (&QWP(0x30,$out),$inout3);
342 &jmp (&label("ecb_ret"));
344 &set_label("ecb_dec_one",16);
345 &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
346 &movups (&QWP(0,$out),$inout0);
347 &jmp (&label("ecb_ret"));
349 &set_label("ecb_dec_two",16);
350 &call ("_aesni_decrypt3"); # 3x helper reused; only the first two results are stored
351 &movups (&QWP(0,$out),$inout0);
352 &movups (&QWP(0x10,$out),$inout1);
353 &jmp (&label("ecb_ret"));
355 &set_label("ecb_dec_three",16);
356 &call ("_aesni_decrypt3");
357 &movups (&QWP(0,$out),$inout0);
358 &movups (&QWP(0x10,$out),$inout1);
359 &movups (&QWP(0x20,$out),$inout2);
# No jump is needed here: execution falls through to ecb_ret.
361 &set_label("ecb_ret");
362 &function_end("aesni_ecb_encrypt");
365 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
366 # size_t length, const AES_KEY *key,
367 # unsigned char *ivp,const int enc);
# Entry of the CBC routine: fetch the arguments, load the IV into $ivec,
# back up $key/$rounds and branch to the decrypt path when selected.
368 &function_begin("${PREFIX}_cbc_encrypt");
369 &mov ($inp,&wparam(0));
370 &mov ($out,&wparam(1));
371 &mov ($len,&wparam(2));
372 &mov ($key,&wparam(3));
374 &mov ($key_,&wparam(4)); # $key_ temporarily holds ivp
# NOTE(review): the instruction that sets the flags for this jz is not
# visible in this excerpt.
375 &jz (&label("cbc_ret"));
378 &movups ($ivec,&QWP(0,$key_)); # load IV
379 &mov ($rounds,&DWP(240,$key)); # $rounds is read from offset 240 of the key structure
380 &mov ($key_,$key); # backup $key
381 &mov ($rounds_,$rounds); # backup $rounds
# NOTE(review): the compare that feeds this je is not visible in this
# excerpt; presumably it tests the enc argument.
382 &je (&label("cbc_decrypt"));
# CBC encrypt path, one block per iteration. $inout0 carries the chaining
# value (the IV at first, then the previous output block); inside the loop
# $ivec is reused as scratch for the freshly loaded input block.
# NOTE(review): the compares / length arithmetic that feed the jb, jnc and
# jnz branches below are not visible in this excerpt.
384 &movaps ($inout0,$ivec);
386 &jb (&label("cbc_enc_tail"));
388 &jmp (&label("cbc_enc_loop")); # jump over any padding emitted for the aligned label
390 &set_label("cbc_enc_loop",16);
391 &movups ($ivec,&QWP(0,$inp)); # input block
392 &lea ($inp,&DWP(16,$inp));
393 &pxor ($inout0,$ivec); # chaining value xor input block
394 &aesni_inline_generate1("enc"); # &call ("_aesni_encrypt1");
396 &lea ($out,&DWP(16,$out));
397 &mov ($rounds,$rounds_); # restore $rounds
398 &mov ($key,$key_); # restore $key
399 &movups (&QWP(-16,$out),$inout0);
400 &jnc (&label("cbc_enc_loop"));
402 &jnz (&label("cbc_enc_tail"));
403 &movaps ($ivec,$inout0); # last output block becomes the IV written back at cbc_ret
404 &jmp (&label("cbc_ret"));
# Partial final block: copy the remaining input bytes to the output
# buffer, zero-fill after them, then run the loop once more in place.
# NOTE(review): at least one instruction between the "zero tail" mov and
# the xor is not visible in this excerpt.
406 &set_label("cbc_enc_tail");
407 &mov ("ecx",$len); # zaps $rounds
408 &data_word(0xA4F3F689); # rep movsb
409 &mov ("ecx",16); # zero tail
411 &xor ("eax","eax"); # zaps $len
412 &data_word(0xAAF3F689); # rep stosb
413 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
414 &mov ($rounds,$rounds_); # restore $rounds
415 &mov ($inp,$out); # $inp and $out are the same
416 &mov ($key,$key_); # restore $key
417 &jmp (&label("cbc_enc_loop"));
# CBC decrypt main loop, three blocks per iteration. The first two input
# blocks are saved in $in0/$in1 before the helper overwrites $inout0..2;
# the third is re-read from memory afterwards as the next chaining value.
# NOTE(review): the length arithmetic that feeds the jbe/ja branches below
# is not visible in this excerpt.
419 &set_label("cbc_decrypt",16);
421 &jbe (&label("cbc_dec_tail"));
422 &jmp (&label("cbc_dec_loop3")); # jump over any padding emitted for the aligned label
424 &set_label("cbc_dec_loop3",16);
425 &movups ($inout0,&QWP(0,$inp));
426 &movups ($inout1,&QWP(0x10,$inp));
427 &movups ($inout2,&QWP(0x20,$inp));
428 &movaps ($in0,$inout0); # keep input block 0 for chaining
429 &movaps ($in1,$inout1); # keep input block 1 for chaining
430 &call ("_aesni_decrypt3");
432 &lea ($inp,&DWP(0x30,$inp));
433 &lea ($out,&DWP(0x30,$out)); # pointers are advanced first, hence the negative offsets
434 &pxor ($inout0,$ivec);
435 &pxor ($inout1,$in0);
436 &movups ($ivec,&QWP(-0x10,$inp)); # third input block becomes the next chaining value
437 &pxor ($inout2,$in1);
438 &movups (&QWP(-0x30,$out),$inout0);
439 &mov ($rounds,$rounds_); # restore $rounds
440 &movups (&QWP(-0x20,$out),$inout1);
441 &mov ($key,$key_); # restore $key
442 &movups (&QWP(-0x10,$out),$inout2);
443 &ja (&label("cbc_dec_loop3"));
# CBC decrypt tail: handles the remaining one to four blocks. Every
# variant stores all but its last result block itself, leaves the last
# one in $inout0 and the next IV in $ivec, then joins
# cbc_dec_tail_collected.
# NOTE(review): the length arithmetic and compares that feed the jz/jbe/jnz
# branches below are not visible in this excerpt.
445 &set_label("cbc_dec_tail");
447 &jz (&label("cbc_ret"));
449 &movups ($inout0,&QWP(0,$inp));
451 &movaps ($in0,$inout0);
452 &jbe (&label("cbc_dec_one"));
453 &movups ($inout1,&QWP(0x10,$inp));
455 &movaps ($in1,$inout1);
456 &jbe (&label("cbc_dec_two"));
457 &movups ($inout2,&QWP(0x20,$inp));
459 &jbe (&label("cbc_dec_three"));
460 &movups ($inout3,&QWP(0x30,$inp)); # $inout3 is xmm7, so this overwrites $in1
461 &call ("_aesni_decrypt4");
462 &movups ($rndkey0,&QWP(0x10,$inp)); # input blocks 1 and 2 are re-read into the
463 &movups ($rndkey1,&QWP(0x20,$inp)); # key registers, which are free after the call
464 &pxor ($inout0,$ivec);
465 &pxor ($inout1,$in0);
466 &movups ($ivec,&QWP(0x30,$inp)); # fourth input block becomes the next IV
467 &movups (&QWP(0,$out),$inout0);
468 &pxor ($inout2,$rndkey0);
469 &pxor ($inout3,$rndkey1);
470 &movups (&QWP(0x10,$out),$inout1);
471 &movups (&QWP(0x20,$out),$inout2);
472 &movaps ($inout0,$inout3); # last block is handed on in $inout0
473 &lea ($out,&DWP(0x30,$out));
474 &jmp (&label("cbc_dec_tail_collected"));
476 &set_label("cbc_dec_one");
477 &aesni_inline_generate1("dec"); # &call ("_aesni_decrypt1");
478 &pxor ($inout0,$ivec);
479 &movaps ($ivec,$in0); # the input block becomes the next IV
480 &jmp (&label("cbc_dec_tail_collected"));
482 &set_label("cbc_dec_two");
483 &call ("_aesni_decrypt3"); # 3x helper reused; only two results are used
484 &pxor ($inout0,$ivec);
485 &pxor ($inout1,$in0);
486 &movups (&QWP(0,$out),$inout0);
487 &movaps ($inout0,$inout1);
488 &movaps ($ivec,$in1);
489 &lea ($out,&DWP(0x10,$out));
490 &jmp (&label("cbc_dec_tail_collected"));
492 &set_label("cbc_dec_three");
493 &call ("_aesni_decrypt3");
494 &pxor ($inout0,$ivec);
495 &pxor ($inout1,$in0);
496 &pxor ($inout2,$in1);
497 &movups (&QWP(0,$out),$inout0);
498 &movups (&QWP(0x10,$out),$inout1);
499 &movaps ($inout0,$inout2);
500 &movups ($ivec,&QWP(0x20,$inp)); # third input block becomes the next IV
501 &lea ($out,&DWP(0x20,$out));
# Falls through: store the last block whole, or take the partial path.
503 &set_label("cbc_dec_tail_collected");
505 &jnz (&label("cbc_dec_tail_partial"));
506 &movups (&QWP(0,$out),$inout0);
507 &jmp (&label("cbc_ret"));
# Partial last block: the block is spilled to the stack and copied out
# with rep movsb.
# NOTE(review): the stack adjustment and the register setup for rep movsb
# are not visible in this excerpt; movaps requires the esp slot to be
# 16-byte aligned.
509 &set_label("cbc_dec_tail_partial");
513 &movaps (&QWP(0,"esp"),$inout0);
516 &data_word(0xA4F3F689); # rep movsb
# Common exit: write the current chaining value back through ivp.
519 &set_label("cbc_ret");
520 &mov ($key_,&wparam(4)); # reload ivp ($key_ was used as the $key backup)
521 &movups (&QWP(0,$key_),$ivec); # output IV
522 &function_end("${PREFIX}_cbc_encrypt");
524 # Mechanical port from aesni-x86_64.pl.
526 # _aesni_set_encrypt_key is private interface,
528 # "eax" const unsigned char *userKey
# Entry of the private key-expansion routine: reject bad pointers, load
# the first 16 bytes of the user key and dispatch on key size to the
# 14rounds, 12rounds or (by fall-through) 10rounds section.
# NOTE(review): the tests feeding the two jz checks and the compares
# feeding the je/je/jne dispatch are not visible in this excerpt.
535 &function_begin_B("_aesni_set_encrypt_key");
537 &jz (&label("bad_pointer"));
539 &jz (&label("bad_pointer"));
541 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
542 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
543 &lea ($key,&DWP(16,$key)); # $key now points one slot ahead; round 0 is stored at -16
545 &je (&label("14rounds"));
547 &je (&label("12rounds"));
549 &jne (&label("bad_keybits"));
# Key expansion for the 10rounds case. Each aeskeygenassist/call pair
# derives the next key in xmm0. key_128 stores the current xmm0 and
# advances $key before mixing; the key_128_cold entry skips that store
# and is used for the first call, after round 0 was stored explicitly.
# NOTE(review): the instruction that sets $rounds before it is stored, and
# the returns that end this section and the key_128 helper, are not
# visible in this excerpt.
551 &set_label("10rounds",16);
553 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
554 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
555 &call (&label("key_128_cold"));
556 &aeskeygenassist("xmm1","xmm0",0x02); # round 2
557 &call (&label("key_128"));
558 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
559 &call (&label("key_128"));
560 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
561 &call (&label("key_128"));
562 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
563 &call (&label("key_128"));
564 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
565 &call (&label("key_128"));
566 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
567 &call (&label("key_128"));
568 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
569 &call (&label("key_128"));
570 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
571 &call (&label("key_128"));
572 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
573 &call (&label("key_128"));
574 &$movekey (&QWP(0,$key),"xmm0"); # store the final key
575 &mov (&DWP(80,$key),$rounds); # $rounds is stored 80 bytes past the final key
579 &set_label("key_128",16);
580 &$movekey (&QWP(0,$key),"xmm0");
581 &lea ($key,&DWP(16,$key));
582 &set_label("key_128_cold");
583 &shufps ("xmm4","xmm0",0b00010000);
584 &pxor ("xmm0","xmm4");
585 &shufps ("xmm4","xmm0",0b10001100);
586 &pxor ("xmm0","xmm4");
587 &pshufd ("xmm1","xmm1",0b11111111); # critical path
588 &pxor ("xmm0","xmm1");
# Key expansion for the 12rounds case. The extra user-key bytes are held
# in xmm2. The key_192a and key_192b helpers alternate: key_192a stores
# one 16-byte slot, key_192b assembles and stores two slots from
# xmm5/xmm0/xmm2, and both share the mixing code at key_192b_warm.
# key_192a_cold skips the store and is used for the first call.
# NOTE(review): the instruction that sets $rounds before it is stored, and
# the returns that end this section and the mixing code, are not visible
# in this excerpt.
591 &set_label("12rounds",16);
592 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
594 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
595 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
596 &call (&label("key_192a_cold"));
597 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
598 &call (&label("key_192b"));
599 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
600 &call (&label("key_192a"));
601 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
602 &call (&label("key_192b"));
603 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
604 &call (&label("key_192a"));
605 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
606 &call (&label("key_192b"));
607 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
608 &call (&label("key_192a"));
609 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
610 &call (&label("key_192b"));
611 &$movekey (&QWP(0,$key),"xmm0"); # store the final key
612 &mov (&DWP(48,$key),$rounds); # $rounds is stored 48 bytes past the final key
616 &set_label("key_192a",16);
617 &$movekey (&QWP(0,$key),"xmm0");
618 &lea ($key,&DWP(16,$key));
619 &set_label("key_192a_cold",16);
620 &movaps ("xmm5","xmm2"); # keep xmm2 for the next key_192b store
621 &set_label("key_192b_warm");
622 &shufps ("xmm4","xmm0",0b00010000);
623 &movaps ("xmm3","xmm2");
624 &pxor ("xmm0","xmm4");
625 &shufps ("xmm4","xmm0",0b10001100);
627 &pxor ("xmm0","xmm4");
628 &pshufd ("xmm1","xmm1",0b01010101); # critical path
629 &pxor ("xmm2","xmm3");
630 &pxor ("xmm0","xmm1");
631 &pshufd ("xmm3","xmm0",0b11111111);
632 &pxor ("xmm2","xmm3");
635 &set_label("key_192b",16);
636 &movaps ("xmm3","xmm0");
637 &shufps ("xmm5","xmm0",0b01000100);
638 &$movekey (&QWP(0,$key),"xmm5");
639 &shufps ("xmm3","xmm2",0b01001110);
640 &$movekey (&QWP(16,$key),"xmm3");
641 &lea ($key,&DWP(32,$key)); # two slots were written
642 &jmp (&label("key_192b_warm"));
# Key expansion for the 14rounds case. The two halves of the user key live
# in xmm0 and xmm2 and are stored as rounds 0 and 1. key_256a stores xmm2
# and derives the next xmm0; key_256b stores xmm0 and derives the next
# xmm2. key_256a_cold skips the store and is used for the first call.
# NOTE(review): the instruction that sets $rounds before it is stored, the
# returns that end this section and its helpers, and the bodies of the
# bad_pointer / bad_keybits exits are not visible in this excerpt.
644 &set_label("14rounds",16);
645 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
647 &lea ($key,&DWP(16,$key));
648 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
649 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
650 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
651 &call (&label("key_256a_cold"));
652 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
653 &call (&label("key_256b"));
654 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
655 &call (&label("key_256a"));
656 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
657 &call (&label("key_256b"));
658 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
659 &call (&label("key_256a"));
660 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
661 &call (&label("key_256b"));
662 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
663 &call (&label("key_256a"));
664 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
665 &call (&label("key_256b"));
666 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
667 &call (&label("key_256a"));
668 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
669 &call (&label("key_256b"));
670 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
671 &call (&label("key_256a"));
672 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
673 &call (&label("key_256b"));
674 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
675 &call (&label("key_256a"));
676 &$movekey (&QWP(0,$key),"xmm0"); # store the final key
677 &mov (&DWP(16,$key),$rounds); # $rounds is stored 16 bytes past the final key
681 &set_label("key_256a",16);
682 &$movekey (&QWP(0,$key),"xmm2");
683 &lea ($key,&DWP(16,$key));
684 &set_label("key_256a_cold");
685 &shufps ("xmm4","xmm0",0b00010000);
686 &pxor ("xmm0","xmm4");
687 &shufps ("xmm4","xmm0",0b10001100);
688 &pxor ("xmm0","xmm4");
689 &pshufd ("xmm1","xmm1",0b11111111); # critical path
690 &pxor ("xmm0","xmm1");
693 &set_label("key_256b",16);
694 &$movekey (&QWP(0,$key),"xmm0");
695 &lea ($key,&DWP(16,$key));
697 &shufps ("xmm4","xmm2",0b00010000);
698 &pxor ("xmm2","xmm4");
699 &shufps ("xmm4","xmm2",0b10001100);
700 &pxor ("xmm2","xmm4");
701 &pshufd ("xmm1","xmm1",0b10101010); # critical path
702 &pxor ("xmm2","xmm1");
# Error exits reached from the pointer and key-size checks at the entry.
705 &set_label("bad_pointer",4);
708 &set_label("bad_keybits",4);
711 &function_end_B("_aesni_set_encrypt_key");
713 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack arguments into the registers the
# private routine reads ("eax", $rounds, $key) and calls it.
# NOTE(review): the rest of the prototype comment and the return
# instruction are not visible in this excerpt.
715 &function_begin_B("${PREFIX}_set_encrypt_key");
716 &mov ("eax",&wparam(0)); # userKey
717 &mov ($rounds,&wparam(1)); # bits
718 &mov ($key,&wparam(2)); # third argument: destination key structure
719 &call ("_aesni_set_encrypt_key");
721 &function_end_B("${PREFIX}_set_encrypt_key");
723 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Builds the encryption schedule with the private routine, then converts
# it in place: the first and last keys are swapped unchanged, the
# remaining pairs are swapped with aesimc applied to each, working inwards
# from both ends, and the middle key has aesimc applied where it is.
# NOTE(review): the rest of the prototype comment, the test feeding the
# jnz, the compare feeding the ja, and the return instruction are not
# visible in this excerpt.
725 &function_begin_B("${PREFIX}_set_decrypt_key");
726 &mov ("eax",&wparam(0)); # userKey
727 &mov ($rounds,&wparam(1)); # bits
728 &mov ($key,&wparam(2)); # third argument: destination key structure
729 &call ("_aesni_set_encrypt_key");
730 &mov ($key,&wparam(2)); # reload: the private routine advances $key
731 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
733 &jnz (&label("dec_key_ret"));
734 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
736 &$movekey ("xmm0",&QWP(0,$key)); # just swap
737 &$movekey ("xmm1",&QWP(0,"eax"));
738 &$movekey (&QWP(0,"eax"),"xmm0");
739 &$movekey (&QWP(0,$key),"xmm1");
740 &lea ($key,&DWP(16,$key)); # move both cursors one slot inwards
741 &lea ("eax",&DWP(-16,"eax"));
743 &set_label("dec_key_inverse");
744 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
745 &$movekey ("xmm1",&QWP(0,"eax"));
746 &aesimc ("xmm0","xmm0");
747 &aesimc ("xmm1","xmm1");
748 &lea ($key,&DWP(16,$key));
749 &lea ("eax",&DWP(-16,"eax"));
751 &$movekey (&QWP(16,"eax"),"xmm0"); # cursors already moved, hence the +16 / -16 offsets
752 &$movekey (&QWP(-16,$key),"xmm1");
753 &ja (&label("dec_key_inverse"));
755 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
756 &aesimc ("xmm0","xmm0");
757 &$movekey (&QWP(0,$key),"xmm0");
759 &xor ("eax","eax"); # return success
760 &set_label("dec_key_ret");
762 &function_end_B("${PREFIX}_set_decrypt_key");
763 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");