3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# Generator configuration: exported-symbol prefix, inlining switch,
# perlasm search path and a few register aliases. Some original lines
# are not visible in this view, so only what is shown is described.
32 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
33 # generates drop-in replacement for
34 # crypto/aes/asm/aes-586.pl:-)
35 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of $0 (everything up to the last / or \)
# and add it, plus its ../../perlasm sibling, to @INC.
37 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
38 push(@INC,"${dir}","${dir}../../perlasm");
# asm_init receives the first command-line argument and the script name;
# presumably it comes from a perlasm module loaded on a line not shown.
41 &asm_init($ARGV[0],$0);
# Round-key moves use movaps under the "aesni" prefix, movups otherwise.
43 if ($PREFIX eq "aesni") { $movekey=*movaps; }
44 else { $movekey=*movups; }
51 $rounds_="ebx"; # backup copy for $rounds
52 $key_="ebp"; # backup copy for $key
# $in1 and $inout3 name the same register (xmm7); code that uses one
# cannot expect the other to keep its value.
61 $in1="xmm7"; $inout3="xmm7";
63 # Inline version of internal aesni_[en|de]crypt1
#
# Emits a looped single-block AES sequence directly at the call site.
#   $p     - "enc" or "dec"; spliced into the aes${p}/aes${p}last
#            mnemonics and into the loop label.
#   $inout - register holding the block; defaults to $inout0.
# The emitted code advances $key (via lea) as it walks the round keys.
# $sn is used as a label suffix; where it is set is not visible here.
65 sub aesni_inline_generate1
66 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
# Load round keys 0 and 1, step $key past them, then XOR round key 0
# into the block.
69 &$movekey ($rndkey0,&QWP(0,$key));
70 &$movekey ($rndkey1,&QWP(16,$key));
71 &lea ($key,&DWP(32,$key));
72 &pxor ($inout,$rndkey0);
73 &set_label("${p}1_loop_$sn");
74 eval"&aes${p} ($inout,$rndkey1)";
# NOTE(review): the instruction that sets the flags tested by the jnz
# below is not visible in this view (an original line is absent here).
76 &$movekey ($rndkey1,&QWP(0,$key));
77 &lea ($key,&DWP(16,$key));
78 &jnz (&label("${p}1_loop_$sn"));
# Final round uses the "last" form of the instruction.
79 eval"&aes${p}last ($inout,$rndkey1)";
# Emits the stand-alone function _aesni_${p}rypt1 with every round
# written out instead of looped.
#   $p     - "enc" or "dec".
#   $inout - block register; defaults to $inout0.
# Entry at label ${p}128 runs nine aes${p} rounds plus aes${p}last;
# label ${p}192 puts two more rounds in front of that, and the
# fall-through path another two. Round keys alternate between $rndkey0
# and $rndkey1 so the next key is loaded while the current one is used.
# The comparison feeding the jb/je pair is not visible in this view.
82 sub aesni_generate1 # fully unrolled loop
83 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
85 &function_begin_B("_aesni_${p}rypt1");
86 &$movekey ($rndkey0,&QWP(0,$key));
87 &$movekey ($rndkey1,&QWP(0x10,$key));
88 &pxor ($inout,$rndkey0);
89 &$movekey ($rndkey0,&QWP(0x20,$key));
90 &lea ($key,&DWP(0x30,$key));
# Each path below adjusts $key so the shared tail can address the
# remaining round keys with the same fixed offsets.
92 &jb (&label("${p}128"));
93 &lea ($key,&DWP(0x20,$key));
94 &je (&label("${p}192"));
95 &lea ($key,&DWP(0x20,$key));
96 eval"&aes${p} ($inout,$rndkey1)";
97 &$movekey ($rndkey1,&QWP(-0x40,$key));
98 eval"&aes${p} ($inout,$rndkey0)";
99 &$movekey ($rndkey0,&QWP(-0x30,$key));
100 &set_label("${p}192");
101 eval"&aes${p} ($inout,$rndkey1)";
102 &$movekey ($rndkey1,&QWP(-0x20,$key));
103 eval"&aes${p} ($inout,$rndkey0)";
104 &$movekey ($rndkey0,&QWP(-0x10,$key));
105 &set_label("${p}128");
106 eval"&aes${p} ($inout,$rndkey1)";
107 &$movekey ($rndkey1,&QWP(0,$key));
108 eval"&aes${p} ($inout,$rndkey0)";
109 &$movekey ($rndkey0,&QWP(0x10,$key));
110 eval"&aes${p} ($inout,$rndkey1)";
111 &$movekey ($rndkey1,&QWP(0x20,$key));
112 eval"&aes${p} ($inout,$rndkey0)";
113 &$movekey ($rndkey0,&QWP(0x30,$key));
114 eval"&aes${p} ($inout,$rndkey1)";
115 &$movekey ($rndkey1,&QWP(0x40,$key));
116 eval"&aes${p} ($inout,$rndkey0)";
117 &$movekey ($rndkey0,&QWP(0x50,$key));
118 eval"&aes${p} ($inout,$rndkey1)";
119 &$movekey ($rndkey1,&QWP(0x60,$key));
120 eval"&aes${p} ($inout,$rndkey0)";
121 &$movekey ($rndkey0,&QWP(0x70,$key));
122 eval"&aes${p} ($inout,$rndkey1)";
123 eval"&aes${p}last ($inout,$rndkey0)";
125 &function_end_B("_aesni_${p}rypt1");
128 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point: loads 16 bytes unaligned from inp, runs the
# one-block encrypt sequence, and stores the result unaligned to out.
# The out-of-line _aesni_encrypt1 helper is generated only when
# inlining is disabled.
129 &aesni_generate1("enc") if (!$inline);
130 &function_begin_B("${PREFIX}_encrypt");
131 &mov ("eax",&wparam(0));
132 &mov ($key,&wparam(2));
133 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
134 &mov ($rounds,&DWP(240,$key));
# eax is reused for the output pointer once the input has been loaded.
135 &mov ("eax",&wparam(1));
# NOTE(review): in this view both blocks below look unconditional; the
# lines that choose between them are not visible (presumably an
# if/else on $inline -- confirm against the full file).
137 { &aesni_inline_generate1("enc"); }
139 { &call ("_aesni_encrypt1"); }
140 &movups (&QWP(0,"eax"),$inout0);
142 &function_end_B("${PREFIX}_encrypt");
144 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
#
# Single-block entry point mirroring ${PREFIX}_encrypt: loads 16 bytes
# unaligned from inp, runs the one-block decrypt sequence, and stores
# the result unaligned to out. The out-of-line _aesni_decrypt1 helper
# is generated only when inlining is disabled.
145 &aesni_generate1("dec") if(!$inline);
146 &function_begin_B("${PREFIX}_decrypt");
147 &mov ("eax",&wparam(0));
148 &mov ($key,&wparam(2));
149 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
150 &mov ($rounds,&DWP(240,$key));
# eax is reused for the output pointer once the input has been loaded.
151 &mov ("eax",&wparam(1));
# NOTE(review): in this view both blocks below look unconditional; the
# lines that choose between them are not visible (presumably an
# if/else on $inline -- confirm against the full file).
153 { &aesni_inline_generate1("dec"); }
155 { &call ("_aesni_decrypt1"); }
156 &movups (&QWP(0,"eax"),$inout0);
158 &function_end_B("${PREFIX}_decrypt");
160 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
161 # factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
162 # latency is 6, it turned out that it can be scheduled only every
163 # *second* cycle. Thus 3x interleave is the one providing optimal
164 # utilization, i.e. when subroutine's throughput is virtually same as
165 # of non-interleaved subroutine [for number of input blocks up to 3].
166 # This is why it makes no sense to implement 2x subroutine. As soon
167 # as/if Intel improves throughput by making it possible to schedule
168 # the instructions in question *every* cycle I would have to
169 # implement 6x interleave and use it in loop...
# Body that emits _aesni_${p}rypt3 (the enclosing sub header is not
# visible in this view). Three blocks in $inout0..$inout2 are processed
# in lock-step: round key 0 is XORed into each, the loop applies two
# round keys per iteration to all three, and the tail applies one more
# round followed by aes${p}last. $key is advanced as keys are consumed.
# The loop-counter instructions feeding jnz are not shown here.
173 &function_begin_B("_aesni_${p}rypt3");
174 &$movekey ($rndkey0,&QWP(0,$key));
176 &$movekey ($rndkey1,&QWP(16,$key));
177 &lea ($key,&DWP(32,$key));
178 &pxor ($inout0,$rndkey0);
179 &pxor ($inout1,$rndkey0);
180 &pxor ($inout2,$rndkey0);
181 &$movekey ($rndkey0,&QWP(0,$key));
183 &set_label("${p}3_loop");
184 eval"&aes${p} ($inout0,$rndkey1)";
185 eval"&aes${p} ($inout1,$rndkey1)";
187 eval"&aes${p} ($inout2,$rndkey1)";
# Reload $rndkey1 with the next odd key while the $rndkey0 round runs.
188 &$movekey ($rndkey1,&QWP(16,$key));
189 eval"&aes${p} ($inout0,$rndkey0)";
190 eval"&aes${p} ($inout1,$rndkey0)";
191 &lea ($key,&DWP(32,$key));
192 eval"&aes${p} ($inout2,$rndkey0)";
193 &$movekey ($rndkey0,&QWP(0,$key));
194 &jnz (&label("${p}3_loop"));
# Tail: last regular round with $rndkey1, final round with $rndkey0.
195 eval"&aes${p} ($inout0,$rndkey1)";
196 eval"&aes${p} ($inout1,$rndkey1)";
197 eval"&aes${p} ($inout2,$rndkey1)";
198 eval"&aes${p}last ($inout0,$rndkey0)";
199 eval"&aes${p}last ($inout1,$rndkey0)";
200 eval"&aes${p}last ($inout2,$rndkey0)";
202 &function_end_B("_aesni_${p}rypt3");
205 # 4x interleave is implemented to improve small block performance,
206 # most notably [and naturally] 4 block by ~30%. One can argue that one
207 # should have implemented 5x as well, but improvement would be <20%,
208 # so it's not worth it...
# Body that emits _aesni_${p}rypt4 (the enclosing sub header is not
# visible in this view). Same structure as the 3x routine but with a
# fourth block in $inout3: whitening XOR, a loop applying two round keys
# per iteration, then one more round and aes${p}last on every block.
# The loop-counter instructions feeding jnz are not shown here.
212 &function_begin_B("_aesni_${p}rypt4");
213 &$movekey ($rndkey0,&QWP(0,$key));
214 &$movekey ($rndkey1,&QWP(16,$key));
216 &lea ($key,&DWP(32,$key));
217 &pxor ($inout0,$rndkey0);
218 &pxor ($inout1,$rndkey0);
219 &pxor ($inout2,$rndkey0);
220 &pxor ($inout3,$rndkey0);
221 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the label is spelled ${p}3_loop although this is the 4x
# routine; presumably labels are scoped per emitted function so it does
# not clash with the 3x routine -- confirm against perlasm.
223 &set_label("${p}3_loop");
224 eval"&aes${p} ($inout0,$rndkey1)";
225 eval"&aes${p} ($inout1,$rndkey1)";
227 eval"&aes${p} ($inout2,$rndkey1)";
228 eval"&aes${p} ($inout3,$rndkey1)";
229 &$movekey ($rndkey1,&QWP(16,$key));
230 eval"&aes${p} ($inout0,$rndkey0)";
231 eval"&aes${p} ($inout1,$rndkey0)";
232 &lea ($key,&DWP(32,$key));
233 eval"&aes${p} ($inout2,$rndkey0)";
234 eval"&aes${p} ($inout3,$rndkey0)";
235 &$movekey ($rndkey0,&QWP(0,$key));
236 &jnz (&label("${p}3_loop"));
# Tail: last regular round with $rndkey1, final round with $rndkey0.
238 eval"&aes${p} ($inout0,$rndkey1)";
239 eval"&aes${p} ($inout1,$rndkey1)";
240 eval"&aes${p} ($inout2,$rndkey1)";
241 eval"&aes${p} ($inout3,$rndkey1)";
242 eval"&aes${p}last ($inout0,$rndkey0)";
243 eval"&aes${p}last ($inout1,$rndkey0)";
244 eval"&aes${p}last ($inout2,$rndkey0)";
245 eval"&aes${p}last ($inout3,$rndkey0)";
247 &function_end_B("_aesni_${p}rypt4");
# Emit the interleaved helpers. The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aesni".
249 &aesni_generate3("enc") if ($PREFIX eq "aesni");
250 &aesni_generate3("dec");
251 &aesni_generate4("enc") if ($PREFIX eq "aesni");
252 &aesni_generate4("dec");
# Opens the section that is generated only for the "aesni" prefix; the
# matching closing brace is not visible in this view.
254 if ($PREFIX eq "aesni") {
255 ######################################################################
256 # void aesni_ecb_encrypt (const void *in, void *out,
257 # size_t length, const AES_KEY *key,
# Emits aesni_ecb_encrypt. Arguments 0..3 are the input pointer, output
# pointer, length and key; argument 4 selects the direction (zero takes
# the ecb_decrypt path). Both directions run a 3-blocks-per-iteration
# main loop and then a tail that handles one to four remaining blocks.
# Several compare/subtract instructions of the original are not visible
# in this view, so the exact loop bounds are not described here.
#
# Fixes relative to the previous text: the test instruction now ends
# with a semicolon (it used to be parsed together with the next line as
# one bitwise-AND expression), and the jmp after the 4-block encrypt
# tail uses the same &-call form as every other instruction.
259 &function_begin("aesni_ecb_encrypt");
260 &mov ($inp,&wparam(0));
261 &mov ($out,&wparam(1));
262 &mov ($len,&wparam(2));
263 &mov ($key,&wparam(3));
264 &mov ($rounds,&wparam(4));
266 &jb (&label("ecb_ret"));
# $rounds still holds argument 4 here; test it before the register is
# reloaded. The mov instructions that follow do not modify the flags,
# so the jz below still acts on this test.
268 &test ($rounds,$rounds);
269 &mov ($rounds,&DWP(240,$key));
270 &mov ($key_,$key); # backup $key
271 &mov ($rounds_,$rounds); # backup $rounds
272 &jz (&label("ecb_decrypt"));
275 &jbe (&label("ecb_enc_tail"));
277 &jmp (&label("ecb_enc_loop3"));
# Main encrypt loop: three blocks per iteration. $key and $rounds are
# restored from their backups after each call.
279 &set_label("ecb_enc_loop3",16);
280 &movups ($inout0,&QWP(0,$inp));
281 &movups ($inout1,&QWP(0x10,$inp));
282 &movups ($inout2,&QWP(0x20,$inp));
283 &call ("_aesni_encrypt3");
284 &lea ($inp,&DWP(0x30,$inp));
285 &movups (&QWP(0,$out),$inout0);
286 &mov ($key,$key_); # restore $key
287 &movups (&QWP(0x10,$out),$inout1);
288 &mov ($rounds,$rounds_); # restore $rounds
289 &movups (&QWP(0x20,$out),$inout2);
290 &lea ($out,&DWP(0x30,$out));
292 &ja (&label("ecb_enc_loop3"));
# Encrypt tail: load blocks one at a time and branch to the handler for
# one, two or three blocks; the fall-through handles four.
295 &set_label("ecb_enc_tail");
296 &movups ($inout0,&QWP(0,$inp));
298 &jb (&label("ecb_enc_one"));
299 &movups ($inout1,&QWP(0x10,$inp));
300 &je (&label("ecb_enc_two"));
301 &movups ($inout2,&QWP(0x20,$inp));
303 &je (&label("ecb_enc_three"));
304 &movups ($inout3,&QWP(0x30,$inp));
305 &call ("_aesni_encrypt4");
306 &movups (&QWP(0,$out),$inout0);
307 &movups (&QWP(0x10,$out),$inout1);
308 &movups (&QWP(0x20,$out),$inout2);
309 &movups (&QWP(0x30,$out),$inout3);
310 &jmp (&label("ecb_ret"));
312 &set_label("ecb_enc_one",16);
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
314 { &aesni_inline_generate1("enc"); }
316 { &call ("_aesni_encrypt1"); }
317 &movups (&QWP(0,$out),$inout0);
318 &jmp (&label("ecb_ret"));
# Two blocks: zero the third lane and reuse the 3x routine; only the
# first two results are stored.
320 &set_label("ecb_enc_two",16);
321 &pxor ($inout2,$inout2);
322 &call ("_aesni_encrypt3");
323 &movups (&QWP(0,$out),$inout0);
324 &movups (&QWP(0x10,$out),$inout1);
325 &jmp (&label("ecb_ret"));
327 &set_label("ecb_enc_three",16);
328 &call ("_aesni_encrypt3");
329 &movups (&QWP(0,$out),$inout0);
330 &movups (&QWP(0x10,$out),$inout1);
331 &movups (&QWP(0x20,$out),$inout2);
332 &jmp (&label("ecb_ret"));
333 ######################################################################
# Decrypt path: same structure as the encrypt path above, using the
# _aesni_decrypt* helpers.
334 &set_label("ecb_decrypt",16);
336 &jbe (&label("ecb_dec_tail"));
338 &jmp (&label("ecb_dec_loop3"));
340 &set_label("ecb_dec_loop3",16);
341 &movups ($inout0,&QWP(0,$inp));
342 &movups ($inout1,&QWP(0x10,$inp));
343 &movups ($inout2,&QWP(0x20,$inp));
344 &call ("_aesni_decrypt3");
345 &lea ($inp,&DWP(0x30,$inp));
346 &movups (&QWP(0,$out),$inout0);
347 &mov ($key,$key_); # restore $key
348 &movups (&QWP(0x10,$out),$inout1);
349 &mov ($rounds,$rounds_); # restore $rounds
350 &movups (&QWP(0x20,$out),$inout2);
351 &lea ($out,&DWP(0x30,$out));
353 &ja (&label("ecb_dec_loop3"));
356 &set_label("ecb_dec_tail");
357 &movups ($inout0,&QWP(0,$inp));
359 &jb (&label("ecb_dec_one"));
360 &movups ($inout1,&QWP(0x10,$inp));
361 &je (&label("ecb_dec_two"));
362 &movups ($inout2,&QWP(0x20,$inp));
364 &je (&label("ecb_dec_three"));
365 &movups ($inout3,&QWP(0x30,$inp));
366 &call ("_aesni_decrypt4");
367 &movups (&QWP(0,$out),$inout0);
368 &movups (&QWP(0x10,$out),$inout1);
369 &movups (&QWP(0x20,$out),$inout2);
370 &movups (&QWP(0x30,$out),$inout3);
371 &jmp (&label("ecb_ret"));
373 &set_label("ecb_dec_one",16);
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
375 { &aesni_inline_generate1("dec"); }
377 { &call ("_aesni_decrypt1"); }
378 &movups (&QWP(0,$out),$inout0);
379 &jmp (&label("ecb_ret"));
# Two blocks: zero the third lane and reuse the 3x routine.
381 &set_label("ecb_dec_two",16);
382 &pxor ($inout2,$inout2);
383 &call ("_aesni_decrypt3");
384 &movups (&QWP(0,$out),$inout0);
385 &movups (&QWP(0x10,$out),$inout1);
386 &jmp (&label("ecb_ret"));
# Three blocks: falls through to ecb_ret after the stores.
388 &set_label("ecb_dec_three",16);
389 &call ("_aesni_decrypt3");
390 &movups (&QWP(0,$out),$inout0);
391 &movups (&QWP(0x10,$out),$inout1);
392 &movups (&QWP(0x20,$out),$inout2);
394 &set_label("ecb_ret");
395 &function_end("aesni_ecb_encrypt");
397 ######################################################################
398 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
399 # size_t blocks, const AES_KEY *key,
400 # const char *ivec,char *cmac);
402 # Handles only complete blocks, operates on 64-bit counter and
403 # does not update *ivec! Nor does it finalize CMAC value
404 # (see engine/eng_aesni.c for details)
# Emits aesni_ccm64_encrypt_blocks. Per block it XORs the input into the
# running CMAC, encrypts the counter block and the CMAC together with
# one call to the 3x routine (third lane zeroed), XORs the encrypted
# counter into the input to form the output, and advances the counter.
# Scratch layout on the aligned stack, as used below:
#    0(esp)..15  byte-swap mask for pshufb
#   16(esp)..31  counter increment vector
#   48(esp)      value reloaded into esp on exit
# Lines that set up $key_, reserve stack space, and maintain the block
# counter are not visible in this view.
406 &function_begin("aesni_ccm64_encrypt_blocks");
407 &mov ($inp,&wparam(0));
408 &mov ($out,&wparam(1));
409 &mov ($len,&wparam(2));
410 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers.
411 &mov ($rounds_,&wparam(4));
412 &mov ($rounds,&wparam(5));
415 &and ("esp",-16); # align stack
# $key_ is saved here and restored into esp at the end, so it
# presumably holds the caller's stack pointer -- confirm.
416 &mov (&DWP(48,"esp"),$key_);
418 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
419 &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
421 # compose byte-swap control mask for pshufb on stack
422 &mov (&DWP(0,"esp"),0x0c0d0e0f);
423 &mov (&DWP(4,"esp"),0x08090a0b);
424 &mov (&DWP(8,"esp"),0x04050607);
425 &mov (&DWP(12,"esp"),0x00010203);
427 # compose counter increment vector on stack
# NOTE(review): the values of $rounds and $key_ stored below are set on
# lines not visible here.
430 &mov (&DWP(16,"esp"),$rounds);
431 &mov (&DWP(20,"esp"),$key_);
432 &mov (&DWP(24,"esp"),$key_);
433 &mov (&DWP(28,"esp"),$key_);
435 &movdqa ($inout3,&QWP(0,"esp"));
436 &pshufb ($ivec,$inout3); # keep iv in reverse order
# From here on $rounds/$rounds_ hold the round count and its backup.
438 &mov ($rounds,&DWP(240,$key));
440 &mov ($rounds_,$rounds);
441 &movdqa ($inout0,$ivec);
443 &set_label("ccm64_enc_outer");
444 &movdqu ($in0,&QWP(0,$inp));
# Swap the counter copy back before it is encrypted.
445 &pshufb ($inout0,$inout3);
447 &mov ($rounds,$rounds_);
448 &pxor ($inout1,$in0); # cmac^=inp
449 &pxor ($inout2,$inout2);
451 &call ("_aesni_encrypt3");
# Advance the byte-reversed counter with a 64-bit add.
453 &paddq ($ivec,&QWP(16,"esp"));
455 &lea ($inp,&DWP(16,$inp));
456 &pxor ($in0,$inout0); # inp^=E(ivec)
457 &movdqa ($inout0,$ivec);
458 &movdqu (&QWP(0,$out),$in0);
459 &lea ($out,&DWP(16,$out));
460 &jnz (&label("ccm64_enc_outer"));
# Restore the stack pointer and write the updated CMAC back through
# argument 5.
462 &mov ("esp",&DWP(48,"esp"));
463 &mov ($out,&wparam(5));
464 &movdqu (&QWP(0,$out),$inout1);
465 &function_end("aesni_ccm64_encrypt_blocks");
# Emits aesni_ccm64_decrypt_blocks. The first counter block is
# encrypted up front; each loop iteration then XORs the encrypted
# counter into the input to produce the output, advances the counter,
# and (unless this was the last block) encrypts the next counter and
# the CMAC together via the 3x routine. After the loop the CMAC in
# $inout1 is encrypted once more and written back through argument 5.
# Scratch layout on the aligned stack matches the encrypt routine:
# byte-swap mask at 0(esp), increment vector at 16(esp), and the value
# reloaded into esp on exit at 48(esp). Lines that set up $key_,
# reserve stack space, fold the output into the CMAC, and maintain the
# block counter are not visible in this view.
467 &function_begin("aesni_ccm64_decrypt_blocks");
468 &mov ($inp,&wparam(0));
469 &mov ($out,&wparam(1));
470 &mov ($len,&wparam(2));
471 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers.
472 &mov ($rounds_,&wparam(4));
473 &mov ($rounds,&wparam(5));
476 &and ("esp",-16); # align stack
477 &mov (&DWP(48,"esp"),$key_);
479 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
480 &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
482 # compose byte-swap control mask for pshufb on stack
483 &mov (&DWP(0,"esp"),0x0c0d0e0f);
484 &mov (&DWP(4,"esp"),0x08090a0b);
485 &mov (&DWP(8,"esp"),0x04050607);
486 &mov (&DWP(12,"esp"),0x00010203);
488 # compose counter increment vector on stack
# NOTE(review): the values of $rounds and $key_ stored below are set on
# lines not visible here.
491 &mov (&DWP(16,"esp"),$rounds);
492 &mov (&DWP(20,"esp"),$key_);
493 &mov (&DWP(24,"esp"),$key_);
494 &mov (&DWP(28,"esp"),$key_);
496 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
# Keep an unswapped copy of the counter in $inout0 for the first
# encryption below.
497 &movdqa ($inout0,$ivec);
498 &pshufb ($ivec,$inout3); # keep iv in reverse order
500 &mov ($rounds,&DWP(240,$key));
502 &mov ($rounds_,$rounds);
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
505 { &aesni_inline_generate1("enc"); }
507 { &call ("_aesni_encrypt1"); }
509 &set_label("ccm64_dec_outer");
510 &movdqu ($in0,&QWP(0,$inp));
511 &paddq ($ivec,&QWP(16,"esp"));
# NOTE(review): this lea uses QWP where the sibling encrypt routine
# uses DWP for the same address computation; presumably harmless for
# lea -- confirm for all assembler back-ends.
513 &lea ($inp,&QWP(16,$inp));
514 &pxor ($in0,$inout0);
515 &movdqa ($inout0,$ivec);
517 &mov ($rounds,$rounds_);
518 &pshufb ($inout0,$inout3);
519 &movdqu (&QWP(0,$out),$in0);
520 &lea ($out,&DWP(16,$out));
522 &jz (&label("ccm64_dec_break"));
524 &pxor ($inout2,$inout2);
525 &call ("_aesni_encrypt3");
527 &jmp (&label("ccm64_dec_outer"));
529 &set_label("ccm64_dec_break",16);
# Final CMAC encryption operates on $inout1 rather than the default
# register.
# NOTE(review): the call form passes $inout1 as an extra argument to
# &call; whether the out-of-line helper would then act on $inout1 is
# not established by anything visible here.
531 { &aesni_inline_generate1("enc",$inout1); }
533 { &call ("_aesni_encrypt1",$inout1); }
535 &mov ("esp",&DWP(48,"esp"));
536 &mov ($out,&wparam(5));
537 &movdqu (&QWP(0,$out),$inout1);
538 &function_end("aesni_ccm64_decrypt_blocks");
540 ######################################################################
541 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
542 # size_t blocks, const AES_KEY *key,
545 # Handles only complete blocks, operates on 32-bit counter and
546 # does not update *ivec! (see engine/eng_aesni.c for details)
# Emits aesni_ctr32_encrypt_blocks. The 32-bit counter is pulled out of
# the last dword of the IV, three consecutive counter values are kept
# byte-swapped in $ivec, and each main-loop iteration merges them with
# the counter-less IV to build three counter blocks, encrypts them with
# an inlined 3x sequence, and XORs the result into three input blocks.
# A tail handles the remaining one to four blocks; a shortcut handles
# the single-block case without any counter set-up.
# Scratch layout on the aligned stack, as used below:
#    0(esp)..15  byte-swap mask for pshufb
#   16(esp)..31  counter increment vector
#   32(esp)..47  counter-less IV
#   48(esp)      value reloaded into esp on exit
# Several compare/increment/subtract lines of the original are not
# visible in this view.
548 &function_begin("aesni_ctr32_encrypt_blocks");
549 &mov ($inp,&wparam(0));
550 &mov ($out,&wparam(1));
551 &mov ($len,&wparam(2));
552 &mov ($key,&wparam(3));
# $rounds_ temporarily carries the ivec pointer.
553 &mov ($rounds_,&wparam(4));
556 &and ("esp",-16); # align stack
557 &mov (&DWP(48,"esp"),$key_);
560 &je (&label("ctr32_one_shortcut"));
562 &movups ($inout3,&QWP(0,$rounds_)); # load ivec
564 # compose byte-swap control mask for pshufb on stack
565 &mov (&DWP(0,"esp"),0x0c0d0e0f);
566 &mov (&DWP(4,"esp"),0x08090a0b);
567 &mov (&DWP(8,"esp"),0x04050607);
568 &mov (&DWP(12,"esp"),0x00010203);
570 # compose counter increment vector on stack
# NOTE(review): the values of $rounds and $key_ stored below are set on
# lines not visible here.
573 &mov (&DWP(16,"esp"),$rounds);
574 &mov (&DWP(20,"esp"),$rounds);
575 &mov (&DWP(24,"esp"),$rounds);
576 &mov (&DWP(28,"esp"),$key_);
578 &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
579 &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
581 &mov ($rounds,&DWP(240,$key)); # key->rounds
582 &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
584 # $ivec is vector of 3 32-bit counters
# NOTE(review): the instructions that change $rounds_ between the
# three inserts are not visible in this view.
587 &pinsrd ($ivec,$rounds_,0);
589 &pinsrd ($ivec,$rounds_,1);
591 &pinsrd ($ivec,$rounds_,2);
592 &pshufb ($ivec,$rndkey0); # byte swap
595 &jbe (&label("ctr32_tail"));
596 &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec
597 &mov ($rounds_,$rounds);
600 &jmp (&label("ctr32_loop3"));
602 &set_label("ctr32_loop3",16);
603 &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
604 &pshufd ($inout1,$ivec,2<<6);
605 &por ($inout0,$inout3); # merge counter-less ivec
606 &pshufd ($inout2,$ivec,1<<6);
607 &por ($inout1,$inout3);
608 &por ($inout2,$inout3);
610 # inline _aesni_encrypt3 and interleave last round
613 &$movekey ($rndkey0,&QWP(0,$key));
615 &$movekey ($rndkey1,&QWP(16,$key));
616 &lea ($key,&DWP(32,$key));
617 &pxor ($inout0,$rndkey0);
618 &pxor ($inout1,$rndkey0);
619 &pxor ($inout2,$rndkey0);
620 &$movekey ($rndkey0,&QWP(0,$key));
622 &set_label("ctr32_enc_loop3");
623 &aesenc ($inout0,$rndkey1);
624 &aesenc ($inout1,$rndkey1);
626 &aesenc ($inout2,$rndkey1);
627 &$movekey ($rndkey1,&QWP(16,$key));
628 &aesenc ($inout0,$rndkey0);
629 &aesenc ($inout1,$rndkey0);
630 &lea ($key,&DWP(32,$key));
631 &aesenc ($inout2,$rndkey0);
632 &$movekey ($rndkey0,&QWP(0,$key));
633 &jnz (&label("ctr32_enc_loop3"));
635 &aesenc ($inout0,$rndkey1);
636 &aesenc ($inout1,$rndkey1);
637 &aesenc ($inout2,$rndkey1);
# $rndkey1 is free after the last regular round, so it is reused for
# the byte-swap mask while the final rounds, the counter update and the
# input loads are interleaved.
638 &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask
640 &aesenclast ($inout0,$rndkey0);
641 &pshufb ($ivec,$rndkey1); # byte swap
642 &movdqu ($in0,&QWP(0,$inp));
643 &aesenclast ($inout1,$rndkey0);
644 &paddd ($ivec,&QWP(16,"esp")); # counter increment
645 &movdqu ($in1,&QWP(0x10,$inp));
646 &aesenclast ($inout2,$rndkey0);
647 &pshufb ($ivec,$rndkey1); # byte swap
# $rndkey0 is reused as scratch for the third input block.
648 &movdqu ($rndkey0,&QWP(0x20,$inp));
649 &lea ($inp,&DWP(0x30,$inp));
651 &pxor ($in0,$inout0);
653 &pxor ($in1,$inout1);
654 &movdqu (&QWP(0,$out),$in0);
655 &pxor ($rndkey0,$inout2);
656 &movdqu (&QWP(0x10,$out),$in1);
657 &movdqu (&QWP(0x20,$out),$rndkey0);
# $in1 aliases $inout3 (both xmm7), so the counter-less IV has to be
# reloaded after $in1 was used for input data.
658 &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec
661 &lea ($out,&DWP(0x30,$out));
662 &mov ($rounds,$rounds_);
663 &ja (&label("ctr32_loop3"));
665 &pextrd ($rounds_,$ivec,1); # might need last counter value
# Tail: build counter blocks as needed and branch to the handler for
# one, two or three blocks; the fall-through handles four.
669 &set_label("ctr32_tail");
670 &pshufd ($inout0,$ivec,3<<6);
671 &pshufd ($inout1,$ivec,2<<6);
672 &por ($inout0,$inout3);
674 &jb (&label("ctr32_one"));
675 &lea ($rounds_,&DWP(1,$rounds_));
676 &pshufd ($inout2,$ivec,1<<6);
677 &por ($inout1,$inout3);
678 &je (&label("ctr32_two"));
680 &por ($inout2,$inout3);
682 &je (&label("ctr32_three"));
684 &pinsrd ($inout3,$rounds_,3); # compose last counter value
686 &call ("_aesni_encrypt4");
# $rndkey0, $rndkey1 and $ivec are reused as scratch for input blocks.
688 &movdqu ($in0,&QWP(0,$inp));
689 &movdqu ($rndkey1,&QWP(0x10,$inp));
690 &pxor ($in0,$inout0);
691 &movdqu ($rndkey0,&QWP(0x20,$inp));
692 &pxor ($rndkey1,$inout1);
693 &movdqu ($ivec,&QWP(0x30,$inp));
694 &pxor ($rndkey0,$inout2);
695 &movdqu (&QWP(0,$out),$in0);
696 &pxor ($ivec,$inout3);
697 &movdqu (&QWP(0x10,$out),$rndkey1);
698 &movdqu (&QWP(0x20,$out),$rndkey0);
699 &movdqu (&QWP(0x30,$out),$ivec);
700 &jmp (&label("ctr32_ret"));
# Single-block shortcut: the IV is used as the counter block as-is.
702 &set_label("ctr32_one_shortcut",16);
703 &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec
704 &mov ($rounds,&DWP(240,$key));
706 &set_label("ctr32_one");
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
708 { &aesni_inline_generate1("enc"); }
710 { &call ("_aesni_encrypt1"); }
711 &movdqu ($in0,&QWP(0,$inp));
712 &pxor ($in0,$inout0);
713 &movdqu (&QWP(0,$out),$in0);
714 &jmp (&label("ctr32_ret"));
# Two blocks: zero the third lane and reuse the 3x routine.
716 &set_label("ctr32_two",16);
717 &pxor ($inout2,$inout2);
718 &call ("_aesni_encrypt3");
719 &movdqu ($in0,&QWP(0,$inp));
720 &movdqu ($in1,&QWP(0x10,$inp));
721 &pxor ($in0,$inout0);
722 &pxor ($in1,$inout1);
723 &movdqu (&QWP(0,$out),$in0);
724 &movdqu (&QWP(0x10,$out),$in1);
725 &jmp (&label("ctr32_ret"));
# Three blocks: falls through to ctr32_ret after the stores.
727 &set_label("ctr32_three",16);
728 &call ("_aesni_encrypt3");
729 &movdqu ($in0,&QWP(0,$inp));
730 &movdqu ($in1,&QWP(0x10,$inp));
731 &movdqu ($rndkey1,&QWP(0x20,$inp));
732 &pxor ($in0,$inout0);
733 &pxor ($in1,$inout1);
734 &movdqu (&QWP(0,$out),$in0);
735 &pxor ($rndkey1,$inout2);
736 &movdqu (&QWP(0x10,$out),$in1);
737 &movdqu (&QWP(0x20,$out),$rndkey1);
739 &set_label("ctr32_ret");
740 &mov ("esp",&DWP(48,"esp"));
741 &function_end("aesni_ctr32_encrypt_blocks");
744 ######################################################################
745 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
746 # size_t length, const AES_KEY *key,
747 # unsigned char *ivp,const int enc);
# Emits ${PREFIX}_cbc_encrypt. Arguments 0..4 are the input pointer,
# output pointer, length, key and IV pointer. The IV is loaded into
# $ivec, chained through the data, and written back through argument 4
# at cbc_ret. Encryption runs one block at a time (each block depends
# on the previous ciphertext); decryption runs three blocks per
# iteration with a tail for the remainder. Several compare/subtract
# lines of the original are not visible in this view.
#
# Fix relative to the previous text: the "restore $rounds" mov inside
# cbc_dec_loop3 now ends with a semicolon; it used to be parsed together
# with the following movdqu as one bitwise-AND expression.
748 &function_begin("${PREFIX}_cbc_encrypt");
749 &mov ($inp,&wparam(0));
750 &mov ($out,&wparam(1));
751 &mov ($len,&wparam(2));
752 &mov ($key,&wparam(3));
# $key_ briefly carries the IV pointer before it becomes the key backup.
753 &mov ($key_,&wparam(4));
755 &jz (&label("cbc_ret"));
758 &movdqu ($ivec,&QWP(0,$key_)); # load IV
759 &mov ($rounds,&DWP(240,$key));
760 &mov ($key_,$key); # backup $key
761 &mov ($rounds_,$rounds); # backup $rounds
762 &je (&label("cbc_decrypt"));
764 &movdqa ($inout0,$ivec);
766 &jb (&label("cbc_enc_tail"));
768 &jmp (&label("cbc_enc_loop"));
# Encrypt loop: $inout0 carries the previous ciphertext (or the IV),
# the next input block is XORed into it, and the result is encrypted
# and stored.
770 &set_label("cbc_enc_loop",16);
771 &movdqu ($ivec,&QWP(0,$inp));
772 &lea ($inp,&DWP(16,$inp));
773 &pxor ($inout0,$ivec);
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
775 { &aesni_inline_generate1("enc"); }
777 { &call ("_aesni_encrypt1"); }
778 &mov ($rounds,$rounds_); # restore $rounds
779 &mov ($key,$key_); # restore $key
780 &movups (&QWP(0,$out),$inout0); # store output
781 &lea ($out,&DWP(16,$out));
783 &jnc (&label("cbc_enc_loop"));
785 &jnz (&label("cbc_enc_tail"));
# No partial block left: the last ciphertext becomes the output IV.
786 &movaps ($ivec,$inout0);
787 &jmp (&label("cbc_ret"));
# Partial final block: copy the remaining bytes to the output buffer,
# zero-fill the rest of the block, then run the loop once more in place.
789 &set_label("cbc_enc_tail");
790 &mov ("ecx",$len); # zaps $rounds
791 &data_word(0xA4F3F689); # rep movsb
792 &mov ("ecx",16); # zero tail
794 &xor ("eax","eax"); # zaps $len
795 &data_word(0xAAF3F689); # rep stosb
796 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
797 &mov ($rounds,$rounds_); # restore $rounds
798 &mov ($inp,$out); # $inp and $out are the same
799 &mov ($key,$key_); # restore $key
800 &jmp (&label("cbc_enc_loop"));
801 ######################################################################
802 &set_label("cbc_decrypt",16);
804 &jbe (&label("cbc_dec_tail"));
806 &jmp (&label("cbc_dec_loop3"));
# Decrypt loop: three blocks per iteration. Copies of the first two
# ciphertext blocks are kept in $in0/$in1 for chaining; the third is
# reloaded from memory into $ivec for the next iteration.
808 &set_label("cbc_dec_loop3",16);
809 &movups ($inout0,&QWP(0,$inp));
810 &movups ($inout1,&QWP(0x10,$inp));
811 &movups ($inout2,&QWP(0x20,$inp));
812 &movaps ($in0,$inout0);
813 &movaps ($in1,$inout1);
815 &call ("_aesni_decrypt3");
817 &pxor ($inout0,$ivec);
818 &pxor ($inout1,$in0);
819 &movdqu ($ivec,&QWP(0x20,$inp));
820 &lea ($inp,&DWP(0x30,$inp));
821 &pxor ($inout2,$in1);
822 &movdqu (&QWP(0,$out),$inout0);
823 &mov ($rounds,$rounds_); # restore $rounds
824 &movdqu (&QWP(0x10,$out),$inout1);
825 &mov ($key,$key_); # restore $key
826 &movdqu (&QWP(0x20,$out),$inout2);
827 &lea ($out,&DWP(0x30,$out));
829 &ja (&label("cbc_dec_loop3"));
# Decrypt tail: branch to the handler for one, two or three blocks; the
# fall-through handles four. Every handler leaves the last plaintext
# block in $inout0 and the next IV in $ivec for cbc_dec_tail_collected.
832 &set_label("cbc_dec_tail");
833 &movups ($inout0,&QWP(0,$inp));
834 &movaps ($in0,$inout0);
836 &jbe (&label("cbc_dec_one"));
837 &movups ($inout1,&QWP(0x10,$inp));
838 &movaps ($in1,$inout1);
840 &jbe (&label("cbc_dec_two"));
841 &movups ($inout2,&QWP(0x20,$inp));
843 &jbe (&label("cbc_dec_three"));
844 &movups ($inout3,&QWP(0x30,$inp));
845 &call ("_aesni_decrypt4");
# $in1 aliases $inout3, so ciphertext blocks 1 and 2 are reloaded from
# memory into the now-free round-key registers for chaining.
846 &movdqu ($rndkey0,&QWP(0x10,$inp));
847 &movdqu ($rndkey1,&QWP(0x20,$inp));
848 &pxor ($inout0,$ivec);
849 &pxor ($inout1,$in0);
850 &movdqu ($ivec,&QWP(0x30,$inp));
851 &movdqu (&QWP(0,$out),$inout0);
852 &pxor ($inout2,$rndkey0);
853 &pxor ($inout3,$rndkey1);
854 &movdqu (&QWP(0x10,$out),$inout1);
855 &movdqu (&QWP(0x20,$out),$inout2);
856 &movdqa ($inout0,$inout3);
857 &lea ($out,&DWP(0x30,$out));
858 &jmp (&label("cbc_dec_tail_collected"));
860 &set_label("cbc_dec_one",16);
# NOTE(review): the lines choosing between the inline and call forms
# are not visible in this view.
862 { &aesni_inline_generate1("dec"); }
864 { &call ("_aesni_decrypt1"); }
865 &pxor ($inout0,$ivec);
866 &movdqa ($ivec,$in0);
867 &jmp (&label("cbc_dec_tail_collected"));
# Two blocks: zero the third lane and reuse the 3x routine.
869 &set_label("cbc_dec_two",16);
870 &pxor ($inout2,$inout2);
871 &call ("_aesni_decrypt3");
872 &pxor ($inout0,$ivec);
873 &pxor ($inout1,$in0);
874 &movdqu (&QWP(0,$out),$inout0);
875 &movdqa ($inout0,$inout1);
876 &movdqa ($ivec,$in1);
877 &lea ($out,&DWP(0x10,$out));
878 &jmp (&label("cbc_dec_tail_collected"));
# Three blocks: falls through to cbc_dec_tail_collected.
880 &set_label("cbc_dec_three",16);
881 &call ("_aesni_decrypt3");
882 &pxor ($inout0,$ivec);
883 &pxor ($inout1,$in0);
884 &pxor ($inout2,$in1);
885 &movdqu (&QWP(0,$out),$inout0);
886 &movdqu (&QWP(0x10,$out),$inout1);
887 &movdqa ($inout0,$inout2);
888 &movdqu ($ivec,&QWP(0x20,$inp));
889 &lea ($out,&DWP(0x20,$out));
# Store the last plaintext block: directly when it is complete, or via
# a stack copy and byte-wise move when it is partial. The stack set-up
# and restore around the partial path are not visible in this view.
891 &set_label("cbc_dec_tail_collected");
893 &jnz (&label("cbc_dec_tail_partial"));
894 &movdqu (&QWP(0,$out),$inout0);
895 &jmp (&label("cbc_ret"));
897 &set_label("cbc_dec_tail_partial",16);
901 &movdqa (&QWP(0,"esp"),$inout0);
904 &data_word(0xA4F3F689); # rep movsb
907 &set_label("cbc_ret");
908 &mov ($key_,&wparam(4));
909 &movups (&QWP(0,$key_),$ivec); # output IV
910 &function_end("${PREFIX}_cbc_encrypt");
912 ######################################################################
913 # Mechanical port from aesni-x86_64.pl.
915 # _aesni_set_encrypt_key is private interface,
917 # "eax" const unsigned char *userKey
# Emits the private _aesni_set_encrypt_key routine. "eax" carries the
# user key pointer and $key the schedule being written; $rounds is
# stored into the schedule at the end of each key-size path. Dispatch
# goes to 10rounds / 12rounds / 14rounds, each a straight-line series of
# aeskeygenassist instructions followed by a call to a local helper that
# stores the previous round key and derives the next one. The pointer
# tests, key-size compares, $rounds loads, return-value set-up and ret
# instructions of the original are not visible in this view.
#
# Fixes relative to the previous text: the "round 0" store on the
# 12rounds path now ends with a semicolon (it used to be parsed together
# with the following aeskeygenassist as one bitwise-AND expression), and
# a stray trailing comma in a shufps argument list in key_128 is gone.
924 &function_begin_B("_aesni_set_encrypt_key");
926 &jz (&label("bad_pointer"));
928 &jz (&label("bad_pointer"));
930 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
931 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
932 &lea ($key,&DWP(16,$key));
934 &je (&label("14rounds"));
936 &je (&label("12rounds"));
938 &jne (&label("bad_keybits"));
940 &set_label("10rounds",16);
942 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
943 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
944 &call (&label("key_128_cold"));
945 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
946 &call (&label("key_128"));
947 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
948 &call (&label("key_128"));
949 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
950 &call (&label("key_128"));
951 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
952 &call (&label("key_128"));
953 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
954 &call (&label("key_128"));
955 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
956 &call (&label("key_128"));
957 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
958 &call (&label("key_128"));
959 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
960 &call (&label("key_128"));
961 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
962 &call (&label("key_128"));
# Store the final round key and the round count that follows it.
963 &$movekey (&QWP(0,$key),"xmm0");
964 &mov (&DWP(80,$key),$rounds);
# key_128 stores the current round key and advances $key; the _cold
# entry skips that store for the first call, where round 0 was already
# written above.
968 &set_label("key_128",16);
969 &$movekey (&QWP(0,$key),"xmm0");
970 &lea ($key,&DWP(16,$key));
971 &set_label("key_128_cold");
972 &shufps ("xmm4","xmm0",0b00010000);
973 &pxor ("xmm0","xmm4");
974 &shufps ("xmm4","xmm0",0b10001100);
975 &pxor ("xmm0","xmm4");
976 &pshufd ("xmm1","xmm1",0b11111111); # critical path
977 &pxor ("xmm0","xmm1");
980 &set_label("12rounds",16);
981 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
983 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
984 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
985 &call (&label("key_192a_cold"));
986 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
987 &call (&label("key_192b"));
988 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
989 &call (&label("key_192a"));
990 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
991 &call (&label("key_192b"));
992 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
993 &call (&label("key_192a"));
994 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
995 &call (&label("key_192b"));
996 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
997 &call (&label("key_192a"));
998 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
999 &call (&label("key_192b"));
# Store the final round key and the round count.
1000 &$movekey (&QWP(0,$key),"xmm0");
1001 &mov (&DWP(48,$key),$rounds);
# key_192a stores one round key; key_192b stores two (assembled from
# xmm5/xmm0 and xmm0/xmm2) and then rejoins the shared derivation at
# key_192b_warm.
1005 &set_label("key_192a",16);
1006 &$movekey (&QWP(0,$key),"xmm0");
1007 &lea ($key,&DWP(16,$key));
1008 &set_label("key_192a_cold",16);
1009 &movaps ("xmm5","xmm2");
1010 &set_label("key_192b_warm");
1011 &shufps ("xmm4","xmm0",0b00010000);
1012 &movaps ("xmm3","xmm2");
1013 &pxor ("xmm0","xmm4");
1014 &shufps ("xmm4","xmm0",0b10001100);
1016 &pxor ("xmm0","xmm4");
1017 &pshufd ("xmm1","xmm1",0b01010101); # critical path
1018 &pxor ("xmm2","xmm3");
1019 &pxor ("xmm0","xmm1");
1020 &pshufd ("xmm3","xmm0",0b11111111);
1021 &pxor ("xmm2","xmm3");
1024 &set_label("key_192b",16);
1025 &movaps ("xmm3","xmm0");
1026 &shufps ("xmm5","xmm0",0b01000100);
1027 &$movekey (&QWP(0,$key),"xmm5");
1028 &shufps ("xmm3","xmm2",0b01001110);
1029 &$movekey (&QWP(16,$key),"xmm3");
1030 &lea ($key,&DWP(32,$key));
1031 &jmp (&label("key_192b_warm"));
1033 &set_label("14rounds",16);
1034 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
1036 &lea ($key,&DWP(16,$key));
1037 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
1038 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
1039 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
1040 &call (&label("key_256a_cold"));
1041 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
1042 &call (&label("key_256b"));
1043 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
1044 &call (&label("key_256a"));
1045 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
1046 &call (&label("key_256b"));
1047 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
1048 &call (&label("key_256a"));
1049 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
1050 &call (&label("key_256b"));
1051 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
1052 &call (&label("key_256a"));
1053 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
1054 &call (&label("key_256b"));
1055 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
1056 &call (&label("key_256a"));
1057 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
1058 &call (&label("key_256b"));
1059 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
1060 &call (&label("key_256a"));
1061 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
1062 &call (&label("key_256b"));
1063 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
1064 &call (&label("key_256a"));
# Store the final round key and the round count.
1065 &$movekey (&QWP(0,$key),"xmm0");
1066 &mov (&DWP(16,$key),$rounds);
# key_256a stores xmm2 and derives the next xmm0; key_256b stores xmm0
# and derives the next xmm2. The two helpers alternate.
1070 &set_label("key_256a",16);
1071 &$movekey (&QWP(0,$key),"xmm2");
1072 &lea ($key,&DWP(16,$key));
1073 &set_label("key_256a_cold");
1074 &shufps ("xmm4","xmm0",0b00010000);
1075 &pxor ("xmm0","xmm4");
1076 &shufps ("xmm4","xmm0",0b10001100);
1077 &pxor ("xmm0","xmm4");
1078 &pshufd ("xmm1","xmm1",0b11111111); # critical path
1079 &pxor ("xmm0","xmm1");
1082 &set_label("key_256b",16);
1083 &$movekey (&QWP(0,$key),"xmm0");
1084 &lea ($key,&DWP(16,$key));
1086 &shufps ("xmm4","xmm2",0b00010000);
1087 &pxor ("xmm2","xmm4");
1088 &shufps ("xmm4","xmm2",0b10001100);
1089 &pxor ("xmm2","xmm4");
1090 &pshufd ("xmm1","xmm1",0b10101010); # critical path
1091 &pxor ("xmm2","xmm1");
# Error exits; the instructions that set the return value and return
# are not visible in this view.
1094 &set_label("bad_pointer",4);
1097 &set_label("bad_keybits",4);
1100 &function_end_B("_aesni_set_encrypt_key");
1102 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack arguments into the registers
# the private routine reads ("eax", $rounds, $key) and calls it. The
# return instruction is not visible in this view.
1104 &function_begin_B("${PREFIX}_set_encrypt_key");
1105 &mov ("eax",&wparam(0));
1106 &mov ($rounds,&wparam(1));
1107 &mov ($key,&wparam(2));
1108 &call ("_aesni_set_encrypt_key");
1110 &function_end_B("${PREFIX}_set_encrypt_key");
1112 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Emits ${PREFIX}_set_decrypt_key. It first builds the encryption
# schedule via _aesni_set_encrypt_key and, if that returned zero in
# eax, converts it in place: the first and last round keys are swapped
# unchanged, the remaining keys are swapped pairwise from both ends
# with aesimc applied to each, and the middle key gets aesimc in place.
# The compare feeding the loop branch and the return instruction are
# not visible in this view.
#
# Fix relative to the previous text: the shl instruction now ends with
# a semicolon; it used to be parsed together with the following test as
# one bitwise-AND expression.
1114 &function_begin_B("${PREFIX}_set_decrypt_key");
1115 &mov ("eax",&wparam(0));
1116 &mov ($rounds,&wparam(1));
1117 &mov ($key,&wparam(2));
1118 &call ("_aesni_set_encrypt_key");
# Reload $key: the call above advanced it while writing the schedule.
1119 &mov ($key,&wparam(2));
# Scale $rounds by 16 to get a byte offset into the schedule.
1120 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
1121 &test ("eax","eax");
1122 &jnz (&label("dec_key_ret"));
1123 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
1125 &$movekey ("xmm0",&QWP(0,$key)); # just swap
1126 &$movekey ("xmm1",&QWP(0,"eax"));
1127 &$movekey (&QWP(0,"eax"),"xmm0");
1128 &$movekey (&QWP(0,$key),"xmm1");
1129 &lea ($key,&DWP(16,$key));
1130 &lea ("eax",&DWP(-16,"eax"));
# $key walks forward and eax walks backward; the stores use offsets
# that compensate for the pointers having already been stepped.
1132 &set_label("dec_key_inverse");
1133 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
1134 &$movekey ("xmm1",&QWP(0,"eax"));
1135 &aesimc ("xmm0","xmm0");
1136 &aesimc ("xmm1","xmm1");
1137 &lea ($key,&DWP(16,$key));
1138 &lea ("eax",&DWP(-16,"eax"));
1139 &$movekey (&QWP(16,"eax"),"xmm0");
1140 &$movekey (&QWP(-16,$key),"xmm1");
1142 &ja (&label("dec_key_inverse"));
1144 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
1145 &aesimc ("xmm0","xmm0");
1146 &$movekey (&QWP(0,$key),"xmm0");
1148 &xor ("eax","eax"); # return success
1149 &set_label("dec_key_ret");
1151 &function_end_B("${PREFIX}_set_decrypt_key");
# Identification string passed to asciz; presumably embedded in the
# generated assembly output -- confirm against perlasm.
1152 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");