3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# --- Script configuration ---------------------------------------------
# $PREFIX is interpolated into the names of the public functions emitted
# below; $inline is tested further down to decide whether the out-of-line
# single-block routines are generated.
32 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
33 # generates drop-in replacement for
34 # crypto/aes/asm/aes-586.pl:-)
35 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of this script's path ($0) and add it, plus
# its ../../perlasm sibling, to the module search path.
37 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
38 push(@INC,"${dir}","${dir}../../perlasm");
# Initialise the assembler back end with the first command-line argument
# and the script name.
41 &asm_init($ARGV[0],$0);
# $movekey is the instruction used for the round-key loads/stores below:
# movaps for the "aesni" build, movups otherwise.
# NOTE(review): movaps requires 16-byte-aligned memory operands, so the
# "aesni" build presumably assumes an aligned key schedule - confirm.
43 if ($PREFIX eq "aesni") { $movekey=*movaps; }
44 else { $movekey=*movups; }
# Register names (the remaining assignments are not visible in this listing).
51 $rounds_="ebx"; # backup copy for $rounds
52 $key_="ebp"; # backup copy for $key
# $in1 and $inout3 both name xmm7, so the two must never be live at once.
61 $in1="xmm7"; $inout3="xmm7";
# Hand-assemblers that emit AES-NI instructions as raw bytes through
# &data_byte. Both encoder bodies act only when both operands are
# xmm0-xmm7; for any other operand pair they emit nothing.
#
# First body (its `sub` line is not visible in this listing; presumably
# the aeskeygenassist encoder called by the key-setup code): emits
# 66 0F 3A DF, a register-direct ModR/M byte built from $dst/$src, and
# the immediate $imm.
65 { my($dst,$src,$imm)=@_;
66 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
67 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
# Second body (its `sub` line is likewise not visible; the wrappers below
# call it as aescommon): emits 66 0F 38 <opcodelet> plus a register-direct
# ModR/M byte, where $opcodelet selects the instruction.
70 { my($opcodelet,$dst,$src)=@_;
71 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
72 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# One thin wrapper per instruction; only the final opcode byte differs.
74 sub aesimc { aescommon(0xdb,@_); }
75 sub aesenc { aescommon(0xdc,@_); }
76 sub aesenclast { aescommon(0xdd,@_); }
77 sub aesdec { aescommon(0xde,@_); }
78 sub aesdeclast { aescommon(0xdf,@_); }
80 # Inline version of internal aesni_[en|de]crypt1
# Emits, at the point of the call, a looped single-block AES transform.
#   $p     - "enc" or "dec"; selects aes${p}/aes${p}last and names the label
#   $inout - xmm register holding the block (defaults to $inout0)
# The emitted code walks the key schedule through $key, advancing $key as
# it goes, so callers that need $key afterwards must restore it.
# NOTE(review): $sn is interpolated into the loop label, presumably to keep
# it unique per expansion; the instruction that sets the flags tested by
# jnz is not visible in this listing (presumably a decrement of $rounds) -
# confirm both against the full file.
82 sub aesni_inline_generate1
83 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
# Load the first two round keys and apply the initial key xor.
86 &movdqu ($rndkey0,&QWP(0,$key));
87 &$movekey ($rndkey1,&QWP(16,$key));
88 &lea ($key,&DWP(32,$key));
89 &pxor ($inout,$rndkey0);
# One AES round per iteration, fetching the next round key each time.
90 &set_label("${p}1_loop_$sn");
91 eval"&aes${p} ($inout,$rndkey1)";
93 &$movekey ($rndkey1,&QWP(0,$key));
94 &lea ($key,&DWP(16,$key));
95 &jnz (&label("${p}1_loop_$sn"));
# Final round uses the "last" form of the instruction.
96 eval"&aes${p}last ($inout,$rndkey1)";
# Emits the out-of-line routine _aesni_${p}rypt1 with all rounds unrolled.
#   $p     - "enc" or "dec"
#   $inout - xmm register holding the block (defaults to $inout0)
# Round keys are loaded alternately into $rndkey0 and $rndkey1, so each
# key load sits between two AES round instructions.
# NOTE(review): the comparison feeding jb/je is not visible in this
# listing; the label names suggest a dispatch on key size - confirm.
99 sub aesni_generate1 # fully unrolled loop
100 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
102 &function_begin_B("_aesni_${p}rypt1");
# Round key 0 is xored into the block; keys 1 and 2 are preloaded.
103 &movdqu ($rndkey0,&QWP(0,$key));
104 &$movekey ($rndkey1,&QWP(0x10,$key));
105 &pxor ($inout,$rndkey0);
106 &$movekey ($rndkey0,&QWP(0x20,$key));
# $key is advanced by a different amount on each path (0x30, 0x50 or
# 0x70 in total) so that the shared tail below can address its round
# keys with the same offsets whichever entry point was taken.
107 &lea ($key,&DWP(0x30,$key));
109 &jb (&label("${p}128"));
110 &lea ($key,&DWP(0x20,$key));
111 &je (&label("${p}192"));
112 &lea ($key,&DWP(0x20,$key));
# Two extra rounds executed only on the fall-through (longest) path.
113 eval"&aes${p} ($inout,$rndkey1)";
114 &$movekey ($rndkey1,&QWP(-0x40,$key));
115 eval"&aes${p} ($inout,$rndkey0)";
116 &$movekey ($rndkey0,&QWP(-0x30,$key));
# Two extra rounds shared by the fall-through path and the ${p}192 entry.
117 &set_label("${p}192");
118 eval"&aes${p} ($inout,$rndkey1)";
119 &$movekey ($rndkey1,&QWP(-0x20,$key));
120 eval"&aes${p} ($inout,$rndkey0)";
121 &$movekey ($rndkey0,&QWP(-0x10,$key));
# Common tail: nine regular rounds followed by the final round.
122 &set_label("${p}128");
123 eval"&aes${p} ($inout,$rndkey1)";
124 &$movekey ($rndkey1,&QWP(0,$key));
125 eval"&aes${p} ($inout,$rndkey0)";
126 &$movekey ($rndkey0,&QWP(0x10,$key));
127 eval"&aes${p} ($inout,$rndkey1)";
128 &$movekey ($rndkey1,&QWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey0)";
130 &$movekey ($rndkey0,&QWP(0x30,$key));
131 eval"&aes${p} ($inout,$rndkey1)";
132 &$movekey ($rndkey1,&QWP(0x40,$key));
133 eval"&aes${p} ($inout,$rndkey0)";
134 &$movekey ($rndkey0,&QWP(0x50,$key));
135 eval"&aes${p} ($inout,$rndkey1)";
136 &$movekey ($rndkey1,&QWP(0x60,$key));
137 eval"&aes${p} ($inout,$rndkey0)";
138 &$movekey ($rndkey0,&QWP(0x70,$key));
139 eval"&aes${p} ($inout,$rndkey1)";
140 eval"&aes${p}last ($inout,$rndkey0)";
142 &function_end_B("_aesni_${p}rypt1");
145 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Public single-block encrypt: loads 16 bytes from inp, transforms them
# with the key schedule at key (round count read from offset 240) and
# stores the 16-byte result to out.
# The out-of-line helper is only generated when inlining is disabled.
146 &aesni_generate1("enc") if (!$inline);
147 &function_begin_B("${PREFIX}_encrypt");
# eax first holds inp, then is reused for out once the block is loaded.
148 &mov ("eax",&wparam(0));
149 &mov ($key,&wparam(2));
150 &movdqu ($inout0,&QWP(0,"eax"));
151 &mov ($rounds,&DWP(240,$key));
152 &mov ("eax",&wparam(1));
# NOTE(review): the two brace blocks below look like alternatives; the
# `if ($inline)` / `else` lines that would select between them are not
# visible in this listing - confirm against the full file.
154 { &aesni_inline_generate1("enc"); }
156 { &call ("_aesni_encrypt1"); }
157 &movups (&QWP(0,"eax"),$inout0);
159 &function_end_B("${PREFIX}_encrypt");
161 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Public single-block decrypt: loads 16 bytes from inp, transforms them
# with the key schedule at key (round count read from offset 240) and
# stores the 16-byte result to out.
# The out-of-line helper is only generated when inlining is disabled.
162 &aesni_generate1("dec") if(!$inline);
163 &function_begin_B("${PREFIX}_decrypt");
# eax first holds inp, then is reused for out once the block is loaded.
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movdqu ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
# NOTE(review): the two brace blocks below look like alternatives; the
# `if ($inline)` / `else` lines that would select between them are not
# visible in this listing - confirm against the full file.
170 { &aesni_inline_generate1("dec"); }
172 { &call ("_aesni_decrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
175 &function_end_B("${PREFIX}_decrypt");
177 # _aesni_[en|de]crypt[34] are private interfaces, N denotes interleave
178 # factor. Why is the 3x subroutine used in loops? Even though aes[enc|dec]
179 # latency is 6, it turned out that it can be scheduled only every
180 # *second* cycle. Thus 3x interleave is the one providing optimal
181 # utilization, i.e. when the subroutine's throughput is virtually the same
182 # as that of the non-interleaved subroutine [for up to 3 input blocks].
183 # This is why it makes no sense to implement a 2x subroutine. As soon
184 # as/if Intel improves throughput by making it possible to schedule
185 # the instructions in question *every* cycle I would have to
186 # implement 6x interleave and use it in the loop...
# Body of the generator for _aesni_${p}rypt3, which transforms the three
# blocks in $inout0..$inout2 with one pass over the key schedule at $key.
# $p is "enc" or "dec" (the enclosing `sub` line is not visible in this
# listing). The emitted routine advances $key, so callers restore it.
# NOTE(review): the instructions that set the flags tested by jnz are not
# visible here (presumably a halving and a decrement of $rounds, given
# that each iteration performs two rounds) - confirm.
190 &function_begin_B("_aesni_${p}rypt3");
# Initial key xor on all three blocks; the next two keys are preloaded.
191 &$movekey ($rndkey0,&QWP(0,$key));
193 &$movekey ($rndkey1,&QWP(16,$key));
194 &lea ($key,&DWP(32,$key));
195 &pxor ($inout0,$rndkey0);
196 &pxor ($inout1,$rndkey0);
197 &pxor ($inout2,$rndkey0);
198 &$movekey ($rndkey0,&QWP(0,$key));
# Two rounds per iteration, alternating $rndkey1/$rndkey0, with the key
# loads and the pointer update placed between the AES instructions.
200 &set_label("${p}3_loop");
201 eval"&aes${p} ($inout0,$rndkey1)";
202 eval"&aes${p} ($inout1,$rndkey1)";
204 eval"&aes${p} ($inout2,$rndkey1)";
205 &$movekey ($rndkey1,&QWP(16,$key));
206 eval"&aes${p} ($inout0,$rndkey0)";
207 eval"&aes${p} ($inout1,$rndkey0)";
208 &lea ($key,&DWP(32,$key));
209 eval"&aes${p} ($inout2,$rndkey0)";
210 &$movekey ($rndkey0,&QWP(0,$key));
211 &jnz (&label("${p}3_loop"));
# One more regular round, then the final round, for each block.
212 eval"&aes${p} ($inout0,$rndkey1)";
213 eval"&aes${p} ($inout1,$rndkey1)";
214 eval"&aes${p} ($inout2,$rndkey1)";
215 eval"&aes${p}last ($inout0,$rndkey0)";
216 eval"&aes${p}last ($inout1,$rndkey0)";
217 eval"&aes${p}last ($inout2,$rndkey0)";
219 &function_end_B("_aesni_${p}rypt3");
222 # 4x interleave is implemented to improve small block performance,
223 # most notably [and naturally] for 4 blocks, by ~30%. One can argue that
224 # one should have implemented 5x as well, but improvement would be <20%,
225 # so it's not worth it...
# Body of the generator for _aesni_${p}rypt4, which transforms the four
# blocks in $inout0..$inout3 with one pass over the key schedule at $key.
# $p is "enc" or "dec" (the enclosing `sub` line is not visible in this
# listing). The emitted routine advances $key, so callers restore it.
# The loop label is named after this routine's own interleave factor so it
# cannot be confused, or collide, with the 3x routine's "${p}3_loop".
# NOTE(review): the instructions that set the flags tested by jnz are not
# visible here (presumably a halving and a decrement of $rounds, given
# that each iteration performs two rounds) - confirm.
229 &function_begin_B("_aesni_${p}rypt4");
# Initial key xor on all four blocks; the next two keys are preloaded.
230 &$movekey ($rndkey0,&QWP(0,$key));
231 &$movekey ($rndkey1,&QWP(16,$key));
233 &lea ($key,&DWP(32,$key));
234 &pxor ($inout0,$rndkey0);
235 &pxor ($inout1,$rndkey0);
236 &pxor ($inout2,$rndkey0);
237 &pxor ($inout3,$rndkey0);
238 &$movekey ($rndkey0,&QWP(0,$key));
# Two rounds per iteration, alternating $rndkey1/$rndkey0, with the key
# loads and the pointer update placed between the AES instructions.
240 &set_label("${p}4_loop");
241 eval"&aes${p} ($inout0,$rndkey1)";
242 eval"&aes${p} ($inout1,$rndkey1)";
244 eval"&aes${p} ($inout2,$rndkey1)";
245 eval"&aes${p} ($inout3,$rndkey1)";
246 &$movekey ($rndkey1,&QWP(16,$key));
247 eval"&aes${p} ($inout0,$rndkey0)";
248 eval"&aes${p} ($inout1,$rndkey0)";
249 &lea ($key,&DWP(32,$key));
250 eval"&aes${p} ($inout2,$rndkey0)";
251 eval"&aes${p} ($inout3,$rndkey0)";
252 &$movekey ($rndkey0,&QWP(0,$key));
253 &jnz (&label("${p}4_loop"));
# One more regular round, then the final round, for each block.
255 eval"&aes${p} ($inout0,$rndkey1)";
256 eval"&aes${p} ($inout1,$rndkey1)";
257 eval"&aes${p} ($inout2,$rndkey1)";
258 eval"&aes${p} ($inout3,$rndkey1)";
259 eval"&aes${p}last ($inout0,$rndkey0)";
260 eval"&aes${p}last ($inout1,$rndkey0)";
261 eval"&aes${p}last ($inout2,$rndkey0)";
262 eval"&aes${p}last ($inout3,$rndkey0)";
264 &function_end_B("_aesni_${p}rypt4");
# Instantiate the interleaved helpers. The decrypt variants are always
# emitted; the encrypt variants only when $PREFIX is "aesni".
266 &aesni_generate3("enc") if ($PREFIX eq "aesni");
267 &aesni_generate3("dec");
268 &aesni_generate4("enc") if ($PREFIX eq "aesni");
269 &aesni_generate4("dec");
271 if ($PREFIX eq "aesni") {
272 ######################################################################
273 # void aesni_ecb_encrypt (const void *in, void *out,
274 # size_t length, const AES_KEY *key,
# aesni_ecb_encrypt: ECB-mode bulk routine. Parameters 0..3 are in, out,
# length and key; parameter 4 is tested for zero to choose the decrypt
# path. Input is consumed three blocks at a time through
# _aesni_[en|de]crypt3, and the remaining one to four blocks are handled
# by dedicated tails. $key and $rounds are restored from $key_/$rounds_
# after each helper call inside the main loops.
# NOTE(review): several compare/subtract instructions that set the flags
# for the conditional jumps below are not visible in this listing.
276 &function_begin("aesni_ecb_encrypt");
277 &mov ($inp,&wparam(0));
278 &mov ($out,&wparam(1));
279 &mov ($len,&wparam(2));
280 &mov ($key,&wparam(3));
281 &mov ($rounds,&wparam(4));
283 &jb (&label("ecb_ret"));
# Test the direction flag now; the plain moves that follow leave the
# flags intact, so the jz below still sees this result.
285 &test ($rounds,$rounds);
286 &mov ($rounds,&DWP(240,$key));
287 &mov ($key_,$key); # backup $key
288 &mov ($rounds_,$rounds); # backup $rounds
289 &jz (&label("ecb_decrypt"));
292 &jbe (&label("ecb_enc_tail"));
# Jump over the alignment padding in front of the loop head.
294 &jmp (&label("ecb_enc_loop3"));
# Main encrypt loop: three blocks per iteration.
296 &set_label("ecb_enc_loop3",16);
297 &movups ($inout0,&QWP(0,$inp));
298 &movups ($inout1,&QWP(0x10,$inp));
299 &movups ($inout2,&QWP(0x20,$inp));
300 &call ("_aesni_encrypt3");
301 &lea ($inp,&DWP(0x30,$inp));
302 &movups (&QWP(0,$out),$inout0);
303 &mov ($key,$key_); # restore $key
304 &movups (&QWP(0x10,$out),$inout1);
305 &mov ($rounds,$rounds_); # restore $rounds
306 &movups (&QWP(0x20,$out),$inout2);
307 &lea ($out,&DWP(0x30,$out));
309 &ja (&label("ecb_enc_loop3"));
# Encrypt tail: dispatch on the number of blocks left, loading only the
# blocks that exist.
312 &set_label("ecb_enc_tail");
313 &movups ($inout0,&QWP(0,$inp));
315 &jb (&label("ecb_enc_one"));
316 &movups ($inout1,&QWP(0x10,$inp));
317 &je (&label("ecb_enc_two"));
318 &movups ($inout2,&QWP(0x20,$inp));
320 &je (&label("ecb_enc_three"));
321 &movups ($inout3,&QWP(0x30,$inp));
322 &call ("_aesni_encrypt4");
323 &movups (&QWP(0,$out),$inout0);
324 &movups (&QWP(0x10,$out),$inout1);
325 &movups (&QWP(0x20,$out),$inout2);
326 &movups (&QWP(0x30,$out),$inout3);
327 &jmp (&label("ecb_ret"));
329 &set_label("ecb_enc_one",16);
# NOTE(review): the two brace blocks below look like alternatives whose
# `if ($inline)` / `else` selector lines are not visible in this listing.
331 { &aesni_inline_generate1("enc"); }
333 { &call ("_aesni_encrypt1"); }
334 &movups (&QWP(0,$out),$inout0);
335 &jmp (&label("ecb_ret"));
# Two blocks: reuse the 3x helper with a zeroed third lane whose result
# is simply not stored.
337 &set_label("ecb_enc_two",16);
338 &pxor ($inout2,$inout2);
339 &call ("_aesni_encrypt3");
340 &movups (&QWP(0,$out),$inout0);
341 &movups (&QWP(0x10,$out),$inout1);
342 &jmp (&label("ecb_ret"));
344 &set_label("ecb_enc_three",16);
345 &call ("_aesni_encrypt3");
346 &movups (&QWP(0,$out),$inout0);
347 &movups (&QWP(0x10,$out),$inout1);
348 &movups (&QWP(0x20,$out),$inout2);
349 &jmp (&label("ecb_ret"));
350 ######################################################################
# Decrypt side: same structure as the encrypt side above.
351 &set_label("ecb_decrypt",16);
353 &jbe (&label("ecb_dec_tail"));
# Jump over the alignment padding in front of the loop head.
355 &jmp (&label("ecb_dec_loop3"));
357 &set_label("ecb_dec_loop3",16);
358 &movups ($inout0,&QWP(0,$inp));
359 &movups ($inout1,&QWP(0x10,$inp));
360 &movups ($inout2,&QWP(0x20,$inp));
361 &call ("_aesni_decrypt3");
362 &lea ($inp,&DWP(0x30,$inp));
363 &movups (&QWP(0,$out),$inout0);
364 &mov ($key,$key_); # restore $key
365 &movups (&QWP(0x10,$out),$inout1);
366 &mov ($rounds,$rounds_); # restore $rounds
367 &movups (&QWP(0x20,$out),$inout2);
368 &lea ($out,&DWP(0x30,$out));
370 &ja (&label("ecb_dec_loop3"));
373 &set_label("ecb_dec_tail");
374 &movups ($inout0,&QWP(0,$inp));
376 &jb (&label("ecb_dec_one"));
377 &movups ($inout1,&QWP(0x10,$inp));
378 &je (&label("ecb_dec_two"));
379 &movups ($inout2,&QWP(0x20,$inp));
381 &je (&label("ecb_dec_three"));
382 &movups ($inout3,&QWP(0x30,$inp));
383 &call ("_aesni_decrypt4");
384 &movups (&QWP(0,$out),$inout0);
385 &movups (&QWP(0x10,$out),$inout1);
386 &movups (&QWP(0x20,$out),$inout2);
387 &movups (&QWP(0x30,$out),$inout3);
388 &jmp (&label("ecb_ret"));
390 &set_label("ecb_dec_one",16);
# NOTE(review): alternatives, as in ecb_enc_one; selector lines not visible.
392 { &aesni_inline_generate1("dec"); }
394 { &call ("_aesni_decrypt1"); }
395 &movups (&QWP(0,$out),$inout0);
396 &jmp (&label("ecb_ret"));
# Two blocks: 3x helper with a zeroed, discarded third lane.
398 &set_label("ecb_dec_two",16);
399 &pxor ($inout2,$inout2);
400 &call ("_aesni_decrypt3");
401 &movups (&QWP(0,$out),$inout0);
402 &movups (&QWP(0x10,$out),$inout1);
403 &jmp (&label("ecb_ret"));
# Last tail case falls through into the common exit.
405 &set_label("ecb_dec_three",16);
406 &call ("_aesni_decrypt3");
407 &movups (&QWP(0,$out),$inout0);
408 &movups (&QWP(0x10,$out),$inout1);
409 &movups (&QWP(0x20,$out),$inout2);
411 &set_label("ecb_ret");
412 &function_end("aesni_ecb_encrypt");
414 ######################################################################
415 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
416 # size_t blocks, const AES_KEY *key,
417 # const char *ivec,char *cmac);
419 # Handles only complete blocks, operates on 64-bit counter and
420 # does not update *ivec! Nor does it finalize CMAC value
421 # (see engine/eng_aesni.c for details)
# aesni_ccm64_encrypt_blocks: per block, xors the input into the running
# CMAC lane, encrypts the counter block and the CMAC lane together via
# _aesni_encrypt3 (third lane zeroed), writes input xor E(counter) to out,
# and finally stores the CMAC lane through parameter 5.
# Scratch layout on the aligned stack, as used below:
#   [esp+0..15]  byte-swap control mask for pshufb
#   [esp+16..31] counter increment vector
#   [esp+48]     value reloaded into esp on exit
# NOTE(review): the instructions that reserve the stack frame, load the
# constants stored at [esp+16..31], and set the flags for the loop's jnz
# are not visible in this listing - confirm against the full file.
423 &function_begin("aesni_ccm64_encrypt_blocks");
424 &mov ($inp,&wparam(0));
425 &mov ($out,&wparam(1));
426 &mov ($len,&wparam(2));
427 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers.
428 &mov ($rounds_,&wparam(4));
429 &mov ($rounds,&wparam(5));
432 &and ("esp",-16); # align stack
# presumably $key_ holds the caller's esp here (its load is not visible).
433 &mov (&DWP(48,"esp"),$key_);
435 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
436 &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
438 # compose byte-swap control mask for pshufb on stack
439 &mov (&DWP(0,"esp"),0x0c0d0e0f);
440 &mov (&DWP(4,"esp"),0x08090a0b);
441 &mov (&DWP(8,"esp"),0x04050607);
442 &mov (&DWP(12,"esp"),0x00010203);
444 # compose counter increment vector on stack
447 &mov (&DWP(16,"esp"),$rounds);
448 &mov (&DWP(20,"esp"),$key_);
449 &mov (&DWP(24,"esp"),$key_);
450 &mov (&DWP(28,"esp"),$key_);
452 &movdqa ($inout3,&QWP(0,"esp"));
453 &pshufb ($ivec,$inout3); # keep iv in reverse order
455 &mov ($rounds,&DWP(240,$key));
# $key_ / $rounds_ keep pristine copies for reuse on every iteration
# (the matching backup of $key is not visible in this listing).
457 &mov ($rounds_,$rounds);
458 &movdqa ($inout0,$ivec);
# Per-block loop.
460 &set_label("ccm64_enc_outer");
461 &movdqu ($in0,&QWP(0,$inp));
# Swap the counter copy back to its stored byte order before encrypting.
462 &pshufb ($inout0,$inout3);
464 &mov ($rounds,$rounds_);
465 &pxor ($inout1,$in0); # cmac^=inp
# Third lane of the 3x helper is unused; zero it.
466 &pxor ($inout2,$inout2);
468 &call ("_aesni_encrypt3");
# Advance the byte-reversed counter for the next block.
470 &paddq ($ivec,&QWP(16,"esp"));
472 &lea ($inp,&DWP(16,$inp));
473 &pxor ($in0,$inout0); # inp^=E(ivec)
474 &movdqa ($inout0,$ivec);
475 &movdqu (&QWP(0,$out),$in0);
476 &lea ($out,&DWP(16,$out));
477 &jnz (&label("ccm64_enc_outer"));
# Restore esp, then write the updated CMAC lane back through parameter 5.
479 &mov ("esp",&DWP(48,"esp"));
480 &mov ($out,&wparam(5));
481 &movdqu (&QWP(0,$out),$inout1);
482 &function_end("aesni_ccm64_encrypt_blocks");
# aesni_ccm64_decrypt_blocks: the first counter block is encrypted up
# front; each loop iteration then xors the input with the already
# encrypted counter, stores the result to out and, unless it was the last
# block, runs the next counter block and the CMAC lane through
# _aesni_encrypt3 together. After the loop the CMAC lane gets its final
# single-block pass and is stored through parameter 5.
# Scratch layout on the aligned stack, as used below:
#   [esp+0..15]  byte-swap control mask for pshufb
#   [esp+16..31] counter increment vector
#   [esp+48]     value reloaded into esp on exit
# NOTE(review): the instructions that reserve the stack frame, load the
# constants stored at [esp+16..31], fold the output into the CMAC lane,
# restore $key, and set the flags for jz are not visible in this listing.
484 &function_begin("aesni_ccm64_decrypt_blocks");
485 &mov ($inp,&wparam(0));
486 &mov ($out,&wparam(1));
487 &mov ($len,&wparam(2));
488 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily carry the ivec and cmac pointers.
489 &mov ($rounds_,&wparam(4));
490 &mov ($rounds,&wparam(5));
493 &and ("esp",-16); # align stack
# presumably $key_ holds the caller's esp here (its load is not visible).
494 &mov (&DWP(48,"esp"),$key_);
496 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
497 &movdqu ($inout1,&QWP(0,$rounds)); # load cmac
499 # compose byte-swap control mask for pshufb on stack
500 &mov (&DWP(0,"esp"),0x0c0d0e0f);
501 &mov (&DWP(4,"esp"),0x08090a0b);
502 &mov (&DWP(8,"esp"),0x04050607);
503 &mov (&DWP(12,"esp"),0x00010203);
505 # compose counter increment vector on stack
508 &mov (&DWP(16,"esp"),$rounds);
509 &mov (&DWP(20,"esp"),$key_);
510 &mov (&DWP(24,"esp"),$key_);
511 &mov (&DWP(28,"esp"),$key_);
513 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
# Keep an unswapped copy of the counter block for the first encryption.
514 &movdqa ($inout0,$ivec);
515 &pshufb ($ivec,$inout3); # keep iv in reverse order
517 &mov ($rounds,&DWP(240,$key));
519 &mov ($rounds_,$rounds);
# Encrypt the first counter block.
# NOTE(review): the two brace blocks below look like alternatives whose
# `if ($inline)` / `else` selector lines are not visible in this listing.
522 { &aesni_inline_generate1("enc"); }
524 { &call ("_aesni_encrypt1"); }
# Per-block loop; $inout0 holds E(counter) on entry to each iteration.
526 &set_label("ccm64_dec_outer");
527 &movdqu ($in0,&QWP(0,$inp));
528 &paddq ($ivec,&QWP(16,"esp"));
# lea only forms an address, so the operand-size tag is irrelevant; DWP is
# used here to match every other lea in this file.
530 &lea ($inp,&DWP(16,$inp));
531 &pxor ($in0,$inout0);
532 &movdqa ($inout0,$ivec);
534 &mov ($rounds,$rounds_);
# Swap the next counter copy back to its stored byte order.
535 &pshufb ($inout0,$inout3);
536 &movdqu (&QWP(0,$out),$in0);
537 &lea ($out,&DWP(16,$out));
539 &jz (&label("ccm64_dec_break"));
# Third lane of the 3x helper is unused; zero it.
541 &pxor ($inout2,$inout2);
542 &call ("_aesni_encrypt3");
544 &jmp (&label("ccm64_dec_outer"));
# Last block done: one final single-block pass over the CMAC lane.
546 &set_label("ccm64_dec_break",16);
# NOTE(review): alternatives, as above. The call variant passes $inout1
# as an extra argument to &call, whereas the out-of-line helper is
# generated for $inout0 by default - verify this path before ever
# building with $inline disabled.
548 { &aesni_inline_generate1("enc",$inout1); }
550 { &call ("_aesni_encrypt1",$inout1); }
# Restore esp, then write the updated CMAC lane back through parameter 5.
552 &mov ("esp",&DWP(48,"esp"));
553 &mov ($out,&wparam(5));
554 &movdqu (&QWP(0,$out),$inout1);
555 &function_end("aesni_ccm64_decrypt_blocks");
557 ######################################################################
558 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
559 # size_t blocks, const AES_KEY *key,
562 # Handles only complete blocks, operates on 32-bit counter and
563 # does not update *ivec! (see engine/eng_aesni.c for details)
# aesni_ctr32_encrypt_blocks: CTR mode with a 32-bit counter held in the
# top dword of the IV. The main loop encrypts three counter blocks per
# iteration with an inlined copy of the 3x routine and xors them into the
# input; one to four leftover blocks are handled by the tails.
# Scratch layout on the aligned stack, as used below:
#   [esp+0..15]  byte-swap control mask for pshufb
#   [esp+16..31] counter increment vector
#   [esp+32..47] IV with its counter dword wiped
#   [esp+48]     value reloaded into esp on exit
# NOTE(review): the instructions that reserve the stack frame, load the
# constants stored into the increment vector, step $rounds_ between the
# pinsrd's, and set the flags for the conditional jumps are not visible
# in this listing - confirm against the full file.
565 &function_begin("aesni_ctr32_encrypt_blocks");
566 &mov ($inp,&wparam(0));
567 &mov ($out,&wparam(1));
568 &mov ($len,&wparam(2));
569 &mov ($key,&wparam(3));
# $rounds_ temporarily carries the ivec pointer.
570 &mov ($rounds_,&wparam(4));
573 &and ("esp",-16); # align stack
# presumably $key_ holds the caller's esp here (its load is not visible).
574 &mov (&DWP(48,"esp"),$key_);
577 &je (&label("ctr32_one_shortcut"));
579 &movups ($inout3,&QWP(0,$rounds_)); # load ivec
581 # compose byte-swap control mask for pshufb on stack
582 &mov (&DWP(0,"esp"),0x0c0d0e0f);
583 &mov (&DWP(4,"esp"),0x08090a0b);
584 &mov (&DWP(8,"esp"),0x04050607);
585 &mov (&DWP(12,"esp"),0x00010203);
587 # compose counter increment vector on stack
590 &mov (&DWP(16,"esp"),$rounds);
591 &mov (&DWP(20,"esp"),$rounds);
592 &mov (&DWP(24,"esp"),$rounds);
593 &mov (&DWP(28,"esp"),$key_);
# Split the IV: the counter dword goes to $rounds_, and $inout3 keeps the
# rest with dword 3 overwritten by $key_ (presumably zero at this point).
595 &pextrd ($rounds_,$inout3,3); # pull 32-bit counter
596 &pinsrd ($inout3,$key_,3); # wipe 32-bit counter
598 &mov ($rounds,&DWP(240,$key)); # key->rounds
599 &movdqa ($rndkey0,&QWP(0,"esp")); # load byte-swap mask
601 # $ivec is vector of 3 32-bit counters
604 &pinsrd ($ivec,$rounds_,0);
606 &pinsrd ($ivec,$rounds_,1);
608 &pinsrd ($ivec,$rounds_,2);
609 &pshufb ($ivec,$rndkey0); # byte swap
612 &jbe (&label("ctr32_tail"));
613 &movdqa (&QWP(32,"esp"),$inout3); # save counter-less ivec
# From here on $rounds_ backs up the round count.
614 &mov ($rounds_,$rounds);
# Jump over the alignment padding in front of the loop head.
617 &jmp (&label("ctr32_loop3"));
# Main loop: build three counter blocks by moving one counter each into
# the top dword and or-ing in the counter-less IV.
619 &set_label("ctr32_loop3",16);
620 &pshufd ($inout0,$ivec,3<<6); # place counter to upper dword
621 &pshufd ($inout1,$ivec,2<<6);
622 &por ($inout0,$inout3); # merge counter-less ivec
623 &pshufd ($inout2,$ivec,1<<6);
624 &por ($inout1,$inout3);
625 &por ($inout2,$inout3);
627 # inline _aesni_encrypt3 and interleave last round
630 &$movekey ($rndkey0,&QWP(0,$key));
632 &$movekey ($rndkey1,&QWP(16,$key));
633 &lea ($key,&DWP(32,$key));
634 &pxor ($inout0,$rndkey0);
635 &pxor ($inout1,$rndkey0);
636 &pxor ($inout2,$rndkey0);
637 &$movekey ($rndkey0,&QWP(0,$key));
# Two rounds per iteration, alternating $rndkey1/$rndkey0.
639 &set_label("ctr32_enc_loop3");
640 &aesenc ($inout0,$rndkey1);
641 &aesenc ($inout1,$rndkey1);
643 &aesenc ($inout2,$rndkey1);
644 &$movekey ($rndkey1,&QWP(16,$key));
645 &aesenc ($inout0,$rndkey0);
646 &aesenc ($inout1,$rndkey0);
647 &lea ($key,&DWP(32,$key));
648 &aesenc ($inout2,$rndkey0);
649 &$movekey ($rndkey0,&QWP(0,$key));
650 &jnz (&label("ctr32_enc_loop3"));
652 &aesenc ($inout0,$rndkey1);
653 &aesenc ($inout1,$rndkey1);
654 &aesenc ($inout2,$rndkey1);
# $rndkey1 is free after the round above and is reused for the swap mask.
655 &movdqa ($rndkey1,&QWP(0,"esp")); # load byte-swap mask
# Final round interleaved with the input loads and the counter update
# (swap, add the increment vector, swap back).
657 &aesenclast ($inout0,$rndkey0);
658 &pshufb ($ivec,$rndkey1); # byte swap
659 &movdqu ($in0,&QWP(0,$inp));
660 &aesenclast ($inout1,$rndkey0);
661 &paddd ($ivec,&QWP(16,"esp")); # counter increment
662 &movdqu ($in1,&QWP(0x10,$inp));
663 &aesenclast ($inout2,$rndkey0);
664 &pshufb ($ivec,$rndkey1); # byte swap
# $rndkey0 is free after the last round and carries the third input block.
665 &movdqu ($rndkey0,&QWP(0x20,$inp));
666 &lea ($inp,&DWP(0x30,$inp));
668 &pxor ($in0,$inout0);
670 &pxor ($in1,$inout1);
671 &movdqu (&QWP(0,$out),$in0);
672 &pxor ($rndkey0,$inout2);
673 &movdqu (&QWP(0x10,$out),$in1);
674 &movdqu (&QWP(0x20,$out),$rndkey0);
# $in1 shares xmm7 with $inout3, so the counter-less IV must be reloaded.
675 &movdqa ($inout3,&QWP(32,"esp")); # load counter-less ivec
678 &lea ($out,&DWP(0x30,$out));
679 &mov ($rounds,$rounds_);
680 &ja (&label("ctr32_loop3"));
682 &pextrd ($rounds_,$ivec,1); # might need last counter value
# Tail: build only as many counter blocks as there are blocks left.
686 &set_label("ctr32_tail");
687 &pshufd ($inout0,$ivec,3<<6);
688 &pshufd ($inout1,$ivec,2<<6);
689 &por ($inout0,$inout3);
691 &jb (&label("ctr32_one"));
692 &lea ($rounds_,&DWP(1,$rounds_));
693 &pshufd ($inout2,$ivec,1<<6);
694 &por ($inout1,$inout3);
695 &je (&label("ctr32_two"));
697 &por ($inout2,$inout3);
699 &je (&label("ctr32_three"));
# Four blocks: the fourth counter block is composed directly in $inout3.
701 &pinsrd ($inout3,$rounds_,3); # compose last counter value
703 &call ("_aesni_encrypt4");
# Key and IV registers are free now and double as input-block temporaries.
705 &movdqu ($in0,&QWP(0,$inp));
706 &movdqu ($rndkey1,&QWP(0x10,$inp));
707 &pxor ($in0,$inout0);
708 &movdqu ($rndkey0,&QWP(0x20,$inp));
709 &pxor ($rndkey1,$inout1);
710 &movdqu ($ivec,&QWP(0x30,$inp));
711 &pxor ($rndkey0,$inout2);
712 &movdqu (&QWP(0,$out),$in0);
713 &pxor ($ivec,$inout3);
714 &movdqu (&QWP(0x10,$out),$rndkey1);
715 &movdqu (&QWP(0x20,$out),$rndkey0);
716 &movdqu (&QWP(0x30,$out),$ivec);
717 &jmp (&label("ctr32_ret"));
# Single-block fast path: the IV is used as the counter block as is.
719 &set_label("ctr32_one_shortcut",16);
720 &movdqu ($inout0,&QWP(0,$rounds_)); # load ivec
721 &mov ($rounds,&DWP(240,$key));
723 &set_label("ctr32_one");
# NOTE(review): the two brace blocks below look like alternatives whose
# `if ($inline)` / `else` selector lines are not visible in this listing.
725 { &aesni_inline_generate1("enc"); }
727 { &call ("_aesni_encrypt1"); }
728 &movdqu ($in0,&QWP(0,$inp));
729 &pxor ($in0,$inout0);
730 &movdqu (&QWP(0,$out),$in0);
731 &jmp (&label("ctr32_ret"));
# Two blocks: 3x helper with a zeroed, discarded third lane.
733 &set_label("ctr32_two",16);
734 &pxor ($inout2,$inout2);
735 &call ("_aesni_encrypt3");
736 &movdqu ($in0,&QWP(0,$inp));
737 &movdqu ($in1,&QWP(0x10,$inp));
738 &pxor ($in0,$inout0);
739 &pxor ($in1,$inout1);
740 &movdqu (&QWP(0,$out),$in0);
741 &movdqu (&QWP(0x10,$out),$in1);
742 &jmp (&label("ctr32_ret"));
# Three blocks; falls through into the common exit.
744 &set_label("ctr32_three",16);
745 &call ("_aesni_encrypt3");
746 &movdqu ($in0,&QWP(0,$inp));
747 &movdqu ($in1,&QWP(0x10,$inp));
748 &movdqu ($rndkey1,&QWP(0x20,$inp));
749 &pxor ($in0,$inout0);
750 &pxor ($in1,$inout1);
751 &movdqu (&QWP(0,$out),$in0);
752 &pxor ($rndkey1,$inout2);
753 &movdqu (&QWP(0x10,$out),$in1);
754 &movdqu (&QWP(0x20,$out),$rndkey1);
# Common exit: restore esp from the slot saved on entry.
756 &set_label("ctr32_ret");
757 &mov ("esp",&DWP(48,"esp"));
758 &function_end("aesni_ctr32_encrypt_blocks");
761 ######################################################################
762 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
763 # size_t length, const AES_KEY *key,
764 # unsigned char *ivp,const int enc);
# ${PREFIX}_cbc_encrypt: CBC-mode bulk routine. Parameters 0..4 are inp,
# out, length, key and ivp. Encryption is inherently serial and handles
# one block per iteration; decryption handles three blocks per iteration
# through _aesni_decrypt3, with tails for the remaining one to four
# blocks. On exit the current chaining value in $ivec is written back
# through ivp.
# NOTE(review): the compare/subtract instructions that set the flags for
# the conditional jumps, the test of the enc argument, and parts of the
# partial-tail code are not visible in this listing.
765 &function_begin("${PREFIX}_cbc_encrypt");
766 &mov ($inp,&wparam(0));
767 &mov ($out,&wparam(1));
768 &mov ($len,&wparam(2));
769 &mov ($key,&wparam(3));
# $key_ temporarily carries the ivp pointer until the IV has been loaded.
770 &mov ($key_,&wparam(4));
772 &jz (&label("cbc_ret"));
775 &movdqu ($ivec,&QWP(0,$key_)); # load IV
776 &mov ($rounds,&DWP(240,$key));
777 &mov ($key_,$key); # backup $key
778 &mov ($rounds_,$rounds); # backup $rounds
779 &je (&label("cbc_decrypt"));
# Encrypt: $inout0 carries the chaining value from block to block.
781 &movdqa ($inout0,$ivec);
783 &jb (&label("cbc_enc_tail"));
# Jump over the alignment padding in front of the loop head.
785 &jmp (&label("cbc_enc_loop"));
787 &set_label("cbc_enc_loop",16);
788 &movdqu ($ivec,&QWP(0,$inp));
789 &lea ($inp,&DWP(16,$inp));
790 &pxor ($inout0,$ivec);
# NOTE(review): the two brace blocks below look like alternatives whose
# `if ($inline)` / `else` selector lines are not visible in this listing.
792 { &aesni_inline_generate1("enc"); }
794 { &call ("_aesni_encrypt1"); }
795 &mov ($rounds,$rounds_); # restore $rounds
796 &mov ($key,$key_); # restore $key
797 &movups (&QWP(0,$out),$inout0); # store output
798 &lea ($out,&DWP(16,$out));
800 &jnc (&label("cbc_enc_loop"));
802 &jnz (&label("cbc_enc_tail"));
# Whole number of blocks: the last ciphertext block is the new IV.
803 &movaps ($ivec,$inout0);
804 &jmp (&label("cbc_ret"));
# Partial final block: copy the leftover bytes to the output buffer,
# zero-fill, then run the loop once more in place over that block.
806 &set_label("cbc_enc_tail");
807 &mov ("ecx",$len); # zaps $rounds
808 &data_word(0xA4F3F689); # rep movsb
809 &mov ("ecx",16); # zero tail
811 &xor ("eax","eax"); # zaps $len
812 &data_word(0xAAF3F689); # rep stosb
813 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
814 &mov ($rounds,$rounds_); # restore $rounds
815 &mov ($inp,$out); # $inp and $out are the same
816 &mov ($key,$key_); # restore $key
817 &jmp (&label("cbc_enc_loop"));
818 ######################################################################
819 &set_label("cbc_decrypt",16);
821 &jbe (&label("cbc_dec_tail"));
# Jump over the alignment padding in front of the loop head.
823 &jmp (&label("cbc_dec_loop3"));
# Main decrypt loop: three blocks per iteration. Copies of the first two
# ciphertext blocks are kept in $in0/$in1 because they are the xor masks
# for the second and third plaintext blocks.
825 &set_label("cbc_dec_loop3",16);
826 &movups ($inout0,&QWP(0,$inp));
827 &movups ($inout1,&QWP(0x10,$inp));
828 &movups ($inout2,&QWP(0x20,$inp));
829 &movaps ($in0,$inout0);
830 &movaps ($in1,$inout1);
832 &call ("_aesni_decrypt3");
834 &pxor ($inout0,$ivec);
835 &pxor ($inout1,$in0);
# The third ciphertext block is re-read as the next chaining value.
836 &movdqu ($ivec,&QWP(0x20,$inp));
837 &lea ($inp,&DWP(0x30,$inp));
838 &pxor ($inout2,$in1);
839 &movdqu (&QWP(0,$out),$inout0);
840 &mov ($rounds,$rounds_); # restore $rounds
841 &movdqu (&QWP(0x10,$out),$inout1);
842 &mov ($key,$key_); # restore $key
843 &movdqu (&QWP(0x20,$out),$inout2);
844 &lea ($out,&DWP(0x30,$out));
846 &ja (&label("cbc_dec_loop3"));
# Decrypt tail: dispatch on the number of blocks left. Every path leaves
# the last plaintext block in $inout0 and the new IV in $ivec for
# cbc_dec_tail_collected.
849 &set_label("cbc_dec_tail");
850 &movups ($inout0,&QWP(0,$inp));
851 &movaps ($in0,$inout0);
853 &jbe (&label("cbc_dec_one"));
854 &movups ($inout1,&QWP(0x10,$inp));
855 &movaps ($in1,$inout1);
857 &jbe (&label("cbc_dec_two"));
858 &movups ($inout2,&QWP(0x20,$inp));
860 &jbe (&label("cbc_dec_three"));
861 &movups ($inout3,&QWP(0x30,$inp));
862 &call ("_aesni_decrypt4");
# $in1 shares xmm7 with $inout3, so ciphertext blocks 1 and 2 are re-read
# from memory into the now-free key registers.
863 &movdqu ($rndkey0,&QWP(0x10,$inp));
864 &movdqu ($rndkey1,&QWP(0x20,$inp));
865 &pxor ($inout0,$ivec);
866 &pxor ($inout1,$in0);
867 &movdqu ($ivec,&QWP(0x30,$inp));
868 &movdqu (&QWP(0,$out),$inout0);
869 &pxor ($inout2,$rndkey0);
870 &pxor ($inout3,$rndkey1);
871 &movdqu (&QWP(0x10,$out),$inout1);
872 &movdqu (&QWP(0x20,$out),$inout2);
873 &movdqa ($inout0,$inout3);
874 &lea ($out,&DWP(0x30,$out));
875 &jmp (&label("cbc_dec_tail_collected"));
877 &set_label("cbc_dec_one",16);
# NOTE(review): alternatives, as in cbc_enc_loop; selector lines not visible.
879 { &aesni_inline_generate1("dec"); }
881 { &call ("_aesni_decrypt1"); }
882 &pxor ($inout0,$ivec);
883 &movdqa ($ivec,$in0);
884 &jmp (&label("cbc_dec_tail_collected"));
# Two blocks: 3x helper with a zeroed, discarded third lane.
886 &set_label("cbc_dec_two",16);
887 &pxor ($inout2,$inout2);
888 &call ("_aesni_decrypt3");
889 &pxor ($inout0,$ivec);
890 &pxor ($inout1,$in0);
891 &movdqu (&QWP(0,$out),$inout0);
892 &movdqa ($inout0,$inout1);
893 &movdqa ($ivec,$in1);
894 &lea ($out,&DWP(0x10,$out));
895 &jmp (&label("cbc_dec_tail_collected"));
# Three blocks; falls through into cbc_dec_tail_collected.
897 &set_label("cbc_dec_three",16);
898 &call ("_aesni_decrypt3");
899 &pxor ($inout0,$ivec);
900 &pxor ($inout1,$in0);
901 &pxor ($inout2,$in1);
902 &movdqu (&QWP(0,$out),$inout0);
903 &movdqu (&QWP(0x10,$out),$inout1);
904 &movdqa ($inout0,$inout2);
905 &movdqu ($ivec,&QWP(0x20,$inp));
906 &lea ($out,&DWP(0x20,$out));
# Store the last block whole, or go through the stack for a partial one.
908 &set_label("cbc_dec_tail_collected");
910 &jnz (&label("cbc_dec_tail_partial"));
911 &movdqu (&QWP(0,$out),$inout0);
912 &jmp (&label("cbc_ret"));
# Partial last block: spill it to the stack and byte-copy from there
# (the surrounding stack setup is not visible in this listing).
914 &set_label("cbc_dec_tail_partial",16);
918 &movdqa (&QWP(0,"esp"),$inout0);
921 &data_word(0xA4F3F689); # rep movsb
# Common exit: write the chaining value back through ivp.
924 &set_label("cbc_ret");
925 &mov ($key_,&wparam(4));
926 &movups (&QWP(0,$key_),$ivec); # output IV
927 &function_end("${PREFIX}_cbc_encrypt");
929 ######################################################################
930 # Mechanical port from aesni-x86_64.pl.
932 # _aesni_set_encrypt_key is private interface,
934 # "eax" const unsigned char *userKey
# _aesni_set_encrypt_key: private key-expansion routine. "eax" points at
# the user key and $key at the schedule being written; control is
# dispatched to the 10rounds, 12rounds or 14rounds section. Each section
# alternates aeskeygenassist with a call to a small local helper that
# stores the previous round key and derives the next one, then writes
# $rounds just past the end of the schedule.
# NOTE(review): the pointer tests, the key-size compares, the loads of
# $rounds and the return sequences (including the error codes for
# bad_pointer/bad_keybits) are not visible in this listing.
941 &function_begin_B("_aesni_set_encrypt_key");
943 &jz (&label("bad_pointer"));
945 &jz (&label("bad_pointer"));
947 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
948 &pxor ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
# $key is kept one slot ahead, so round 0 is stored at offset -16.
949 &lea ($key,&DWP(16,$key));
951 &je (&label("14rounds"));
953 &je (&label("12rounds"));
955 &jne (&label("bad_keybits"));
# ---- 10rounds section: one helper call per round key ----
957 &set_label("10rounds",16);
959 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
960 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
961 &call (&label("key_128_cold"));
962 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
963 &call (&label("key_128"));
964 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
965 &call (&label("key_128"));
966 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
967 &call (&label("key_128"));
968 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
969 &call (&label("key_128"));
970 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
971 &call (&label("key_128"));
972 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
973 &call (&label("key_128"));
974 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
975 &call (&label("key_128"));
976 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
977 &call (&label("key_128"));
978 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
979 &call (&label("key_128"));
980 &$movekey (&QWP(0,$key),"xmm0");
981 &mov (&DWP(80,$key),$rounds);
# Helper: key_128 stores the previous round key and advances $key; the
# _cold entry skips the store for the first call, whose key is already
# written. xmm4 is used to fold shifted copies of xmm0 into xmm0 before
# the aeskeygenassist result (broadcast from its top dword) is mixed in.
985 &set_label("key_128",16);
986 &$movekey (&QWP(0,$key),"xmm0");
987 &lea ($key,&DWP(16,$key));
988 &set_label("key_128_cold");
989 &shufps ("xmm4","xmm0",0b00010000);
990 &pxor ("xmm0","xmm4");
991 &shufps ("xmm4","xmm0",0b10001100);
992 &pxor ("xmm0","xmm4");
993 &pshufd ("xmm1","xmm1",0b11111111); # critical path
994 &pxor ("xmm0","xmm1");
# ---- 12rounds section: each helper call produces 1.5 round keys ----
997 &set_label("12rounds",16);
998 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
1000 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1001 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
1002 &call (&label("key_192a_cold"));
1003 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
1004 &call (&label("key_192b"));
1005 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
1006 &call (&label("key_192a"));
1007 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
1008 &call (&label("key_192b"));
1009 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
1010 &call (&label("key_192a"));
1011 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
1012 &call (&label("key_192b"));
1013 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
1014 &call (&label("key_192a"));
1015 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
1016 &call (&label("key_192b"));
1017 &$movekey (&QWP(0,$key),"xmm0");
1018 &mov (&DWP(48,$key),$rounds);
# Helper: key_192a stores one full round key first; key_192a_cold skips
# that store; both keep a copy of xmm2 in xmm5 for key_192b to combine.
1022 &set_label("key_192a",16);
1023 &$movekey (&QWP(0,$key),"xmm0");
1024 &lea ($key,&DWP(16,$key));
1025 &set_label("key_192a_cold",16);
1026 &movaps ("xmm5","xmm2");
# Shared derivation step, also entered from key_192b.
1027 &set_label("key_192b_warm");
1028 &shufps ("xmm4","xmm0",0b00010000);
1029 &movaps ("xmm3","xmm2");
1030 &pxor ("xmm0","xmm4");
1031 &shufps ("xmm4","xmm0",0b10001100);
1033 &pxor ("xmm0","xmm4");
1034 &pshufd ("xmm1","xmm1",0b01010101); # critical path
1035 &pxor ("xmm2","xmm3");
1036 &pxor ("xmm0","xmm1");
1037 &pshufd ("xmm3","xmm0",0b11111111);
1038 &pxor ("xmm2","xmm3");
# Helper: key_192b assembles two round keys from xmm5/xmm0/xmm2, stores
# both, advances $key by 32 and joins the shared derivation step.
1041 &set_label("key_192b",16);
1042 &movaps ("xmm3","xmm0");
1043 &shufps ("xmm5","xmm0",0b01000100);
1044 &$movekey (&QWP(0,$key),"xmm5");
1045 &shufps ("xmm3","xmm2",0b01001110);
1046 &$movekey (&QWP(16,$key),"xmm3");
1047 &lea ($key,&DWP(32,$key));
1048 &jmp (&label("key_192b_warm"));
# ---- 14rounds section: helpers a/b alternately update xmm0 and xmm2 ----
1050 &set_label("14rounds",16);
1051 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
1053 &lea ($key,&DWP(16,$key));
1054 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
1055 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
1056 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
1057 &call (&label("key_256a_cold"));
1058 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
1059 &call (&label("key_256b"));
1060 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
1061 &call (&label("key_256a"));
1062 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
1063 &call (&label("key_256b"));
1064 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
1065 &call (&label("key_256a"));
1066 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
1067 &call (&label("key_256b"));
1068 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
1069 &call (&label("key_256a"));
1070 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
1071 &call (&label("key_256b"));
1072 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
1073 &call (&label("key_256a"));
1074 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
1075 &call (&label("key_256b"));
1076 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
1077 &call (&label("key_256a"));
1078 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
1079 &call (&label("key_256b"));
1080 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
1081 &call (&label("key_256a"));
1082 &$movekey (&QWP(0,$key),"xmm0");
1083 &mov (&DWP(16,$key),$rounds);
# Helper: key_256a stores xmm2 (skipped at the _cold entry) and derives
# the next xmm0, using the top dword of the aeskeygenassist result.
1087 &set_label("key_256a",16);
1088 &$movekey (&QWP(0,$key),"xmm2");
1089 &lea ($key,&DWP(16,$key));
1090 &set_label("key_256a_cold");
1091 &shufps ("xmm4","xmm0",0b00010000);
1092 &pxor ("xmm0","xmm4");
1093 &shufps ("xmm4","xmm0",0b10001100);
1094 &pxor ("xmm0","xmm4");
1095 &pshufd ("xmm1","xmm1",0b11111111); # critical path
1096 &pxor ("xmm0","xmm1");
# Helper: key_256b stores xmm0 and derives the next xmm2, using dword 2
# of the aeskeygenassist result.
1099 &set_label("key_256b",16);
1100 &$movekey (&QWP(0,$key),"xmm0");
1101 &lea ($key,&DWP(16,$key));
1103 &shufps ("xmm4","xmm2",0b00010000);
1104 &pxor ("xmm2","xmm4");
1105 &shufps ("xmm4","xmm2",0b10001100);
1106 &pxor ("xmm2","xmm4");
1107 &pshufd ("xmm1","xmm1",0b10101010); # critical path
1108 &pxor ("xmm2","xmm1");
# Error exits (their bodies are not visible in this listing).
1111 &set_label("bad_pointer",4);
1114 &set_label("bad_keybits",4);
1117 &function_end_B("_aesni_set_encrypt_key");
1119 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack parameters (userKey, bits, key
# schedule pointer) into "eax", $rounds and $key, then calls the private
# _aesni_set_encrypt_key routine.
# NOTE(review): the return instruction is not visible in this listing.
1121 &function_begin_B("${PREFIX}_set_encrypt_key");
1122 &mov ("eax",&wparam(0));
1123 &mov ($rounds,&wparam(1));
1124 &mov ($key,&wparam(2));
1125 &call ("_aesni_set_encrypt_key");
1127 &function_end_B("${PREFIX}_set_encrypt_key");
1129 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Public wrapper that builds a decryption key schedule: it runs the
# encryption key setup, then reverses the order of the round keys in
# place, applying aesimc to every key except the first and last. A
# non-zero result in "eax" from the key setup skips the conversion and is
# left in "eax"; otherwise "eax" is cleared before the exit label.
# NOTE(review): the compare that drives the loop's `ja` and the return
# instruction are not visible in this listing.
1131 &function_begin_B("${PREFIX}_set_decrypt_key");
1132 &mov ("eax",&wparam(0));
1133 &mov ($rounds,&wparam(1));
1134 &mov ($key,&wparam(2));
1135 &call ("_aesni_set_encrypt_key");
# The key setup advanced $key, so reload the schedule pointer.
1136 &mov ($key,&wparam(2));
1137 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
1138 &test ("eax","eax");
1139 &jnz (&label("dec_key_ret"));
1140 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
# Outermost pair: exchanged without transformation.
1142 &$movekey ("xmm0",&QWP(0,$key)); # just swap
1143 &$movekey ("xmm1",&QWP(0,"eax"));
1144 &$movekey (&QWP(0,"eax"),"xmm0");
1145 &$movekey (&QWP(0,$key),"xmm1");
1146 &lea ($key,&DWP(16,$key));
1147 &lea ("eax",&DWP(-16,"eax"));
# Inner pairs: walk both pointers towards the middle, transforming and
# exchanging one pair per iteration. The stores use +16/-16 because both
# pointers have already been stepped.
1149 &set_label("dec_key_inverse");
1150 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
1151 &$movekey ("xmm1",&QWP(0,"eax"));
1152 &aesimc ("xmm0","xmm0");
1153 &aesimc ("xmm1","xmm1");
1154 &lea ($key,&DWP(16,$key));
1155 &lea ("eax",&DWP(-16,"eax"));
1156 &$movekey (&QWP(16,"eax"),"xmm0");
1157 &$movekey (&QWP(-16,$key),"xmm1");
1159 &ja (&label("dec_key_inverse"));
# The middle key has no partner and is transformed in place.
1161 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
1162 &aesimc ("xmm0","xmm0");
1163 &$movekey (&QWP(0,$key),"xmm0");
1165 &xor ("eax","eax"); # return success
1166 &set_label("dec_key_ret");
1168 &function_end_B("${PREFIX}_set_decrypt_key");
1169 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");