3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# Generator configuration, perlasm bootstrap and register aliases.
# NOTE(review): several variables used further down ($key, $rounds, $inp,
# $out, $len, $inout0..2, $rndkey0/1, $cmac) are not assigned in the lines
# visible here; confirm their definitions in the full file.
46 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of the script path ($dir keeps its trailing
# separator) and add it, plus ../../perlasm relative to it, to @INC.
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52 push(@INC,"${dir}","${dir}../../perlasm");
# The first command-line argument and the script name are handed to asm_init.
55 &asm_init($ARGV[0],$0);
# Both branches select movups, so as written $movekey does not depend on
# $PREFIX.
57 if ($PREFIX eq "aesni") { $movekey=\&movups; }
58 else { $movekey=\&movups; }
# General-purpose registers used as spare copies of $rounds and $key.
65 $rounds_="ebx"; # backup copy for $rounds
66 $key_="ebp"; # backup copy for $key
# xmm5..xmm7 each have two names: the same register is reachable as an
# $inoutN block register or as $in1/$in0/$ivec, so a code path can only use
# one role per register at a time.
73 $inout3="xmm5"; $in1="xmm5";
74 $inout4="xmm6"; $in0="xmm6";
75 $inout5="xmm7"; $ivec="xmm7";
# Raw-byte encoders for AES-NI instructions.
# NOTE(review): the `sub` header lines of the first two bodies are not
# visible in this excerpt. The first body emits 66 0F 3A DF /r ib, which is
# presumably AESKEYGENASSIST; the second matches the aescommon() helper the
# wrappers below call. Confirm against the full file.
#
# ($dst,$src,$imm): when both operands are xmm0..xmm7, emit the opcode bytes
# followed by a register-direct ModRM byte (0xc0 | dst<<3 | src) and the
# immediate. Nothing is emitted for any other operand form.
79 { my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
# ($opcodelet,$dst,$src): same scheme for the 66 0F 38 xx family; the last
# opcode byte is supplied by the caller. Only xmm0..xmm7 register-to-register
# forms are handled.
84 { my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# One wrapper per instruction; each forwards its ($dst,$src) arguments with
# the matching final opcode byte.
88 sub aesimc { aescommon(0xdb,@_); }
89 sub aesenc { aescommon(0xdc,@_); }
90 sub aesenclast { aescommon(0xdd,@_); }
91 sub aesdec { aescommon(0xde,@_); }
92 sub aesdeclast { aescommon(0xdf,@_); }
94 # Inline version of internal aesni_[en|de]crypt1
# Emits a single-block round loop in place.
#   $p     - "enc" or "dec"; selects aes${p} / aes${p}last and the label name
#   $inout - register holding the block (defaults to $inout0)
#   $ivec  - optional extra register: when given, round key 0 is xored into
#            $ivec and $ivec is then xored into $inout, instead of xoring
#            round key 0 into $inout directly
# The emitted code advances $key as it consumes round keys.
# NOTE(review): $sn (label suffix) and the instruction that sets the flags
# tested by jnz are not visible in this excerpt.
96 sub aesni_inline_generate1
97 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
# Load round keys 0 and 1, apply the initial whitening, step past both keys.
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
# One round per iteration; the next round key is fetched after each round.
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
111 &jnz (&label("${p}1_loop_$sn"));
# Final round uses the key fetched by the last loop iteration.
112 eval"&aes${p}last ($inout,$rndkey1)";
# Emits the out-of-line function _aesni_${p}rypt1 with every round written
# out explicitly.
#   $p     - "enc" or "dec"
#   $inout - register holding the block (defaults to $inout0)
# Round keys alternate between $rndkey0 and $rndkey1 so the next key is
# loaded while the current round executes. The ${p}192 and ${p}128 labels are
# entry points that skip the leading rounds; the jb/je branches pick one of
# them.
# NOTE(review): the comparison feeding jb/je and the function's return
# instruction are not visible in this excerpt.
115 sub aesni_generate1 # fully unrolled loop
116 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
118 &function_begin_B("_aesni_${p}rypt1");
# Whitening with round key 0, then preload keys 1 and 2.
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
125 &jb (&label("${p}128"));
# Each longer schedule moves $key another 0x20 bytes forward, which is why
# the loads below use negative offsets.
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
# Rounds shared by all entry points.
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
158 &function_end_B("_aesni_${p}rypt1");
161 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block encryption entry point. The out-of-line helper is only
# generated when inlining is disabled.
162 &aesni_generate1("enc") if (!$inline);
163 &function_begin_B("${PREFIX}_encrypt");
# eax first holds the input pointer, then is reused for the output pointer
# once the block has been loaded into $inout0.
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
167 &mov ($rounds,&DWP(240,$key));
168 &mov ("eax",&wparam(1));
# NOTE(review): the lines choosing between the two braced alternatives below
# are not visible here - presumably `if ($inline)` / `else`; confirm.
170 { &aesni_inline_generate1("enc"); }
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
175 &function_end_B("${PREFIX}_encrypt");
177 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block decryption entry point; mirrors ${PREFIX}_encrypt with the
# "dec" variants. The out-of-line helper is only generated when inlining is
# disabled.
178 &aesni_generate1("dec") if(!$inline);
179 &function_begin_B("${PREFIX}_decrypt");
# eax first holds the input pointer, then is reused for the output pointer
# once the block has been loaded into $inout0.
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
# The round count is read from byte offset 240 of the key structure.
183 &mov ($rounds,&DWP(240,$key));
184 &mov ("eax",&wparam(1));
# NOTE(review): the lines choosing between the two braced alternatives below
# are not visible here - presumably `if ($inline)` / `else`; confirm.
186 { &aesni_inline_generate1("dec"); }
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
191 &function_end_B("${PREFIX}_decrypt");
193 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194 # factor. Why were 3x subroutines originally used in loops? Even though
195 # aes[enc|dec] latency was originally 6, it could be scheduled only
196 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
197 # utilization, i.e. when subroutine's throughput is virtually same as
198 # of non-interleaved subroutine [for number of input blocks up to 3].
199 # This is why it makes no sense to implement 2x subroutine.
200 # aes[enc|dec] latency in next processor generation is 8, but the
201 # instructions can be scheduled every cycle. Optimal interleave for
202 # new processor is therefore 8x, but it's unfeasible to accommodate it
203 # in XMM registers addressable in 32-bit mode and therefore 6x is
# Emits _aesni_${p}rypt3: three blocks ($inout0..$inout2) are run through
# the cipher side by side, sharing each round key.
# NOTE(review): the enclosing `sub` header, the adjustment of $rounds that
# makes it usable as an index, the flag-setting instruction for jnz and the
# return instruction are not visible in this excerpt. The later
# &aesni_generate3(...) calls suggest this is that sub's body; confirm.
209 &function_begin_B("_aesni_${p}rypt3");
# Whitening with round key 0; keys 1 and 2 are preloaded.
210 &$movekey ($rndkey0,&QWP(0,$key));
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &xorps ($inout0,$rndkey0);
214 &pxor ($inout1,$rndkey0);
215 &pxor ($inout2,$rndkey0);
216 &$movekey ($rndkey0,&QWP(32,$key));
# From here on round keys are addressed as $key + $rounds + displacement.
217 &lea ($key,&DWP(32,$key,$rounds));
# Two rounds per iteration, alternating between $rndkey1 and $rndkey0.
221 &set_label("${p}3_loop");
222 eval"&aes${p} ($inout0,$rndkey1)";
223 eval"&aes${p} ($inout1,$rndkey1)";
224 eval"&aes${p} ($inout2,$rndkey1)";
225 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
227 eval"&aes${p} ($inout0,$rndkey0)";
228 eval"&aes${p} ($inout1,$rndkey0)";
229 eval"&aes${p} ($inout2,$rndkey0)";
230 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
231 &jnz (&label("${p}3_loop"));
# Last regular round followed by the final round.
232 eval"&aes${p} ($inout0,$rndkey1)";
233 eval"&aes${p} ($inout1,$rndkey1)";
234 eval"&aes${p} ($inout2,$rndkey1)";
235 eval"&aes${p}last ($inout0,$rndkey0)";
236 eval"&aes${p}last ($inout1,$rndkey0)";
237 eval"&aes${p}last ($inout2,$rndkey0)";
239 &function_end_B("_aesni_${p}rypt3");
242 # 4x interleave is implemented to improve small block performance,
243 # most notably [and naturally] 4 block by ~30%. One can argue that one
244 # should have implemented 5x as well, but improvement would be <20%,
245 # so it's not worth it...
# Emits _aesni_${p}rypt4: same scheme as the 3x routine, with a fourth block
# in $inout3.
# NOTE(review): the enclosing `sub` header, the adjustment of $rounds, the
# flag-setting instruction for jnz and the return instruction are not
# visible in this excerpt.
249 &function_begin_B("_aesni_${p}rypt4");
250 &$movekey ($rndkey0,&QWP(0,$key));
251 &$movekey ($rndkey1,&QWP(16,$key));
# Whitening with round key 0.
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(32,$key));
258 &lea ($key,&DWP(32,$key,$rounds));
# 0F 1F 40 00 is a 4-byte NOP; presumably padding so the loop label that
# follows lands on a preferred boundary - confirm.
260 &data_byte (0x0f,0x1f,0x40,0x00);
# Two rounds per iteration, alternating between $rndkey1 and $rndkey0.
263 &set_label("${p}4_loop");
264 eval"&aes${p} ($inout0,$rndkey1)";
265 eval"&aes${p} ($inout1,$rndkey1)";
266 eval"&aes${p} ($inout2,$rndkey1)";
267 eval"&aes${p} ($inout3,$rndkey1)";
268 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
270 eval"&aes${p} ($inout0,$rndkey0)";
271 eval"&aes${p} ($inout1,$rndkey0)";
272 eval"&aes${p} ($inout2,$rndkey0)";
273 eval"&aes${p} ($inout3,$rndkey0)";
274 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
275 &jnz (&label("${p}4_loop"));
# Last regular round followed by the final round.
277 eval"&aes${p} ($inout0,$rndkey1)";
278 eval"&aes${p} ($inout1,$rndkey1)";
279 eval"&aes${p} ($inout2,$rndkey1)";
280 eval"&aes${p} ($inout3,$rndkey1)";
281 eval"&aes${p}last ($inout0,$rndkey0)";
282 eval"&aes${p}last ($inout1,$rndkey0)";
283 eval"&aes${p}last ($inout2,$rndkey0)";
284 eval"&aes${p}last ($inout3,$rndkey0)";
286 &function_end_B("_aesni_${p}rypt4");
# Emits _aesni_${p}rypt6: six blocks ($inout0..$inout5) processed side by
# side. The prologue interleaves the whitening xors with the first round and
# then jumps into the middle of the loop at _aesni_${p}rypt6_enter. That
# label is declared static so other code in this file can enter there after
# doing its own prologue (the ctr32 loop calls _aesni_encrypt6_enter).
# NOTE(review): the enclosing `sub` header, the adjustment of $rounds, the
# flag-setting instruction for jnz and the return instruction are not
# visible in this excerpt.
292 &function_begin_B("_aesni_${p}rypt6");
293 &static_label("_aesni_${p}rypt6_enter");
294 &$movekey ($rndkey0,&QWP(0,$key));
296 &$movekey ($rndkey1,&QWP(16,$key));
# Whitening (round key 0) overlapped with round 1 (round key 1).
297 &xorps ($inout0,$rndkey0);
298 &pxor ($inout1,$rndkey0); # pxor does better here
299 &pxor ($inout2,$rndkey0);
300 eval"&aes${p} ($inout0,$rndkey1)";
301 &pxor ($inout3,$rndkey0);
302 &pxor ($inout4,$rndkey0);
303 eval"&aes${p} ($inout1,$rndkey1)";
304 &lea ($key,&DWP(32,$key,$rounds));
306 eval"&aes${p} ($inout2,$rndkey1)";
307 &pxor ($inout5,$rndkey0);
309 eval"&aes${p} ($inout3,$rndkey1)";
310 eval"&aes${p} ($inout4,$rndkey1)";
311 eval"&aes${p} ($inout5,$rndkey1)";
312 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
313 &jmp (&label("_aesni_${p}rypt6_enter"));
# Main loop: the $rndkey1 half comes first, the shared entry point sits
# before the $rndkey0 half.
315 &set_label("${p}6_loop",16);
316 eval"&aes${p} ($inout0,$rndkey1)";
317 eval"&aes${p} ($inout1,$rndkey1)";
318 eval"&aes${p} ($inout2,$rndkey1)";
319 eval"&aes${p} ($inout3,$rndkey1)";
320 eval"&aes${p} ($inout4,$rndkey1)";
321 eval"&aes${p} ($inout5,$rndkey1)";
322 &set_label("_aesni_${p}rypt6_enter");
323 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
325 eval"&aes${p} ($inout0,$rndkey0)";
326 eval"&aes${p} ($inout1,$rndkey0)";
327 eval"&aes${p} ($inout2,$rndkey0)";
328 eval"&aes${p} ($inout3,$rndkey0)";
329 eval"&aes${p} ($inout4,$rndkey0)";
330 eval"&aes${p} ($inout5,$rndkey0)";
331 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
332 &jnz (&label("${p}6_loop"));
# Last regular round followed by the final round.
334 eval"&aes${p} ($inout0,$rndkey1)";
335 eval"&aes${p} ($inout1,$rndkey1)";
336 eval"&aes${p} ($inout2,$rndkey1)";
337 eval"&aes${p} ($inout3,$rndkey1)";
338 eval"&aes${p} ($inout4,$rndkey1)";
339 eval"&aes${p} ($inout5,$rndkey1)";
340 eval"&aes${p}last ($inout0,$rndkey0)";
341 eval"&aes${p}last ($inout1,$rndkey0)";
342 eval"&aes${p}last ($inout2,$rndkey0)";
343 eval"&aes${p}last ($inout3,$rndkey0)";
344 eval"&aes${p}last ($inout4,$rndkey0)";
345 eval"&aes${p}last ($inout5,$rndkey0)";
347 &function_end_B("_aesni_${p}rypt6");
# Instantiate the interleaved helpers. The "dec" variants are always
# generated; the "enc" variants only when $PREFIX is "aesni".
349 &aesni_generate3("enc") if ($PREFIX eq "aesni");
350 &aesni_generate3("dec");
351 &aesni_generate4("enc") if ($PREFIX eq "aesni");
352 &aesni_generate4("dec");
353 &aesni_generate6("enc") if ($PREFIX eq "aesni");
354 &aesni_generate6("dec");
# Opens a section emitted only for the "aesni" prefix; its closing brace is
# not visible in this excerpt.
356 if ($PREFIX eq "aesni") {
357 ######################################################################
358 # void aesni_ecb_encrypt (const void *in, void *out,
359 # size_t length, const AES_KEY *key,
# Emits aesni_ecb_encrypt. Arguments 0-3 are loaded into $inp, $out, $len
# and $key. Argument 4 lands in $rounds_ and selects the direction: zero
# branches to the ecb_decrypt half, non-zero falls through to encryption.
# Each half has a main loop that pushes six blocks (0x60 bytes) per pass
# through _aesni_[en|de]crypt6, then a tail that hands the remaining blocks
# to the 1x, 3x, 4x or 6x routine.
# NOTE(review): the cmp/sub/and instructions that set the flags consumed by
# the jz/jb/je/jnc branches are not visible in this excerpt.
361 &function_begin("aesni_ecb_encrypt");
362 &mov ($inp,&wparam(0));
363 &mov ($out,&wparam(1));
364 &mov ($len,&wparam(2));
365 &mov ($key,&wparam(3));
366 &mov ($rounds_,&wparam(4));
368 &jz (&label("ecb_ret"));
369 &mov ($rounds,&DWP(240,$key));
370 &test ($rounds_,$rounds_);
371 &jz (&label("ecb_decrypt"));
# ---- encryption half ----
# The helpers modify $key and $rounds, so spare copies are kept and put back
# after every call.
373 &mov ($key_,$key); # backup $key
374 &mov ($rounds_,$rounds); # backup $rounds
376 &jb (&label("ecb_enc_tail"));
# Preload the first six blocks and enter the loop past its store/reload part.
378 &movdqu ($inout0,&QWP(0,$inp));
379 &movdqu ($inout1,&QWP(0x10,$inp));
380 &movdqu ($inout2,&QWP(0x20,$inp));
381 &movdqu ($inout3,&QWP(0x30,$inp));
382 &movdqu ($inout4,&QWP(0x40,$inp));
383 &movdqu ($inout5,&QWP(0x50,$inp));
384 &lea ($inp,&DWP(0x60,$inp));
386 &jmp (&label("ecb_enc_loop6_enter"));
# Loop body: store the previous six results interleaved with loading the
# next six inputs.
388 &set_label("ecb_enc_loop6",16);
389 &movups (&QWP(0,$out),$inout0);
390 &movdqu ($inout0,&QWP(0,$inp));
391 &movups (&QWP(0x10,$out),$inout1);
392 &movdqu ($inout1,&QWP(0x10,$inp));
393 &movups (&QWP(0x20,$out),$inout2);
394 &movdqu ($inout2,&QWP(0x20,$inp));
395 &movups (&QWP(0x30,$out),$inout3);
396 &movdqu ($inout3,&QWP(0x30,$inp));
397 &movups (&QWP(0x40,$out),$inout4);
398 &movdqu ($inout4,&QWP(0x40,$inp));
399 &movups (&QWP(0x50,$out),$inout5);
400 &lea ($out,&DWP(0x60,$out));
401 &movdqu ($inout5,&QWP(0x50,$inp));
402 &lea ($inp,&DWP(0x60,$inp));
403 &set_label("ecb_enc_loop6_enter");
405 &call ("_aesni_encrypt6");
407 &mov ($key,$key_); # restore $key
408 &mov ($rounds,$rounds_); # restore $rounds
410 &jnc (&label("ecb_enc_loop6"));
# Flush the results of the final loop pass.
412 &movups (&QWP(0,$out),$inout0);
413 &movups (&QWP(0x10,$out),$inout1);
414 &movups (&QWP(0x20,$out),$inout2);
415 &movups (&QWP(0x30,$out),$inout3);
416 &movups (&QWP(0x40,$out),$inout4);
417 &movups (&QWP(0x50,$out),$inout5);
418 &lea ($out,&DWP(0x60,$out));
420 &jz (&label("ecb_ret"));
# Tail: load blocks one at a time and branch out as soon as the remaining
# count is matched. Falling through all branches means five blocks are left.
422 &set_label("ecb_enc_tail");
423 &movups ($inout0,&QWP(0,$inp));
425 &jb (&label("ecb_enc_one"));
426 &movups ($inout1,&QWP(0x10,$inp));
427 &je (&label("ecb_enc_two"));
428 &movups ($inout2,&QWP(0x20,$inp));
430 &jb (&label("ecb_enc_three"));
431 &movups ($inout3,&QWP(0x30,$inp));
432 &je (&label("ecb_enc_four"));
433 &movups ($inout4,&QWP(0x40,$inp));
# Five blocks: run the 6x routine with a zeroed sixth lane and store five.
434 &xorps ($inout5,$inout5);
435 &call ("_aesni_encrypt6");
436 &movups (&QWP(0,$out),$inout0);
437 &movups (&QWP(0x10,$out),$inout1);
438 &movups (&QWP(0x20,$out),$inout2);
439 &movups (&QWP(0x30,$out),$inout3);
440 &movups (&QWP(0x40,$out),$inout4);
# Written with the same `&` call form as every other emitter in this file.
441 &jmp (&label("ecb_ret"));
443 &set_label("ecb_enc_one",16);
# NOTE(review): the selector between the two braced alternatives is not
# visible here - presumably `if ($inline)` / `else`.
445 { &aesni_inline_generate1("enc"); }
447 { &call ("_aesni_encrypt1"); }
448 &movups (&QWP(0,$out),$inout0);
449 &jmp (&label("ecb_ret"));
# Two blocks: use the 3x routine with a zeroed third lane and store two.
451 &set_label("ecb_enc_two",16);
452 &xorps ($inout2,$inout2);
453 &call ("_aesni_encrypt3");
454 &movups (&QWP(0,$out),$inout0);
455 &movups (&QWP(0x10,$out),$inout1);
456 &jmp (&label("ecb_ret"));
458 &set_label("ecb_enc_three",16);
459 &call ("_aesni_encrypt3");
460 &movups (&QWP(0,$out),$inout0);
461 &movups (&QWP(0x10,$out),$inout1);
462 &movups (&QWP(0x20,$out),$inout2);
463 &jmp (&label("ecb_ret"));
465 &set_label("ecb_enc_four",16);
466 &call ("_aesni_encrypt4");
467 &movups (&QWP(0,$out),$inout0);
468 &movups (&QWP(0x10,$out),$inout1);
469 &movups (&QWP(0x20,$out),$inout2);
470 &movups (&QWP(0x30,$out),$inout3);
471 &jmp (&label("ecb_ret"));
472 ######################################################################
# ---- decryption half: same structure with the _aesni_decrypt* helpers ----
473 &set_label("ecb_decrypt",16);
474 &mov ($key_,$key); # backup $key
475 &mov ($rounds_,$rounds); # backup $rounds
477 &jb (&label("ecb_dec_tail"));
479 &movdqu ($inout0,&QWP(0,$inp));
480 &movdqu ($inout1,&QWP(0x10,$inp));
481 &movdqu ($inout2,&QWP(0x20,$inp));
482 &movdqu ($inout3,&QWP(0x30,$inp));
483 &movdqu ($inout4,&QWP(0x40,$inp));
484 &movdqu ($inout5,&QWP(0x50,$inp));
485 &lea ($inp,&DWP(0x60,$inp));
487 &jmp (&label("ecb_dec_loop6_enter"));
489 &set_label("ecb_dec_loop6",16);
490 &movups (&QWP(0,$out),$inout0);
491 &movdqu ($inout0,&QWP(0,$inp));
492 &movups (&QWP(0x10,$out),$inout1);
493 &movdqu ($inout1,&QWP(0x10,$inp));
494 &movups (&QWP(0x20,$out),$inout2);
495 &movdqu ($inout2,&QWP(0x20,$inp));
496 &movups (&QWP(0x30,$out),$inout3);
497 &movdqu ($inout3,&QWP(0x30,$inp));
498 &movups (&QWP(0x40,$out),$inout4);
499 &movdqu ($inout4,&QWP(0x40,$inp));
500 &movups (&QWP(0x50,$out),$inout5);
501 &lea ($out,&DWP(0x60,$out));
502 &movdqu ($inout5,&QWP(0x50,$inp));
503 &lea ($inp,&DWP(0x60,$inp));
504 &set_label("ecb_dec_loop6_enter");
506 &call ("_aesni_decrypt6");
508 &mov ($key,$key_); # restore $key
509 &mov ($rounds,$rounds_); # restore $rounds
511 &jnc (&label("ecb_dec_loop6"));
513 &movups (&QWP(0,$out),$inout0);
514 &movups (&QWP(0x10,$out),$inout1);
515 &movups (&QWP(0x20,$out),$inout2);
516 &movups (&QWP(0x30,$out),$inout3);
517 &movups (&QWP(0x40,$out),$inout4);
518 &movups (&QWP(0x50,$out),$inout5);
519 &lea ($out,&DWP(0x60,$out));
521 &jz (&label("ecb_ret"));
523 &set_label("ecb_dec_tail");
524 &movups ($inout0,&QWP(0,$inp));
526 &jb (&label("ecb_dec_one"));
527 &movups ($inout1,&QWP(0x10,$inp));
528 &je (&label("ecb_dec_two"));
529 &movups ($inout2,&QWP(0x20,$inp));
531 &jb (&label("ecb_dec_three"));
532 &movups ($inout3,&QWP(0x30,$inp));
533 &je (&label("ecb_dec_four"));
534 &movups ($inout4,&QWP(0x40,$inp));
# Five blocks: run the 6x routine with a zeroed sixth lane and store five.
535 &xorps ($inout5,$inout5);
536 &call ("_aesni_decrypt6");
537 &movups (&QWP(0,$out),$inout0);
538 &movups (&QWP(0x10,$out),$inout1);
539 &movups (&QWP(0x20,$out),$inout2);
540 &movups (&QWP(0x30,$out),$inout3);
541 &movups (&QWP(0x40,$out),$inout4);
542 &jmp (&label("ecb_ret"));
544 &set_label("ecb_dec_one",16);
546 { &aesni_inline_generate1("dec"); }
548 { &call ("_aesni_decrypt1"); }
549 &movups (&QWP(0,$out),$inout0);
550 &jmp (&label("ecb_ret"));
# Two blocks: use the 3x routine with a zeroed third lane and store two.
552 &set_label("ecb_dec_two",16);
553 &xorps ($inout2,$inout2);
554 &call ("_aesni_decrypt3");
555 &movups (&QWP(0,$out),$inout0);
556 &movups (&QWP(0x10,$out),$inout1);
557 &jmp (&label("ecb_ret"));
559 &set_label("ecb_dec_three",16);
560 &call ("_aesni_decrypt3");
561 &movups (&QWP(0,$out),$inout0);
562 &movups (&QWP(0x10,$out),$inout1);
563 &movups (&QWP(0x20,$out),$inout2);
564 &jmp (&label("ecb_ret"));
# The four-block case is last, so it falls straight into ecb_ret.
566 &set_label("ecb_dec_four",16);
567 &call ("_aesni_decrypt4");
568 &movups (&QWP(0,$out),$inout0);
569 &movups (&QWP(0x10,$out),$inout1);
570 &movups (&QWP(0x20,$out),$inout2);
571 &movups (&QWP(0x30,$out),$inout3);
573 &set_label("ecb_ret");
574 &function_end("aesni_ecb_encrypt");
576 ######################################################################
577 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
578 # size_t blocks, const AES_KEY *key,
579 # const char *ivec,char *cmac);
581 # Handles only complete blocks, operates on 64-bit counter and
582 # does not update *ivec! Nor does it finalize CMAC value
583 # (see engine/eng_aesni.c for details)
# Emits aesni_ccm64_encrypt_blocks. Per block, the counter block ($inout0)
# and the running MAC ($cmac) are encrypted side by side with the same round
# keys. The input is xored into the MAC before encryption and into the
# encrypted counter to form the output.
# Stack frame after alignment: 0..15 byte-swap mask, 16..31 counter
# increment, 48 saved value of $key_ (reloaded into esp on exit).
# NOTE(review): the instructions that copy esp into $key_, reserve the
# frame, set the increment values, adjust $rounds for indexing and set the
# flags for the jnz branches are not visible in this excerpt.
586 &function_begin("aesni_ccm64_encrypt_blocks");
587 &mov ($inp,&wparam(0));
588 &mov ($out,&wparam(1));
589 &mov ($len,&wparam(2));
590 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers.
591 &mov ($rounds_,&wparam(4));
592 &mov ($rounds,&wparam(5));
595 &and ("esp",-16); # align stack
596 &mov (&DWP(48,"esp"),$key_);
598 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
599 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
600 &mov ($rounds,&DWP(240,$key));
602 # compose byte-swap control mask for pshufb on stack
603 &mov (&DWP(0,"esp"),0x0c0d0e0f);
604 &mov (&DWP(4,"esp"),0x08090a0b);
605 &mov (&DWP(8,"esp"),0x04050607);
606 &mov (&DWP(12,"esp"),0x00010203);
608 # compose counter increment vector on stack
611 &mov (&DWP(16,"esp"),$rounds_);
612 &mov (&DWP(20,"esp"),$key_);
613 &mov (&DWP(24,"esp"),$key_);
614 &mov (&DWP(28,"esp"),$key_);
# $key_ keeps the start of the key schedule, $key is moved forward for
# indexed access. $inout0 keeps the ivec as loaded, while $ivec is
# byte-swapped so paddq can increment it.
618 &lea ($key_,&DWP(0,$key));
619 &movdqa ($inout3,&QWP(0,"esp"));
620 &movdqa ($inout0,$ivec);
621 &lea ($key,&DWP(32,$key,$rounds));
622 &sub ($rounds_,$rounds);
623 &pshufb ($ivec,$inout3);
# One input block per outer iteration.
625 &set_label("ccm64_enc_outer");
626 &$movekey ($rndkey0,&QWP(0,$key_));
627 &mov ($rounds,$rounds_);
628 &movups ($in0,&QWP(0,$inp));
# Whitening: the counter gets round key 0, the MAC gets round key 0 xor input.
630 &xorps ($inout0,$rndkey0);
631 &$movekey ($rndkey1,&QWP(16,$key_));
632 &xorps ($rndkey0,$in0);
633 &xorps ($cmac,$rndkey0); # cmac^=inp
634 &$movekey ($rndkey0,&QWP(32,$key_));
# Two rounds per iteration for both lanes.
636 &set_label("ccm64_enc2_loop");
637 &aesenc ($inout0,$rndkey1);
638 &aesenc ($cmac,$rndkey1);
639 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
641 &aesenc ($inout0,$rndkey0);
642 &aesenc ($cmac,$rndkey0);
643 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
644 &jnz (&label("ccm64_enc2_loop"));
645 &aesenc ($inout0,$rndkey1);
646 &aesenc ($cmac,$rndkey1);
# Advance the byte-swapped counter while the last rounds are in flight.
647 &paddq ($ivec,&QWP(16,"esp"));
649 &aesenclast ($inout0,$rndkey0);
650 &aesenclast ($cmac,$rndkey0);
652 &lea ($inp,&DWP(16,$inp));
653 &xorps ($in0,$inout0); # inp^=E(ivec)
# Next counter block: copy the incremented counter and swap it back.
654 &movdqa ($inout0,$ivec);
655 &movups (&QWP(0,$out),$in0); # save output
656 &pshufb ($inout0,$inout3);
657 &lea ($out,&DWP(16,$out));
658 &jnz (&label("ccm64_enc_outer"));
# Reload esp from the saved slot and write the MAC back through argument 5.
660 &mov ("esp",&DWP(48,"esp"));
661 &mov ($out,&wparam(5));
662 &movups (&QWP(0,$out),$cmac);
663 &function_end("aesni_ccm64_encrypt_blocks");
# Emits aesni_ccm64_decrypt_blocks. The first counter block is encrypted on
# its own up front. The outer loop then xors the input with the encrypted
# counter, stores the result and, unless that was the last block, encrypts
# the next counter together with the MAC (which has just absorbed the
# decrypted block). After the loop the MAC gets one final single-block
# encryption.
# Stack frame after alignment: 0..15 byte-swap mask, 16..31 counter
# increment, 48 saved value of $key_ (reloaded into esp on exit).
# NOTE(review): the instructions that copy esp into $key_, reserve the
# frame, set the increment values, adjust $rounds for indexing and set the
# flags for jz/jnz are not visible in this excerpt.
665 &function_begin("aesni_ccm64_decrypt_blocks");
666 &mov ($inp,&wparam(0));
667 &mov ($out,&wparam(1));
668 &mov ($len,&wparam(2));
669 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers.
670 &mov ($rounds_,&wparam(4));
671 &mov ($rounds,&wparam(5));
674 &and ("esp",-16); # align stack
675 &mov (&DWP(48,"esp"),$key_);
677 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
678 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
679 &mov ($rounds,&DWP(240,$key));
681 # compose byte-swap control mask for pshufb on stack
682 &mov (&DWP(0,"esp"),0x0c0d0e0f);
683 &mov (&DWP(4,"esp"),0x08090a0b);
684 &mov (&DWP(8,"esp"),0x04050607);
685 &mov (&DWP(12,"esp"),0x00010203);
687 # compose counter increment vector on stack
690 &mov (&DWP(16,"esp"),$rounds_);
691 &mov (&DWP(20,"esp"),$key_);
692 &mov (&DWP(24,"esp"),$key_);
693 &mov (&DWP(28,"esp"),$key_);
695 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
696 &movdqa ($inout0,$ivec);
699 &mov ($rounds_,$rounds);
701 &pshufb ($ivec,$inout3);
# Encrypt the first counter block ($inout0) before entering the loop.
# NOTE(review): the selector between the two braced alternatives is not
# visible here - presumably `if ($inline)` / `else`.
703 { &aesni_inline_generate1("enc"); }
705 { &call ("_aesni_encrypt1"); }
708 &movups ($in0,&QWP(0,$inp)); # load inp
709 &paddq ($ivec,&QWP(16,"esp"));
710 &lea ($inp,&QWP(16,$inp));
# Set up indexed key access for the 2-lane loop: $key is placed past the
# schedule start held in $key_, and $rounds_ takes the loop index value.
711 &sub ($rounds,$rounds_);
712 &lea ($key,&DWP(32,$key_,$rounds_));
713 &mov ($rounds_,$rounds);
714 &jmp (&label("ccm64_dec_outer"));
716 &set_label("ccm64_dec_outer",16);
717 &xorps ($in0,$inout0); # inp ^= E(ivec)
718 &movdqa ($inout0,$ivec);
719 &movups (&QWP(0,$out),$in0); # save output
720 &lea ($out,&DWP(16,$out));
721 &pshufb ($inout0,$inout3);
724 &jz (&label("ccm64_dec_break"));
# Whitening: round key 0 goes into the counter directly and into the MAC by
# way of the decrypted block.
726 &$movekey ($rndkey0,&QWP(0,$key_));
727 &mov ($rounds,$rounds_);
728 &$movekey ($rndkey1,&QWP(16,$key_));
729 &xorps ($in0,$rndkey0);
730 &xorps ($inout0,$rndkey0);
731 &xorps ($cmac,$in0); # cmac^=out
732 &$movekey ($rndkey0,&QWP(32,$key_));
# Two rounds per iteration for both lanes.
734 &set_label("ccm64_dec2_loop");
735 &aesenc ($inout0,$rndkey1);
736 &aesenc ($cmac,$rndkey1);
737 &$movekey ($rndkey1,&QWP(0,$key,$rounds));
739 &aesenc ($inout0,$rndkey0);
740 &aesenc ($cmac,$rndkey0);
741 &$movekey ($rndkey0,&QWP(-16,$key,$rounds));
742 &jnz (&label("ccm64_dec2_loop"));
# Fetch the next input and bump the counter while the last rounds finish.
743 &movups ($in0,&QWP(0,$inp)); # load inp
744 &paddq ($ivec,&QWP(16,"esp"));
745 &aesenc ($inout0,$rndkey1);
746 &aesenc ($cmac,$rndkey1);
747 &aesenclast ($inout0,$rndkey0);
748 &aesenclast ($cmac,$rndkey0);
749 &lea ($inp,&QWP(16,$inp));
750 &jmp (&label("ccm64_dec_outer"));
# Last block done: encrypt the MAC once more on its own. In the inline form
# $in0 is passed in the $ivec slot, so it receives round key 0 and is then
# xored into $cmac.
752 &set_label("ccm64_dec_break",16);
753 &mov ($rounds,&DWP(240,$key_));
756 { &aesni_inline_generate1("enc",$cmac,$in0); }
758 { &call ("_aesni_encrypt1",$cmac); }
# Reload esp from the saved slot and write the MAC back through argument 5.
760 &mov ("esp",&DWP(48,"esp"));
761 &mov ($out,&wparam(5));
762 &movups (&QWP(0,$out),$cmac);
763 &function_end("aesni_ccm64_decrypt_blocks");
766 ######################################################################
767 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
768 # size_t blocks, const AES_KEY *key,
771 # Handles only complete blocks, operates on 32-bit counter and
772 # does not update *ivec! (see crypto/modes/ctr128.c for details)
776 # 16 vector addend: 0,6,6,6
777 # 32 counter-less ivec
778 # 48 1st triplet of counter vector
779 # 64 2nd triplet of counter vector
# Emits aesni_ctr32_encrypt_blocks. The stack offsets listed above describe
# the aligned frame; offset 0 holds the byte-swap mask and offset 80 the
# saved value of $key_ that is reloaded into esp on exit.
# Structure: a single-block shortcut, a main loop handling six blocks per
# pass, and a tail that dispatches to the 1x/3x/4x/6x helpers.
# NOTE(review): the instructions that copy esp into $key_, reserve the
# frame, set the addend values, increment the counter copies between the
# pinsrd steps, adjust $rounds for indexing and set the flags for the
# je/jb/jnc/jz branches are not visible in this excerpt.
782 &function_begin("aesni_ctr32_encrypt_blocks");
783 &mov ($inp,&wparam(0));
784 &mov ($out,&wparam(1));
785 &mov ($len,&wparam(2));
786 &mov ($key,&wparam(3));
# $rounds_ temporarily holds the ivec pointer.
787 &mov ($rounds_,&wparam(4));
790 &and ("esp",-16); # align stack
791 &mov (&DWP(80,"esp"),$key_);
794 &je (&label("ctr32_one_shortcut"));
796 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
798 # compose byte-swap control mask for pshufb on stack
799 &mov (&DWP(0,"esp"),0x0c0d0e0f);
800 &mov (&DWP(4,"esp"),0x08090a0b);
801 &mov (&DWP(8,"esp"),0x04050607);
802 &mov (&DWP(12,"esp"),0x00010203);
804 # compose counter increment vector on stack
807 &mov (&DWP(16,"esp"),$rounds);
808 &mov (&DWP(20,"esp"),$rounds);
809 &mov (&DWP(24,"esp"),$rounds);
810 &mov (&DWP(28,"esp"),$key_);
# Split the ivec: the top dword (counter) goes to $rounds_, and that lane of
# $inout5 is overwritten with $key_.
812 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
813 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
815 &mov ($rounds,&DWP(240,$key)); # key->rounds
817 # compose 2 vectors of 3x32-bit counters
819 &pxor ($rndkey0,$rndkey0);
820 &pxor ($rndkey1,$rndkey1);
821 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
822 &pinsrd ($rndkey0,$rounds_,0);
# The second triplet starts three counts after the first.
823 &lea ($key_,&DWP(3,$rounds_));
824 &pinsrd ($rndkey1,$key_,0);
826 &pinsrd ($rndkey0,$rounds_,1);
828 &pinsrd ($rndkey1,$key_,1);
830 &pinsrd ($rndkey0,$rounds_,2);
832 &pinsrd ($rndkey1,$key_,2);
833 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
834 &pshufb ($rndkey0,$inout0); # byte swap
835 &movdqu ($inout4,&QWP(0,$key)); # key[0]
836 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
837 &pshufb ($rndkey1,$inout0); # byte swap
839 &pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
840 &pshufd ($inout1,$rndkey0,2<<6);
842 &jb (&label("ctr32_tail"));
# Main-loop setup: fold round key 0 into the counter-less ivec once, so the
# loop merges ivec and whitening with a single pxor per block.
843 &pxor ($inout5,$inout4); # counter-less ivec^key[0]
846 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
847 &mov ($key_,$key); # backup $key
848 &sub ($rounds_,$rounds); # backup twisted $rounds
849 &lea ($key,&DWP(32,$key,$rounds));
851 &jmp (&label("ctr32_loop6"));
853 &set_label("ctr32_loop6",16);
854 # inlining _aesni_encrypt6's prologue gives ~6% improvement...
855 &pshufd ($inout2,$rndkey0,1<<6);
856 &movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
857 &pshufd ($inout3,$rndkey1,3<<6);
858 &pxor ($inout0,$rndkey0); # merge counter-less ivec
859 &pshufd ($inout4,$rndkey1,2<<6);
860 &pxor ($inout1,$rndkey0);
861 &pshufd ($inout5,$rndkey1,1<<6);
862 &$movekey ($rndkey1,&QWP(16,$key_));
863 &pxor ($inout2,$rndkey0);
864 &pxor ($inout3,$rndkey0);
865 &aesenc ($inout0,$rndkey1);
866 &pxor ($inout4,$rndkey0);
867 &pxor ($inout5,$rndkey0);
868 &aesenc ($inout1,$rndkey1);
869 &$movekey ($rndkey0,&QWP(32,$key_));
870 &mov ($rounds,$rounds_);
871 &aesenc ($inout2,$rndkey1);
872 &aesenc ($inout3,$rndkey1);
873 &aesenc ($inout4,$rndkey1);
874 &aesenc ($inout5,$rndkey1);
# First round is done above, so enter the 6x routine at its mid-loop label.
876 &call (&label("_aesni_encrypt6_enter"));
# Xor the key stream with six input blocks and store them, interleaved with
# advancing both counter triplets for the next pass.
878 &movups ($rndkey1,&QWP(0,$inp));
879 &movups ($rndkey0,&QWP(0x10,$inp));
880 &xorps ($inout0,$rndkey1);
881 &movups ($rndkey1,&QWP(0x20,$inp));
882 &xorps ($inout1,$rndkey0);
883 &movups (&QWP(0,$out),$inout0);
884 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
885 &xorps ($inout2,$rndkey1);
886 &movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
887 &movups (&QWP(0x10,$out),$inout1);
888 &movups (&QWP(0x20,$out),$inout2);
890 &paddd ($rndkey1,$rndkey0); # 2nd triplet increment
891 &paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
892 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
894 &movups ($inout1,&QWP(0x30,$inp));
895 &movups ($inout2,&QWP(0x40,$inp));
896 &xorps ($inout3,$inout1);
897 &movups ($inout1,&QWP(0x50,$inp));
898 &lea ($inp,&DWP(0x60,$inp));
899 &movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
900 &pshufb ($rndkey0,$inout0); # byte swap
901 &xorps ($inout4,$inout2);
902 &movups (&QWP(0x30,$out),$inout3);
903 &xorps ($inout5,$inout1);
904 &movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
905 &pshufb ($rndkey1,$inout0); # byte swap
906 &movups (&QWP(0x40,$out),$inout4);
907 &pshufd ($inout0,$rndkey0,3<<6);
908 &movups (&QWP(0x50,$out),$inout5);
909 &lea ($out,&DWP(0x60,$out));
911 &pshufd ($inout1,$rndkey0,2<<6);
913 &jnc (&label("ctr32_loop6"));
916 &jz (&label("ctr32_ret"));
# Leaving the loop with blocks remaining: xoring key[0] into the saved
# (ivec^key[0]) value yields the plain counter-less ivec again.
917 &movdqu ($inout5,&QWP(0,$key_));
919 &pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
920 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
# Tail: each counter block is built by or-ing the counter-less ivec into a
# register that has the counter in its upper dword. Branch out as soon as
# the remaining count is matched; falling through means five blocks.
922 &set_label("ctr32_tail");
923 &por ($inout0,$inout5);
925 &jb (&label("ctr32_one"));
927 &pshufd ($inout2,$rndkey0,1<<6);
928 &por ($inout1,$inout5);
929 &je (&label("ctr32_two"));
931 &pshufd ($inout3,$rndkey1,3<<6);
932 &por ($inout2,$inout5);
934 &jb (&label("ctr32_three"));
936 &pshufd ($inout4,$rndkey1,2<<6);
937 &por ($inout3,$inout5);
938 &je (&label("ctr32_four"));
# Five blocks: 6x routine, only five results are used.
940 &por ($inout4,$inout5);
941 &call ("_aesni_encrypt6");
942 &movups ($rndkey1,&QWP(0,$inp));
943 &movups ($rndkey0,&QWP(0x10,$inp));
944 &xorps ($inout0,$rndkey1);
945 &movups ($rndkey1,&QWP(0x20,$inp));
946 &xorps ($inout1,$rndkey0);
947 &movups ($rndkey0,&QWP(0x30,$inp));
948 &xorps ($inout2,$rndkey1);
949 &movups ($rndkey1,&QWP(0x40,$inp));
950 &xorps ($inout3,$rndkey0);
951 &movups (&QWP(0,$out),$inout0);
952 &xorps ($inout4,$rndkey1);
953 &movups (&QWP(0x10,$out),$inout1);
954 &movups (&QWP(0x20,$out),$inout2);
955 &movups (&QWP(0x30,$out),$inout3);
956 &movups (&QWP(0x40,$out),$inout4);
957 &jmp (&label("ctr32_ret"));
# Single-block shortcut: the ivec is used as the counter block as-is.
959 &set_label("ctr32_one_shortcut",16);
960 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
961 &mov ($rounds,&DWP(240,$key));
963 &set_label("ctr32_one");
# NOTE(review): the selector between the two braced alternatives is not
# visible here - presumably `if ($inline)` / `else`.
965 { &aesni_inline_generate1("enc"); }
967 { &call ("_aesni_encrypt1"); }
968 &movups ($in0,&QWP(0,$inp));
969 &xorps ($in0,$inout0);
970 &movups (&QWP(0,$out),$in0);
971 &jmp (&label("ctr32_ret"));
# Two blocks: 3x routine, only two results are used.
973 &set_label("ctr32_two",16);
974 &call ("_aesni_encrypt3");
975 &movups ($inout3,&QWP(0,$inp));
976 &movups ($inout4,&QWP(0x10,$inp));
977 &xorps ($inout0,$inout3);
978 &xorps ($inout1,$inout4);
979 &movups (&QWP(0,$out),$inout0);
980 &movups (&QWP(0x10,$out),$inout1);
981 &jmp (&label("ctr32_ret"));
983 &set_label("ctr32_three",16);
984 &call ("_aesni_encrypt3");
985 &movups ($inout3,&QWP(0,$inp));
986 &movups ($inout4,&QWP(0x10,$inp));
987 &xorps ($inout0,$inout3);
988 &movups ($inout5,&QWP(0x20,$inp));
989 &xorps ($inout1,$inout4);
990 &movups (&QWP(0,$out),$inout0);
991 &xorps ($inout2,$inout5);
992 &movups (&QWP(0x10,$out),$inout1);
993 &movups (&QWP(0x20,$out),$inout2);
994 &jmp (&label("ctr32_ret"));
# The four-block case is last, so it falls straight into ctr32_ret.
996 &set_label("ctr32_four",16);
997 &call ("_aesni_encrypt4");
998 &movups ($inout4,&QWP(0,$inp));
999 &movups ($inout5,&QWP(0x10,$inp));
1000 &movups ($rndkey1,&QWP(0x20,$inp));
1001 &xorps ($inout0,$inout4);
1002 &movups ($rndkey0,&QWP(0x30,$inp));
1003 &xorps ($inout1,$inout5);
1004 &movups (&QWP(0,$out),$inout0);
1005 &xorps ($inout2,$rndkey1);
1006 &movups (&QWP(0x10,$out),$inout1);
1007 &xorps ($inout3,$rndkey0);
1008 &movups (&QWP(0x20,$out),$inout2);
1009 &movups (&QWP(0x30,$out),$inout3);
1011 &set_label("ctr32_ret");
# Reload esp from the slot written right after the stack was aligned.
1012 &mov ("esp",&DWP(80,"esp"));
1013 &function_end("aesni_ctr32_encrypt_blocks");
1015 ######################################################################
1016 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1017 # const AES_KEY *key1, const AES_KEY *key2,
1018 # const unsigned char iv[16]);
# Aliases shared by both XTS routines below: $tweak, $twtmp, $twres and
# $twmask are other names for $rndkey1, $rndkey0, $inout0 and $inout1, so the
# tweak state is clobbered whenever those registers are reused.
1020 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
# aesni_xts_encrypt: emits the XTS encryption routine. Arguments are read with
# wparam(0..5) in the order given by the prototype comment above.
1022 &function_begin("aesni_xts_encrypt");
# Step 1: encrypt the 16-byte clear-text tweak with key2.
1023 &mov ($key,&wparam(4)); # key2
1024 &mov ($inp,&wparam(5)); # clear-text tweak
1026 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1027 &movups ($inout0,&QWP(0,$inp));
# NOTE(review): an inline single-block encryption and a call to
# _aesni_encrypt1 appear back to back; presumably they are the two arms of a
# conditional whose if/else lines are not visible in this listing.
1029 { &aesni_inline_generate1("enc"); }
1031 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments and key1.
1033 &mov ($inp,&wparam(0));
1034 &mov ($out,&wparam(1));
1035 &mov ($len,&wparam(2));
1036 &mov ($key,&wparam(3)); # key1
# Reserve 16*7+8 bytes and align %esp to 16. Frame layout used below:
#   16*0..16*5  per-block tweaks
#   16*6        tweak mask (dwords 0x87,0,1,0)
#   16*7+0      saved $len, 16*7+4 saved stack pointer
1039 &sub ("esp",16*7+8);
1040 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1041 &and ("esp",-16); # align stack
1043 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1044 &mov (&DWP(16*6+4,"esp"),0);
1045 &mov (&DWP(16*6+8,"esp"),1);
1046 &mov (&DWP(16*6+12,"esp"),0);
1047 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
# NOTE(review): the instruction that copies %esp into $key_ before the frame
# is allocated is not visible in this listing.
1048 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
# The encrypted tweak (result of step 1, in $inout0) becomes the running
# tweak. $twtmp receives a per-dword sign mask of it (0 > dword), which the
# update sequences below turn into the carry and the 0x87 residue.
1050 &movdqa ($tweak,$inout0);
1051 &pxor ($twtmp,$twtmp);
1052 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1053 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1056 &mov ($key_,$key); # backup $key
1057 &mov ($rounds_,$rounds); # backup $rounds
# NOTE(review): the instruction that sets the carry flag tested here is not
# visible in this listing; the target handles inputs too short for the
# six-block loop.
1059 &jc (&label("xts_enc_short"));
# Loop invariants: $rounds_ becomes $rounds_-$rounds and $key is advanced to
# $key+$rounds+32. NOTE(review): any scaling of $rounds before this point is
# not visible in this listing.
1063 &sub ($rounds_,$rounds);
1064 &lea ($key,&DWP(32,$key,$rounds));
1065 &jmp (&label("xts_enc_loop6"));
# Main loop: six blocks per iteration.
1067 &set_label("xts_enc_loop6",16);
# Perl-level loop: its body is emitted once per value of $i, each copy storing
# the current tweak at 16*$i(%esp) and then advancing the tweak. Advancing
# means shifting the 128-bit value left by one bit: paddq doubles each 64-bit
# half, and the masked sign bits re-insert the bit-63 carry and xor in 0x87
# when bit 127 was set.
# NOTE(review): the closing brace of this loop is not visible in this listing.
1068 for ($i=0;$i<4;$i++) {
1069 &pshufd ($twres,$twtmp,0x13);
1070 &pxor ($twtmp,$twtmp);
1071 &movdqa (&QWP(16*$i,"esp"),$tweak);
1072 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1073 &pand ($twres,$twmask); # isolate carry and residue
1074 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1075 &pxor ($tweak,$twres);
# Store one more tweak (post-increment of $i) and build the last tweak of the
# group in $inout5, interleaved with loading round key 0 and the first block.
1077 &pshufd ($inout5,$twtmp,0x13);
1078 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1079 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1080 &$movekey ($rndkey0,&QWP(0,$key_));
1081 &pand ($inout5,$twmask); # isolate carry and residue
1082 &movups ($inout0,&QWP(0,$inp)); # load input
1083 &pxor ($inout5,$tweak);
1085 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1086 &mov ($rounds,$rounds_); # restore $rounds
1087 &movdqu ($inout1,&QWP(16*1,$inp));
1088 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1089 &movdqu ($inout2,&QWP(16*2,$inp));
1090 &pxor ($inout1,$rndkey0);
1091 &movdqu ($inout3,&QWP(16*3,$inp));
1092 &pxor ($inout2,$rndkey0);
1093 &movdqu ($inout4,&QWP(16*4,$inp));
1094 &pxor ($inout3,$rndkey0);
# The sixth input block is loaded into $rndkey1 (alias of $tweak, which has
# already been saved) and folded into $inout5 after the last tweak is stored.
1095 &movdqu ($rndkey1,&QWP(16*5,$inp));
1096 &pxor ($inout4,$rndkey0);
1097 &lea ($inp,&DWP(16*6,$inp));
1098 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1099 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1100 &pxor ($inout5,$rndkey1);
# First AES round for all six lanes, interleaved with the remaining tweak
# xors, then continue inside the shared six-block routine.
1102 &$movekey ($rndkey1,&QWP(16,$key_));
1103 &pxor ($inout1,&QWP(16*1,"esp"));
1104 &pxor ($inout2,&QWP(16*2,"esp"));
1105 &aesenc ($inout0,$rndkey1);
1106 &pxor ($inout3,&QWP(16*3,"esp"));
1107 &pxor ($inout4,&QWP(16*4,"esp"));
1108 &aesenc ($inout1,$rndkey1);
1109 &pxor ($inout5,$rndkey0);
1110 &$movekey ($rndkey0,&QWP(32,$key_));
1111 &aesenc ($inout2,$rndkey1);
1112 &aesenc ($inout3,$rndkey1);
1113 &aesenc ($inout4,$rndkey1);
1114 &aesenc ($inout5,$rndkey1);
1115 &call (&label("_aesni_encrypt6_enter"));
# Post-whitening: xor each result with its saved tweak and write it out, while
# reloading the last tweak and the mask to start the next tweak update.
1117 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1118 &pxor ($twtmp,$twtmp);
1119 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1120 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1121 &xorps ($inout1,&QWP(16*1,"esp"));
1122 &movups (&QWP(16*0,$out),$inout0); # write output
1123 &xorps ($inout2,&QWP(16*2,"esp"));
1124 &movups (&QWP(16*1,$out),$inout1);
1125 &xorps ($inout3,&QWP(16*3,"esp"));
1126 &movups (&QWP(16*2,$out),$inout2);
1127 &xorps ($inout4,&QWP(16*4,"esp"));
1128 &movups (&QWP(16*3,$out),$inout3);
1129 &xorps ($inout5,$tweak);
1130 &movups (&QWP(16*4,$out),$inout4);
1131 &pshufd ($twres,$twtmp,0x13);
1132 &movups (&QWP(16*5,$out),$inout5);
1133 &lea ($out,&DWP(16*6,$out));
1134 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
# Advance to the first tweak of the next group.
1136 &pxor ($twtmp,$twtmp);
1137 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1138 &pand ($twres,$twmask); # isolate carry and residue
1139 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1140 &pxor ($tweak,$twres);
# NOTE(review): the instruction that updates the remaining length and sets the
# carry flag tested here is not visible in this listing.
1143 &jnc (&label("xts_enc_loop6"));
# Leaving the loop: undo the loop-invariant adjustments of $rounds/$key.
1145 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1146 &mov ($key,$key_); # restore $key
1147 &mov ($rounds_,$rounds);
# Fewer than six whole blocks remain. The branches below pick the handler that
# matches the number of blocks it loads (one to four; the fall-through path
# handles five), computing one more tweak between branches.
# NOTE(review): the instructions producing the flags for these branches are
# not visible in this listing.
1149 &set_label("xts_enc_short");
1151 &jz (&label("xts_enc_done6x"));
1153 &movdqa ($inout3,$tweak); # put aside previous tweak
1155 &jb (&label("xts_enc_one"));
1157 &pshufd ($twres,$twtmp,0x13);
1158 &pxor ($twtmp,$twtmp);
1159 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1160 &pand ($twres,$twmask); # isolate carry and residue
1161 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1162 &pxor ($tweak,$twres);
1163 &je (&label("xts_enc_two"));
1165 &pshufd ($twres,$twtmp,0x13);
1166 &pxor ($twtmp,$twtmp);
1167 &movdqa ($inout4,$tweak); # put aside previous tweak
1168 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1169 &pand ($twres,$twmask); # isolate carry and residue
1170 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1171 &pxor ($tweak,$twres);
1173 &jb (&label("xts_enc_three"));
1175 &pshufd ($twres,$twtmp,0x13);
1176 &pxor ($twtmp,$twtmp);
1177 &movdqa ($inout5,$tweak); # put aside previous tweak
1178 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1179 &pand ($twres,$twmask); # isolate carry and residue
1180 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1181 &pxor ($tweak,$twres);
# The first two tweaks go to the stack because $inout3/$inout4 are needed as
# data lanes from here on.
1182 &movdqa (&QWP(16*0,"esp"),$inout3);
1183 &movdqa (&QWP(16*1,"esp"),$inout4);
1184 &je (&label("xts_enc_four"));
# Five blocks: tweaks 0..3 live on the stack, the fifth is built in $inout5
# and stored at 16*4(%esp).
1186 &movdqa (&QWP(16*2,"esp"),$inout5);
1187 &pshufd ($inout5,$twtmp,0x13);
1188 &movdqa (&QWP(16*3,"esp"),$tweak);
1189 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1190 &pand ($inout5,$twmask); # isolate carry and residue
1191 &pxor ($inout5,$tweak);
1193 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1194 &movdqu ($inout1,&QWP(16*1,$inp));
1195 &movdqu ($inout2,&QWP(16*2,$inp));
1196 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1197 &movdqu ($inout3,&QWP(16*3,$inp));
1198 &pxor ($inout1,&QWP(16*1,"esp"));
1199 &movdqu ($inout4,&QWP(16*4,$inp));
1200 &pxor ($inout2,&QWP(16*2,"esp"));
1201 &lea ($inp,&DWP(16*5,$inp));
1202 &pxor ($inout3,&QWP(16*3,"esp"));
1203 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1204 &pxor ($inout4,$inout5);
# The six-lane routine is used for five blocks; the sixth lane ($inout5) is
# processed as well but its result is never written out.
1206 &call ("_aesni_encrypt6");
1208 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1209 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1210 &xorps ($inout1,&QWP(16*1,"esp"));
1211 &xorps ($inout2,&QWP(16*2,"esp"));
1212 &movups (&QWP(16*0,$out),$inout0); # write output
1213 &xorps ($inout3,&QWP(16*3,"esp"));
1214 &movups (&QWP(16*1,$out),$inout1);
1215 &xorps ($inout4,$tweak);
1216 &movups (&QWP(16*2,$out),$inout2);
1217 &movups (&QWP(16*3,$out),$inout3);
1218 &movups (&QWP(16*4,$out),$inout4);
1219 &lea ($out,&DWP(16*5,$out));
1220 &jmp (&label("xts_enc_done"));
# One block: the tweak is in $inout3.
1222 &set_label("xts_enc_one",16);
1223 &movups ($inout0,&QWP(16*0,$inp)); # load input
1224 &lea ($inp,&DWP(16*1,$inp));
1225 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call forms again appear back to back; the selecting
# if/else lines are not visible in this listing.
1227 { &aesni_inline_generate1("enc"); }
1229 { &call ("_aesni_encrypt1"); }
1230 &xorps ($inout0,$inout3); # output^=tweak
1231 &movups (&QWP(16*0,$out),$inout0); # write output
1232 &lea ($out,&DWP(16*1,$out));
1234 &movdqa ($tweak,$inout3); # last tweak
1235 &jmp (&label("xts_enc_done"));
# Two blocks: tweaks in $inout3 and $inout4; the third lane of the three-block
# routine is zeroed and its result ignored.
1237 &set_label("xts_enc_two",16);
1238 &movaps ($inout4,$tweak); # put aside last tweak
1240 &movups ($inout0,&QWP(16*0,$inp)); # load input
1241 &movups ($inout1,&QWP(16*1,$inp));
1242 &lea ($inp,&DWP(16*2,$inp));
1243 &xorps ($inout0,$inout3); # input^=tweak
1244 &xorps ($inout1,$inout4);
1245 &xorps ($inout2,$inout2);
1247 &call ("_aesni_encrypt3");
1249 &xorps ($inout0,$inout3); # output^=tweak
1250 &xorps ($inout1,$inout4);
1251 &movups (&QWP(16*0,$out),$inout0); # write output
1252 &movups (&QWP(16*1,$out),$inout1);
1253 &lea ($out,&DWP(16*2,$out));
1255 &movdqa ($tweak,$inout4); # last tweak
1256 &jmp (&label("xts_enc_done"));
# Three blocks: tweaks in $inout3, $inout4 and $inout5.
1258 &set_label("xts_enc_three",16);
1259 &movaps ($inout5,$tweak); # put aside last tweak
1260 &movups ($inout0,&QWP(16*0,$inp)); # load input
1261 &movups ($inout1,&QWP(16*1,$inp));
1262 &movups ($inout2,&QWP(16*2,$inp));
1263 &lea ($inp,&DWP(16*3,$inp));
1264 &xorps ($inout0,$inout3); # input^=tweak
1265 &xorps ($inout1,$inout4);
1266 &xorps ($inout2,$inout5);
1268 &call ("_aesni_encrypt3");
1270 &xorps ($inout0,$inout3); # output^=tweak
1271 &xorps ($inout1,$inout4);
1272 &xorps ($inout2,$inout5);
1273 &movups (&QWP(16*0,$out),$inout0); # write output
1274 &movups (&QWP(16*1,$out),$inout1);
1275 &movups (&QWP(16*2,$out),$inout2);
1276 &lea ($out,&DWP(16*3,$out));
1278 &movdqa ($tweak,$inout5); # last tweak
1279 &jmp (&label("xts_enc_done"));
# Four blocks: tweaks 0 and 1 are on the stack, tweak 2 is in $inout5 and
# tweak 3 is copied from $tweak into $inout4.
1281 &set_label("xts_enc_four",16);
1282 &movaps ($inout4,$tweak); # put aside last tweak
1284 &movups ($inout0,&QWP(16*0,$inp)); # load input
1285 &movups ($inout1,&QWP(16*1,$inp));
1286 &movups ($inout2,&QWP(16*2,$inp));
1287 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1288 &movups ($inout3,&QWP(16*3,$inp));
1289 &lea ($inp,&DWP(16*4,$inp));
1290 &xorps ($inout1,&QWP(16*1,"esp"));
1291 &xorps ($inout2,$inout5);
1292 &xorps ($inout3,$inout4);
1294 &call ("_aesni_encrypt4");
1296 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1297 &xorps ($inout1,&QWP(16*1,"esp"));
1298 &xorps ($inout2,$inout5);
1299 &movups (&QWP(16*0,$out),$inout0); # write output
1300 &xorps ($inout3,$inout4);
1301 &movups (&QWP(16*1,$out),$inout1);
1302 &movups (&QWP(16*2,$out),$inout2);
1303 &movups (&QWP(16*3,$out),$inout3);
1304 &lea ($out,&DWP(16*4,$out));
1306 &movdqa ($tweak,$inout4); # last tweak
1307 &jmp (&label("xts_enc_done"));
# Reached when no whole blocks were left after the six-block loop; $tweak
# already holds the next tweak, so it is used as-is for the stolen block.
# NOTE(review): the instruction that reduces $len and sets the zero flag
# tested below is not visible in this listing.
1309 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1310 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1312 &jz (&label("xts_enc_ret"));
1313 &movdqa ($inout3,$tweak);
1314 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1315 &jmp (&label("xts_enc_steal"));
# Reached from the short handlers with $tweak holding the last tweak used; if
# a partial tail remains, the next tweak is derived into $inout3.
# NOTE(review): the flag-setting instruction for the jz below is not visible
# in this listing.
1317 &set_label("xts_enc_done",16);
1318 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1319 &pxor ($twtmp,$twtmp);
1321 &jz (&label("xts_enc_ret"));
1323 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1324 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1325 &pshufd ($inout3,$twtmp,0x13);
1326 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1327 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1328 &pxor ($inout3,$tweak);
# Byte-wise swap loop: each pass copies one tail input byte into the previous
# output block (at -16($out)) and moves the byte it displaces to the current
# output position. $rounds and $key are used as byte scratch registers.
# NOTE(review): the loop-counter update that sets the zero flag for the jnz
# is not visible in this listing.
1330 &set_label("xts_enc_steal");
1331 &movz ($rounds,&BP(0,$inp));
1332 &movz ($key,&BP(-16,$out));
1333 &lea ($inp,&DWP(1,$inp));
1334 &mov (&BP(-16,$out),&LB($rounds));
1335 &mov (&BP(0,$out),&LB($key));
1336 &lea ($out,&DWP(1,$out));
1338 &jnz (&label("xts_enc_steal"));
# Re-encrypt the patched block at -16($out) in place with the tweak kept in
# $inout3, after rewinding $out by the saved tail length.
1340 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1341 &mov ($key,$key_); # restore $key
1342 &mov ($rounds,$rounds_); # restore $rounds
1344 &movups ($inout0,&QWP(-16,$out)); # load input
1345 &xorps ($inout0,$inout3); # input^=tweak
1347 { &aesni_inline_generate1("enc"); }
1349 { &call ("_aesni_encrypt1"); }
1350 &xorps ($inout0,$inout3); # output^=tweak
1351 &movups (&QWP(-16,$out),$inout0); # write output
# Common exit: reload the stack pointer saved at 16*7+4 in the frame.
1353 &set_label("xts_enc_ret");
1354 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1355 &function_end("aesni_xts_encrypt");
# aesni_xts_decrypt: emits the XTS decryption routine. Arguments are read with
# wparam(0..5): inp, out, len, key1, key2 and the clear-text tweak. It uses
# the $tweak/$twtmp/$twres/$twmask register aliases declared before
# aesni_xts_encrypt.
1357 &function_begin("aesni_xts_decrypt");
# Step 1: encrypt (not decrypt) the 16-byte clear-text tweak with key2.
1358 &mov ($key,&wparam(4)); # key2
1359 &mov ($inp,&wparam(5)); # clear-text tweak
1361 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1362 &movups ($inout0,&QWP(0,$inp));
# NOTE(review): an inline single-block encryption and a call to
# _aesni_encrypt1 appear back to back; presumably they are the two arms of a
# conditional whose if/else lines are not visible in this listing.
1364 { &aesni_inline_generate1("enc"); }
1366 { &call ("_aesni_encrypt1"); }
# Step 2: load the data arguments and key1.
1368 &mov ($inp,&wparam(0));
1369 &mov ($out,&wparam(1));
1370 &mov ($len,&wparam(2));
1371 &mov ($key,&wparam(3)); # key1
# Reserve 16*7+8 bytes and align %esp to 16. Frame layout used below:
#   16*0..16*5  per-block tweaks
#   16*6        tweak mask (dwords 0x87,0,1,0)
#   16*7+0      saved $len, 16*7+4 saved stack pointer
1374 &sub ("esp",16*7+8);
1375 &and ("esp",-16); # align stack
# When the input has a partial tail, one whole block is held back so that it
# can be handled together with the tail at the end.
# NOTE(review): the test that sets the flags for setnz, and any scaling of
# $rounds_ before the subtraction, are not visible in this listing.
1377 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1379 &setnz (&LB($rounds_));
1381 &sub ($len,$rounds_);
1383 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1384 &mov (&DWP(16*6+4,"esp"),0);
1385 &mov (&DWP(16*6+8,"esp"),1);
1386 &mov (&DWP(16*6+12,"esp"),0);
1387 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
# NOTE(review): the instruction that copies %esp into $key_ before the frame
# is allocated is not visible in this listing.
1388 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1390 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1391 &mov ($key_,$key); # backup $key
1392 &mov ($rounds_,$rounds); # backup $rounds
# The encrypted tweak (result of step 1, in $inout0) becomes the running
# tweak. $twtmp receives a per-dword sign mask of it (0 > dword), which the
# update sequences below turn into the carry and the 0x87 residue.
1394 &movdqa ($tweak,$inout0);
1395 &pxor ($twtmp,$twtmp);
1396 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1397 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
# NOTE(review): the instruction that sets the carry flag tested here is not
# visible in this listing; the target handles inputs too short for the
# six-block loop.
1401 &jc (&label("xts_dec_short"));
# Loop invariants: $rounds_ becomes $rounds_-$rounds and $key is advanced to
# $key+$rounds+32. NOTE(review): any scaling of $rounds before this point is
# not visible in this listing.
1405 &sub ($rounds_,$rounds);
1406 &lea ($key,&DWP(32,$key,$rounds));
1407 &jmp (&label("xts_dec_loop6"));
# Main loop: six blocks per iteration.
1409 &set_label("xts_dec_loop6",16);
# Perl-level loop: its body is emitted once per value of $i, each copy storing
# the current tweak at 16*$i(%esp) and then advancing the tweak. Advancing
# means shifting the 128-bit value left by one bit: paddq doubles each 64-bit
# half, and the masked sign bits re-insert the bit-63 carry and xor in 0x87
# when bit 127 was set.
# NOTE(review): the closing brace of this loop is not visible in this listing.
1410 for ($i=0;$i<4;$i++) {
1411 &pshufd ($twres,$twtmp,0x13);
1412 &pxor ($twtmp,$twtmp);
1413 &movdqa (&QWP(16*$i,"esp"),$tweak);
1414 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1415 &pand ($twres,$twmask); # isolate carry and residue
1416 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1417 &pxor ($tweak,$twres);
# Store one more tweak (post-increment of $i) and build the last tweak of the
# group in $inout5, interleaved with loading round key 0 and the first block.
1419 &pshufd ($inout5,$twtmp,0x13);
1420 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1421 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1422 &$movekey ($rndkey0,&QWP(0,$key_));
1423 &pand ($inout5,$twmask); # isolate carry and residue
1424 &movups ($inout0,&QWP(0,$inp)); # load input
1425 &pxor ($inout5,$tweak);
1427 # inline _aesni_decrypt6 prologue and flip xor with tweak and key[0]
1428 &mov ($rounds,$rounds_);
1429 &movdqu ($inout1,&QWP(16*1,$inp));
1430 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1431 &movdqu ($inout2,&QWP(16*2,$inp));
1432 &pxor ($inout1,$rndkey0);
1433 &movdqu ($inout3,&QWP(16*3,$inp));
1434 &pxor ($inout2,$rndkey0);
1435 &movdqu ($inout4,&QWP(16*4,$inp));
1436 &pxor ($inout3,$rndkey0);
# The sixth input block is loaded into $rndkey1 (alias of $tweak, which has
# already been saved) and folded into $inout5 after the last tweak is stored.
1437 &movdqu ($rndkey1,&QWP(16*5,$inp));
1438 &pxor ($inout4,$rndkey0);
1439 &lea ($inp,&DWP(16*6,$inp));
1440 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1441 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1442 &pxor ($inout5,$rndkey1);
# First AES decryption round for all six lanes, interleaved with the remaining
# tweak xors, then continue inside the shared six-block routine.
1444 &$movekey ($rndkey1,&QWP(16,$key_));
1445 &pxor ($inout1,&QWP(16*1,"esp"));
1446 &pxor ($inout2,&QWP(16*2,"esp"));
1447 &aesdec ($inout0,$rndkey1);
1448 &pxor ($inout3,&QWP(16*3,"esp"));
1449 &pxor ($inout4,&QWP(16*4,"esp"));
1450 &aesdec ($inout1,$rndkey1);
1451 &pxor ($inout5,$rndkey0);
1452 &$movekey ($rndkey0,&QWP(32,$key_));
1453 &aesdec ($inout2,$rndkey1);
1454 &aesdec ($inout3,$rndkey1);
1455 &aesdec ($inout4,$rndkey1);
1456 &aesdec ($inout5,$rndkey1);
1457 &call (&label("_aesni_decrypt6_enter"));
# Post-whitening: xor each result with its saved tweak and write it out, while
# reloading the last tweak and the mask to start the next tweak update.
1459 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1460 &pxor ($twtmp,$twtmp);
1461 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1462 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1463 &xorps ($inout1,&QWP(16*1,"esp"));
1464 &movups (&QWP(16*0,$out),$inout0); # write output
1465 &xorps ($inout2,&QWP(16*2,"esp"));
1466 &movups (&QWP(16*1,$out),$inout1);
1467 &xorps ($inout3,&QWP(16*3,"esp"));
1468 &movups (&QWP(16*2,$out),$inout2);
1469 &xorps ($inout4,&QWP(16*4,"esp"));
1470 &movups (&QWP(16*3,$out),$inout3);
1471 &xorps ($inout5,$tweak);
1472 &movups (&QWP(16*4,$out),$inout4);
1473 &pshufd ($twres,$twtmp,0x13);
1474 &movups (&QWP(16*5,$out),$inout5);
1475 &lea ($out,&DWP(16*6,$out));
1476 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
# Advance to the first tweak of the next group.
1478 &pxor ($twtmp,$twtmp);
1479 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1480 &pand ($twres,$twmask); # isolate carry and residue
1481 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1482 &pxor ($tweak,$twres);
# NOTE(review): the instruction that updates the remaining length and sets the
# carry flag tested here is not visible in this listing.
1485 &jnc (&label("xts_dec_loop6"));
# Leaving the loop: undo the loop-invariant adjustments of $rounds/$key.
1487 &mov ($rounds,&DWP(240,$key_)); # restore $rounds
1488 &mov ($key,$key_); # restore $key
1489 &mov ($rounds_,$rounds);
# Fewer than six whole blocks remain. The branches below pick the handler that
# matches the number of blocks it loads (one to four; the fall-through path
# handles five), computing one more tweak between branches.
# NOTE(review): the instructions producing the flags for these branches are
# not visible in this listing.
1491 &set_label("xts_dec_short");
1493 &jz (&label("xts_dec_done6x"));
1495 &movdqa ($inout3,$tweak); # put aside previous tweak
1497 &jb (&label("xts_dec_one"));
1499 &pshufd ($twres,$twtmp,0x13);
1500 &pxor ($twtmp,$twtmp);
1501 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1502 &pand ($twres,$twmask); # isolate carry and residue
1503 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1504 &pxor ($tweak,$twres);
1505 &je (&label("xts_dec_two"));
1507 &pshufd ($twres,$twtmp,0x13);
1508 &pxor ($twtmp,$twtmp);
1509 &movdqa ($inout4,$tweak); # put aside previous tweak
1510 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1511 &pand ($twres,$twmask); # isolate carry and residue
1512 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1513 &pxor ($tweak,$twres);
1515 &jb (&label("xts_dec_three"));
1517 &pshufd ($twres,$twtmp,0x13);
1518 &pxor ($twtmp,$twtmp);
1519 &movdqa ($inout5,$tweak); # put aside previous tweak
1520 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1521 &pand ($twres,$twmask); # isolate carry and residue
1522 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1523 &pxor ($tweak,$twres);
# The first two tweaks go to the stack because $inout3/$inout4 are needed as
# data lanes from here on.
1524 &movdqa (&QWP(16*0,"esp"),$inout3);
1525 &movdqa (&QWP(16*1,"esp"),$inout4);
1526 &je (&label("xts_dec_four"));
# Five blocks: tweaks 0..3 live on the stack, the fifth is built in $inout5
# and stored at 16*4(%esp).
1528 &movdqa (&QWP(16*2,"esp"),$inout5);
1529 &pshufd ($inout5,$twtmp,0x13);
1530 &movdqa (&QWP(16*3,"esp"),$tweak);
1531 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1532 &pand ($inout5,$twmask); # isolate carry and residue
1533 &pxor ($inout5,$tweak);
1535 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1536 &movdqu ($inout1,&QWP(16*1,$inp));
1537 &movdqu ($inout2,&QWP(16*2,$inp));
1538 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1539 &movdqu ($inout3,&QWP(16*3,$inp));
1540 &pxor ($inout1,&QWP(16*1,"esp"));
1541 &movdqu ($inout4,&QWP(16*4,$inp));
1542 &pxor ($inout2,&QWP(16*2,"esp"));
1543 &lea ($inp,&DWP(16*5,$inp));
1544 &pxor ($inout3,&QWP(16*3,"esp"));
1545 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1546 &pxor ($inout4,$inout5);
# The six-lane routine is used for five blocks; the sixth lane ($inout5) is
# processed as well but its result is never written out.
1548 &call ("_aesni_decrypt6");
1550 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1551 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1552 &xorps ($inout1,&QWP(16*1,"esp"));
1553 &xorps ($inout2,&QWP(16*2,"esp"));
1554 &movups (&QWP(16*0,$out),$inout0); # write output
1555 &xorps ($inout3,&QWP(16*3,"esp"));
1556 &movups (&QWP(16*1,$out),$inout1);
1557 &xorps ($inout4,$tweak);
1558 &movups (&QWP(16*2,$out),$inout2);
1559 &movups (&QWP(16*3,$out),$inout3);
1560 &movups (&QWP(16*4,$out),$inout4);
1561 &lea ($out,&DWP(16*5,$out));
1562 &jmp (&label("xts_dec_done"));
# One block: the tweak is in $inout3.
1564 &set_label("xts_dec_one",16);
1565 &movups ($inout0,&QWP(16*0,$inp)); # load input
1566 &lea ($inp,&DWP(16*1,$inp));
1567 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call forms again appear back to back; the selecting
# if/else lines are not visible in this listing.
1569 { &aesni_inline_generate1("dec"); }
1571 { &call ("_aesni_decrypt1"); }
1572 &xorps ($inout0,$inout3); # output^=tweak
1573 &movups (&QWP(16*0,$out),$inout0); # write output
1574 &lea ($out,&DWP(16*1,$out));
1576 &movdqa ($tweak,$inout3); # last tweak
1577 &jmp (&label("xts_dec_done"));
# Two blocks: tweaks in $inout3 and $inout4. The three-block routine is used;
# only the first two results are written out.
1579 &set_label("xts_dec_two",16);
1580 &movaps ($inout4,$tweak); # put aside last tweak
1582 &movups ($inout0,&QWP(16*0,$inp)); # load input
1583 &movups ($inout1,&QWP(16*1,$inp));
1584 &lea ($inp,&DWP(16*2,$inp));
1585 &xorps ($inout0,$inout3); # input^=tweak
1586 &xorps ($inout1,$inout4);
1588 &call ("_aesni_decrypt3");
1590 &xorps ($inout0,$inout3); # output^=tweak
1591 &xorps ($inout1,$inout4);
1592 &movups (&QWP(16*0,$out),$inout0); # write output
1593 &movups (&QWP(16*1,$out),$inout1);
1594 &lea ($out,&DWP(16*2,$out));
1596 &movdqa ($tweak,$inout4); # last tweak
1597 &jmp (&label("xts_dec_done"));
# Three blocks: tweaks in $inout3, $inout4 and $inout5.
1599 &set_label("xts_dec_three",16);
1600 &movaps ($inout5,$tweak); # put aside last tweak
1601 &movups ($inout0,&QWP(16*0,$inp)); # load input
1602 &movups ($inout1,&QWP(16*1,$inp));
1603 &movups ($inout2,&QWP(16*2,$inp));
1604 &lea ($inp,&DWP(16*3,$inp));
1605 &xorps ($inout0,$inout3); # input^=tweak
1606 &xorps ($inout1,$inout4);
1607 &xorps ($inout2,$inout5);
1609 &call ("_aesni_decrypt3");
1611 &xorps ($inout0,$inout3); # output^=tweak
1612 &xorps ($inout1,$inout4);
1613 &xorps ($inout2,$inout5);
1614 &movups (&QWP(16*0,$out),$inout0); # write output
1615 &movups (&QWP(16*1,$out),$inout1);
1616 &movups (&QWP(16*2,$out),$inout2);
1617 &lea ($out,&DWP(16*3,$out));
1619 &movdqa ($tweak,$inout5); # last tweak
1620 &jmp (&label("xts_dec_done"));
# Four blocks: tweaks 0 and 1 are on the stack, tweak 2 is in $inout5 and
# tweak 3 is copied from $tweak into $inout4.
1622 &set_label("xts_dec_four",16);
1623 &movaps ($inout4,$tweak); # put aside last tweak
1625 &movups ($inout0,&QWP(16*0,$inp)); # load input
1626 &movups ($inout1,&QWP(16*1,$inp));
1627 &movups ($inout2,&QWP(16*2,$inp));
1628 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1629 &movups ($inout3,&QWP(16*3,$inp));
1630 &lea ($inp,&DWP(16*4,$inp));
1631 &xorps ($inout1,&QWP(16*1,"esp"));
1632 &xorps ($inout2,$inout5);
1633 &xorps ($inout3,$inout4);
1635 &call ("_aesni_decrypt4");
1637 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1638 &xorps ($inout1,&QWP(16*1,"esp"));
1639 &xorps ($inout2,$inout5);
1640 &movups (&QWP(16*0,$out),$inout0); # write output
1641 &xorps ($inout3,$inout4);
1642 &movups (&QWP(16*1,$out),$inout1);
1643 &movups (&QWP(16*2,$out),$inout2);
1644 &movups (&QWP(16*3,$out),$inout3);
1645 &lea ($out,&DWP(16*4,$out));
1647 &movdqa ($tweak,$inout4); # last tweak
1648 &jmp (&label("xts_dec_done"));
# Reached when no whole blocks were left after the six-block loop; $tweak
# already holds the next tweak, so the extra update in xts_dec_done is skipped.
# NOTE(review): the instruction that reduces $len and sets the zero flag
# tested below is not visible in this listing.
1650 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1651 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1653 &jz (&label("xts_dec_ret"));
1654 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1655 &jmp (&label("xts_dec_only_one_more"));
# Reached from the short handlers with $tweak holding the last tweak used; if
# a partial tail remains, $tweak is advanced once here.
# NOTE(review): the flag-setting instruction for the jz below is not visible
# in this listing.
1657 &set_label("xts_dec_done",16);
1658 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1659 &pxor ($twtmp,$twtmp);
1661 &jz (&label("xts_dec_ret"));
1663 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1664 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1665 &pshufd ($twres,$twtmp,0x13);
1666 &pxor ($twtmp,$twtmp);
1667 &movdqa ($twmask,&QWP(16*6,"esp"));
1668 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1669 &pand ($twres,$twmask); # isolate carry and residue
1670 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1671 &pxor ($tweak,$twres);
# Tail handling uses two tweaks in swapped order: the held-back whole block is
# decrypted with the following tweak (built in $inout3), while the current
# tweak is kept in $inout4 for the final block assembled by the steal loop.
1673 &set_label("xts_dec_only_one_more");
1674 &pshufd ($inout3,$twtmp,0x13);
1675 &movdqa ($inout4,$tweak); # put aside previous tweak
1676 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1677 &pand ($inout3,$twmask); # isolate carry and residue
1678 &pxor ($inout3,$tweak);
1680 &mov ($key,$key_); # restore $key
1681 &mov ($rounds,$rounds_); # restore $rounds
1683 &movups ($inout0,&QWP(0,$inp)); # load input
1684 &xorps ($inout0,$inout3); # input^=tweak
# NOTE(review): inline and call forms appear back to back; the selecting
# if/else lines are not visible in this listing.
1686 { &aesni_inline_generate1("dec"); }
1688 { &call ("_aesni_decrypt1"); }
1689 &xorps ($inout0,$inout3); # output^=tweak
1690 &movups (&QWP(0,$out),$inout0); # write output
# Byte-wise swap loop: each pass copies one tail input byte (at 16($inp)) into
# the block just written at 0($out) and moves the byte it displaces to
# 16($out). $rounds and $key are used as byte scratch registers.
# NOTE(review): the loop-counter update that sets the zero flag for the jnz
# is not visible in this listing.
1692 &set_label("xts_dec_steal");
1693 &movz ($rounds,&BP(16,$inp));
1694 &movz ($key,&BP(0,$out));
1695 &lea ($inp,&DWP(1,$inp));
1696 &mov (&BP(0,$out),&LB($rounds));
1697 &mov (&BP(16,$out),&LB($key));
1698 &lea ($out,&DWP(1,$out));
1700 &jnz (&label("xts_dec_steal"));
# Decrypt the patched block at 0($out) in place with the tweak kept in
# $inout4, after rewinding $out by the saved tail length.
1702 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1703 &mov ($key,$key_); # restore $key
1704 &mov ($rounds,$rounds_); # restore $rounds
1706 &movups ($inout0,&QWP(0,$out)); # load input
1707 &xorps ($inout0,$inout4); # input^=tweak
1709 { &aesni_inline_generate1("dec"); }
1711 { &call ("_aesni_decrypt1"); }
1712 &xorps ($inout0,$inout4); # output^=tweak
1713 &movups (&QWP(0,$out),$inout0); # write output
# Common exit: reload the stack pointer saved at 16*7+4 in the frame.
1715 &set_label("xts_dec_ret");
1716 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1717 &function_end("aesni_xts_decrypt");
1721 ######################################################################
1722 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1723 # size_t length, const AES_KEY *key,
1724 # unsigned char *ivp,const int enc);
# ${PREFIX}_cbc_encrypt: emits the CBC routine. Arguments are read with
# wparam(0..5): inp, out, length, key, ivp and the enc flag; a zero enc flag
# selects the decryption path. The final chaining value is written back
# through ivp on exit.
1725 &function_begin("${PREFIX}_cbc_encrypt");
# Argument loads are interleaved with computing a 16-byte-aligned stack
# address in $rounds_. NOTE(review): any adjustment of $rounds_ between the
# copy from %esp and the alignment is not visible in this listing.
1726 &mov ($inp,&wparam(0));
1727 &mov ($rounds_,"esp");
1728 &mov ($out,&wparam(1));
1730 &mov ($len,&wparam(2));
1731 &and ($rounds_,-16);
1732 &mov ($key,&wparam(3));
1733 &mov ($key_,&wparam(4));
# NOTE(review): the test that sets the zero flag for this early exit is not
# visible in this listing.
1735 &jz (&label("cbc_abort"));
# The compare on the enc flag is consumed by the je further down; the
# mov/xchg/movups instructions in between do not modify the flags.
1737 &cmp (&wparam(5),0);
1738 &xchg ($rounds_,"esp"); # alloca
1739 &movups ($ivec,&QWP(0,$key_)); # load IV
1740 &mov ($rounds,&DWP(240,$key));
1741 &mov ($key_,$key); # backup $key
1742 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1743 &mov ($rounds_,$rounds); # backup $rounds
1744 &je (&label("cbc_decrypt"));
# Encryption path: $inout0 carries the chaining value.
1746 &movaps ($inout0,$ivec);
# NOTE(review): the compare that sets the flags for this branch is not
# visible in this listing.
1748 &jb (&label("cbc_enc_tail"));
1750 &jmp (&label("cbc_enc_loop"));
# One block per iteration: load the input block into $ivec, xor it into the
# chaining value and encrypt, then store the result (which stays in $inout0
# as the next chaining value).
1752 &set_label("cbc_enc_loop",16);
1753 &movups ($ivec,&QWP(0,$inp)); # input actually
1754 &lea ($inp,&DWP(16,$inp));
# NOTE(review): an inline form and a call form appear back to back; presumably
# they are the two arms of a conditional whose if/else lines are not visible
# in this listing.
1756 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1758 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1759 &mov ($rounds,$rounds_); # restore $rounds
1760 &mov ($key,$key_); # restore $key
1761 &movups (&QWP(0,$out),$inout0); # store output
1762 &lea ($out,&DWP(16,$out));
# NOTE(review): the length updates that set the flags for the two branches
# below are not visible in this listing.
1764 &jnc (&label("cbc_enc_loop"));
1766 &jnz (&label("cbc_enc_tail"));
1767 &movaps ($ivec,$inout0);
1768 &jmp (&label("cbc_ret"));
# Partial final block: copy the remaining input bytes to the output buffer,
# zero-fill after them, then run the loop once more over that block in place.
# NOTE(review): the instruction that reduces %ecx between the load of 16 and
# the zero fill is not visible in this listing.
1770 &set_label("cbc_enc_tail");
1771 &mov ("ecx",$len); # zaps $rounds
1772 &data_word(0xA4F3F689); # rep movsb
1773 &mov ("ecx",16); # zero tail
1775 &xor ("eax","eax"); # zaps $len
1776 &data_word(0xAAF3F689); # rep stosb
1777 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1778 &mov ($rounds,$rounds_); # restore $rounds
1779 &mov ($inp,$out); # $inp and $out are the same
1780 &mov ($key,$key_); # restore $key
1781 &jmp (&label("cbc_enc_loop"));
1782 ######################################################################
# Decryption path.
# NOTE(review): the compare that sets the flags for the jbe below is not
# visible in this listing.
1783 &set_label("cbc_decrypt",16);
1785 &jbe (&label("cbc_dec_tail"));
1786 &movaps (&QWP(0,"esp"),$ivec); # save IV
1788 &jmp (&label("cbc_dec_loop6_enter"));
# Six blocks per iteration. The sixth result of the previous iteration is
# written at the top of the next one, after the new IV has been saved.
1790 &set_label("cbc_dec_loop6",16);
1791 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1792 &movups (&QWP(0,$out),$inout5);
1793 &lea ($out,&DWP(0x10,$out));
1794 &set_label("cbc_dec_loop6_enter");
1795 &movdqu ($inout0,&QWP(0,$inp));
1796 &movdqu ($inout1,&QWP(0x10,$inp));
1797 &movdqu ($inout2,&QWP(0x20,$inp));
1798 &movdqu ($inout3,&QWP(0x30,$inp));
1799 &movdqu ($inout4,&QWP(0x40,$inp));
1800 &movdqu ($inout5,&QWP(0x50,$inp));
1802 &call ("_aesni_decrypt6");
# Each decrypted block is xored with the preceding ciphertext block, re-read
# from the input; the first uses the IV saved on the stack. The last input
# block of the group is kept in $rndkey0 as the next IV.
1804 &movups ($rndkey1,&QWP(0,$inp));
1805 &movups ($rndkey0,&QWP(0x10,$inp));
1806 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1807 &xorps ($inout1,$rndkey1);
1808 &movups ($rndkey1,&QWP(0x20,$inp));
1809 &xorps ($inout2,$rndkey0);
1810 &movups ($rndkey0,&QWP(0x30,$inp));
1811 &xorps ($inout3,$rndkey1);
1812 &movups ($rndkey1,&QWP(0x40,$inp));
1813 &xorps ($inout4,$rndkey0);
1814 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1815 &xorps ($inout5,$rndkey1);
1816 &movups (&QWP(0,$out),$inout0);
1817 &movups (&QWP(0x10,$out),$inout1);
1818 &lea ($inp,&DWP(0x60,$inp));
1819 &movups (&QWP(0x20,$out),$inout2);
1820 &mov ($rounds,$rounds_); # restore $rounds
1821 &movups (&QWP(0x30,$out),$inout3);
1822 &mov ($key,$key_); # restore $key
1823 &movups (&QWP(0x40,$out),$inout4);
1824 &lea ($out,&DWP(0x50,$out));
# NOTE(review): the length update that sets the flags for the ja below is not
# visible in this listing.
1826 &ja (&label("cbc_dec_loop6"));
# Loop exit: the pending sixth result moves to $inout0 and the next IV to
# $ivec. NOTE(review): the flag-setting instruction for the jle below is not
# visible in this listing.
1828 &movaps ($inout0,$inout5);
1829 &movaps ($ivec,$rndkey0);
1831 &jle (&label("cbc_dec_tail_collected"));
1832 &movups (&QWP(0,$out),$inout0);
1833 &lea ($out,&DWP(0x10,$out));
# Up to five blocks remain. Blocks are loaded one at a time, branching to the
# handler that matches the count; copies of the first two ciphertext blocks
# are kept in $in0 and $in1 for chaining.
# NOTE(review): the compares that set the flags for the jbe branches below
# are not visible in this listing.
1834 &set_label("cbc_dec_tail");
1835 &movups ($inout0,&QWP(0,$inp));
1836 &movaps ($in0,$inout0);
1838 &jbe (&label("cbc_dec_one"));
1840 &movups ($inout1,&QWP(0x10,$inp));
1841 &movaps ($in1,$inout1);
1843 &jbe (&label("cbc_dec_two"));
1845 &movups ($inout2,&QWP(0x20,$inp));
1847 &jbe (&label("cbc_dec_three"));
1849 &movups ($inout3,&QWP(0x30,$inp));
1851 &jbe (&label("cbc_dec_four"));
# Five blocks: the six-lane routine is used with the sixth lane zeroed; the
# last result stays in $inout0 for cbc_dec_tail_collected.
1853 &movups ($inout4,&QWP(0x40,$inp));
1854 &movaps (&QWP(0,"esp"),$ivec); # save IV
1855 &movups ($inout0,&QWP(0,$inp));
1856 &xorps ($inout5,$inout5);
1857 &call ("_aesni_decrypt6");
1858 &movups ($rndkey1,&QWP(0,$inp));
1859 &movups ($rndkey0,&QWP(0x10,$inp));
1860 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1861 &xorps ($inout1,$rndkey1);
1862 &movups ($rndkey1,&QWP(0x20,$inp));
1863 &xorps ($inout2,$rndkey0);
1864 &movups ($rndkey0,&QWP(0x30,$inp));
1865 &xorps ($inout3,$rndkey1);
1866 &movups ($ivec,&QWP(0x40,$inp)); # IV
1867 &xorps ($inout4,$rndkey0);
1868 &movups (&QWP(0,$out),$inout0);
1869 &movups (&QWP(0x10,$out),$inout1);
1870 &movups (&QWP(0x20,$out),$inout2);
1871 &movups (&QWP(0x30,$out),$inout3);
1872 &lea ($out,&DWP(0x40,$out));
1873 &movaps ($inout0,$inout4);
1875 &jmp (&label("cbc_dec_tail_collected"));
# One block: decrypt, xor with the IV; the ciphertext copy becomes the new IV.
1877 &set_label("cbc_dec_one",16);
# NOTE(review): inline and call forms appear back to back; the selecting
# if/else lines are not visible in this listing.
1879 { &aesni_inline_generate1("dec"); }
1881 { &call ("_aesni_decrypt1"); }
1882 &xorps ($inout0,$ivec);
1883 &movaps ($ivec,$in0);
1885 &jmp (&label("cbc_dec_tail_collected"));
# Two blocks: the three-lane routine is used with the third lane zeroed.
1887 &set_label("cbc_dec_two",16);
1888 &xorps ($inout2,$inout2);
1889 &call ("_aesni_decrypt3");
1890 &xorps ($inout0,$ivec);
1891 &xorps ($inout1,$in0);
1892 &movups (&QWP(0,$out),$inout0);
1893 &movaps ($inout0,$inout1);
1894 &lea ($out,&DWP(0x10,$out));
1895 &movaps ($ivec,$in1);
1897 &jmp (&label("cbc_dec_tail_collected"));
# Three blocks: the new IV is re-read from the third input block.
1899 &set_label("cbc_dec_three",16);
1900 &call ("_aesni_decrypt3");
1901 &xorps ($inout0,$ivec);
1902 &xorps ($inout1,$in0);
1903 &xorps ($inout2,$in1);
1904 &movups (&QWP(0,$out),$inout0);
1905 &movaps ($inout0,$inout2);
1906 &movups (&QWP(0x10,$out),$inout1);
1907 &lea ($out,&DWP(0x20,$out));
1908 &movups ($ivec,&QWP(0x20,$inp));
1910 &jmp (&label("cbc_dec_tail_collected"));
# Four blocks: chaining values for blocks 2 and 3 are re-read from the input;
# the new IV is the fourth input block. Falls through to the collector.
1912 &set_label("cbc_dec_four",16);
1913 &call ("_aesni_decrypt4");
1914 &movups ($rndkey1,&QWP(0x10,$inp));
1915 &movups ($rndkey0,&QWP(0x20,$inp));
1916 &xorps ($inout0,$ivec);
1917 &movups ($ivec,&QWP(0x30,$inp));
1918 &xorps ($inout1,$in0);
1919 &movups (&QWP(0,$out),$inout0);
1920 &xorps ($inout2,$rndkey1);
1921 &movups (&QWP(0x10,$out),$inout1);
1922 &xorps ($inout3,$rndkey0);
1923 &movups (&QWP(0x20,$out),$inout2);
1924 &lea ($out,&DWP(0x30,$out));
1925 &movaps ($inout0,$inout3);
# All tail handlers arrive here with the last decrypted block in $inout0 and
# the next IV in $ivec. A whole last block is stored directly; otherwise the
# partial path is taken.
# NOTE(review): the test that sets the flags for the jnz is not visible in
# this listing.
1928 &set_label("cbc_dec_tail_collected");
1930 &jnz (&label("cbc_dec_tail_partial"));
1931 &movups (&QWP(0,$out),$inout0);
1932 &jmp (&label("cbc_ret"));
# Partial last block: spill it to the stack and copy bytes out with rep movsb.
# NOTE(review): the instructions that set up the count and source for the
# copy are not visible in this listing.
1934 &set_label("cbc_dec_tail_partial",16);
1935 &movaps (&QWP(0,"esp"),$inout0);
1939 &data_word(0xA4F3F689); # rep movsb
# Common exit: restore the caller's stack pointer and write the chaining value
# back through ivp. cbc_abort skips both.
1941 &set_label("cbc_ret");
1942 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1943 &mov ($key_,&wparam(4));
1944 &movups (&QWP(0,$key_),$ivec); # output IV
1945 &set_label("cbc_abort");
1946 &function_end("${PREFIX}_cbc_encrypt");
1948 ######################################################################
1949 # Mechanical port from aesni-x86_64.pl.
1951 # _aesni_set_encrypt_key is private interface,
1953 # "eax" const unsigned char *userKey
1960 &function_begin_B("_aesni_set_encrypt_key");
1961 &test ("eax","eax");
1962 &jz (&label("bad_pointer"));
1964 &jz (&label("bad_pointer"));
1966 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1967 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1968 &lea ($key,&DWP(16,$key));
1970 &je (&label("14rounds"));
1972 &je (&label("12rounds"));
1974 &jne (&label("bad_keybits"));
1976 &set_label("10rounds",16);
1978 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1979 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1980 &call (&label("key_128_cold"));
1981 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
1982 &call (&label("key_128"));
1983 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1984 &call (&label("key_128"));
1985 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1986 &call (&label("key_128"));
1987 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1988 &call (&label("key_128"));
1989 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1990 &call (&label("key_128"));
1991 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1992 &call (&label("key_128"));
1993 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1994 &call (&label("key_128"));
1995 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1996 &call (&label("key_128"));
1997 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1998 &call (&label("key_128"));
1999 &$movekey (&QWP(0,$key),"xmm0");
2000 &mov (&DWP(80,$key),$rounds);
2004 &set_label("key_128",16);
2005 &$movekey (&QWP(0,$key),"xmm0");
2006 &lea ($key,&DWP(16,$key));
2007 &set_label("key_128_cold");
2008 &shufps ("xmm4","xmm0",0b00010000);
2009 &xorps ("xmm0","xmm4");
2010 &shufps ("xmm4","xmm0",0b10001100);
2011 &xorps ("xmm0","xmm4");
2012 &shufps ("xmm1","xmm1",0b11111111); # critical path
2013 &xorps ("xmm0","xmm1");
2016 &set_label("12rounds",16);
2017 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2019 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
2020 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2021 &call (&label("key_192a_cold"));
2022 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2023 &call (&label("key_192b"));
2024 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2025 &call (&label("key_192a"));
2026 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2027 &call (&label("key_192b"));
2028 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2029 &call (&label("key_192a"));
2030 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2031 &call (&label("key_192b"));
2032 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2033 &call (&label("key_192a"));
2034 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2035 &call (&label("key_192b"));
2036 &$movekey (&QWP(0,$key),"xmm0");
2037 &mov (&DWP(48,$key),$rounds);
2041 &set_label("key_192a",16);
2042 &$movekey (&QWP(0,$key),"xmm0");
2043 &lea ($key,&DWP(16,$key));
2044 &set_label("key_192a_cold",16);
2045 &movaps ("xmm5","xmm2");
2046 &set_label("key_192b_warm");
2047 &shufps ("xmm4","xmm0",0b00010000);
2048 &movdqa ("xmm3","xmm2");
2049 &xorps ("xmm0","xmm4");
2050 &shufps ("xmm4","xmm0",0b10001100);
2052 &xorps ("xmm0","xmm4");
2053 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2054 &pxor ("xmm2","xmm3");
2055 &pxor ("xmm0","xmm1");
2056 &pshufd ("xmm3","xmm0",0b11111111);
2057 &pxor ("xmm2","xmm3");
2060 &set_label("key_192b",16);
2061 &movaps ("xmm3","xmm0");
2062 &shufps ("xmm5","xmm0",0b01000100);
2063 &$movekey (&QWP(0,$key),"xmm5");
2064 &shufps ("xmm3","xmm2",0b01001110);
2065 &$movekey (&QWP(16,$key),"xmm3");
2066 &lea ($key,&DWP(32,$key));
2067 &jmp (&label("key_192b_warm"));
2069 &set_label("14rounds",16);
2070 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
2072 &lea ($key,&DWP(16,$key));
2073 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2074 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
2075 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2076 &call (&label("key_256a_cold"));
2077 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2078 &call (&label("key_256b"));
2079 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2080 &call (&label("key_256a"));
2081 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2082 &call (&label("key_256b"));
2083 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2084 &call (&label("key_256a"));
2085 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2086 &call (&label("key_256b"));
2087 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2088 &call (&label("key_256a"));
2089 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2090 &call (&label("key_256b"));
2091 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2092 &call (&label("key_256a"));
2093 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2094 &call (&label("key_256b"));
2095 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2096 &call (&label("key_256a"));
2097 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2098 &call (&label("key_256b"));
2099 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2100 &call (&label("key_256a"));
2101 &$movekey (&QWP(0,$key),"xmm0");
2102 &mov (&DWP(16,$key),$rounds);
2106 &set_label("key_256a",16);
2107 &$movekey (&QWP(0,$key),"xmm2");
2108 &lea ($key,&DWP(16,$key));
2109 &set_label("key_256a_cold");
2110 &shufps ("xmm4","xmm0",0b00010000);
2111 &xorps ("xmm0","xmm4");
2112 &shufps ("xmm4","xmm0",0b10001100);
2113 &xorps ("xmm0","xmm4");
2114 &shufps ("xmm1","xmm1",0b11111111); # critical path
2115 &xorps ("xmm0","xmm1");
2118 &set_label("key_256b",16);
2119 &$movekey (&QWP(0,$key),"xmm0");
2120 &lea ($key,&DWP(16,$key));
2122 &shufps ("xmm4","xmm2",0b00010000);
2123 &xorps ("xmm2","xmm4");
2124 &shufps ("xmm4","xmm2",0b10001100);
2125 &xorps ("xmm2","xmm4");
2126 &shufps ("xmm1","xmm1",0b10101010); # critical path
2127 &xorps ("xmm2","xmm1");
2130 &set_label("bad_pointer",4);
2133 &set_label("bad_keybits",4);
2136 &function_end_B("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point for encryption key setup.  It loads the three stack
# arguments into the registers the internal _aesni_set_encrypt_key
# routine works on (userKey -> eax, bits -> $rounds, key -> $key), calls
# that routine and returns with eax exactly as the routine left it.
&function_begin_B("${PREFIX}_set_encrypt_key");
	&mov	("eax",&wparam(0));		# userKey
	&mov	($rounds,&wparam(1));		# bits
	&mov	($key,&wparam(2));		# key schedule to be filled in
	&call	("_aesni_set_encrypt_key");
	# The bare (_B) begin/end helpers emit no prologue or epilogue, so
	# the return has to be spelled out; without it execution would run
	# straight into whatever code follows this function.
	&ret	();
&function_end_B("${PREFIX}_set_encrypt_key");
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
#                              AES_KEY *key)
#
# Public entry point for decryption key setup.  It first builds the
# encryption schedule with _aesni_set_encrypt_key and, if that returned
# zero in eax, converts the schedule in place: the round keys are put in
# reverse order and every round key except the first and the last is
# passed through aesimc.  A non-zero eax from the internal routine is
# returned to the caller unchanged and the schedule is not touched.
&function_begin_B("${PREFIX}_set_decrypt_key");
	&mov	("eax",&wparam(0));		# userKey
	&mov	($rounds,&wparam(1));		# bits
	&mov	($key,&wparam(2));		# key schedule to be filled in
	&call	("_aesni_set_encrypt_key");
	&mov	($key,&wparam(2));		# reload, the call advanced $key
	&shl	($rounds,4);	# rounds-1 after _aesni_set_encrypt_key
	&test	("eax","eax");
	&jnz	(&label("dec_key_ret"));	# key setup failed, pass eax on
	&lea	("eax",&DWP(16,$key,$rounds));	# end of key schedule

	&$movekey	("xmm0",&QWP(0,$key));	# just swap
	&$movekey	("xmm1",&QWP(0,"eax"));
	&$movekey	(&QWP(0,"eax"),"xmm0");
	&$movekey	(&QWP(0,$key),"xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));

	# $key walks up from the front of the schedule while eax walks down
	# from the back; each pass inverts one pair and stores it swapped.
&set_label("dec_key_inverse");
	&$movekey	("xmm0",&QWP(0,$key));	# swap and inverse
	&$movekey	("xmm1",&QWP(0,"eax"));
	&aesimc	("xmm0","xmm0");
	&aesimc	("xmm1","xmm1");
	&lea	($key,&DWP(16,$key));
	&lea	("eax",&DWP(-16,"eax"));
	&$movekey	(&QWP(16,"eax"),"xmm0");
	&$movekey	(&QWP(-16,$key),"xmm1");
	# The loop condition needs its own comparison: lea and the key moves
	# leave the flags alone, so without it ja would test the stale result
	# of "test eax,eax" above and never loop.
	&cmp	("eax",$key);
	&ja	(&label("dec_key_inverse"));	# until the pointers meet

	&$movekey	("xmm0",&QWP(0,$key));	# inverse middle
	&aesimc	("xmm0","xmm0");
	&$movekey	(&QWP(0,$key),"xmm0");

	&xor	("eax","eax");		# return success
&set_label("dec_key_ret");
	# The bare (_B) begin/end helpers emit no epilogue, so both the
	# success and the error path need this explicit return.
	&ret	();
&function_end_B("${PREFIX}_set_decrypt_key");
2188 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");