3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for Intel AES-NI extension. In
11 # OpenSSL context it's used with Intel engine, but can also be used as
12 # drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
17 # To start with see corresponding paragraph in aesni-x86_64.pl...
18 # Instead of filling table similar to one found there I've chosen to
19 # summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
20 # The simplified table below represents 32-bit performance relative
21 # to 64-bit one in every given point. Ratios vary for different
22 # encryption modes, therefore interval values.
24 # 16-byte 64-byte 256-byte 1-KB 8-KB
25 # 53-67% 67-84% 91-94% 95-98% 97-99.5%
27 # Lower ratios for smaller block sizes are perfectly understandable,
28 # because function call overhead is higher in 32-bit mode. Largest
29 # 8-KB block performance is virtually same: 32-bit code is less than
30 # 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
34 # See aesni-x86_64.pl for details. Unlike x86_64 version this module
35 # interleaves at most 6 aes[enc|dec] instructions, because there are
36 # not enough registers for 8x interleave [which should be optimal for
37 # Sandy Bridge]. Actually, performance results for 6x interleave
38 # factor presented in aesni-x86_64.pl (except for CTR) are for this
43 # Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
44 # one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
# Generator configuration: symbol prefix, inlining switch, perlasm search
# path, the key-load instruction alias and register names used below.
46 $PREFIX="aesni"; # if $PREFIX is set to "AES", the script
47 # generates drop-in replacement for
48 # crypto/aes/asm/aes-586.pl:-)
49 $inline=1; # inline _aesni_[en|de]crypt
# Capture the directory part of the script path ($0) so that perlasm modules
# can be found relative to this script.
51 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
52 push(@INC,"${dir}","${dir}../../perlasm");
55 &asm_init($ARGV[0],$0);
# Both branches select movups, so $movekey is movups whatever $PREFIX is.
57 if ($PREFIX eq "aesni") { $movekey=*movups; }
58 else { $movekey=*movups; }
65 $rounds_="ebx"; # backup copy for $rounds
66 $key_="ebp"; # backup copy for $key
# xmm5, xmm6 and xmm7 each carry two names; a routine uses one role or the
# other for a given register.
73 $inout3="xmm5"; $in1="xmm5";
74 $inout4="xmm6"; $in0="xmm6";
75 $inout5="xmm7"; $ivec="xmm7";
# Raw-byte encoder body taking ($dst,$src,$imm): when both operands are
# xmm0-xmm7 it emits the bytes 66 0F 3A DF, a register-to-register ModR/M
# byte (0xC0 | dst<<3 | src) and the immediate $imm.  Any other operand
# combination emits nothing in the lines visible here.
# NOTE(review): the `sub` header and closing brace are not visible; the
# opcode presumably encodes aeskeygenassist - confirm.
79 { my($dst,$src,$imm)=@_;
80 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
81 { &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
# Raw-byte encoder body taking ($opcodelet,$dst,$src): when both operands
# are xmm0-xmm7 it emits 66 0F 38 <opcodelet> followed by a
# register-to-register ModR/M byte (0xC0 | dst<<3 | src).
# NOTE(review): the `sub` header is not visible; presumably this is the
# aescommon helper called by the aes* wrappers - confirm.
84 { my($opcodelet,$dst,$src)=@_;
85 if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
86 { &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
# One-line wrappers: each binds a fixed opcode byte and forwards its
# ($dst,$src) arguments to aescommon.
88 sub aesimc { aescommon(0xdb,@_); }
89 sub aesenc { aescommon(0xdc,@_); }
90 sub aesenclast { aescommon(0xdd,@_); }
91 sub aesdec { aescommon(0xde,@_); }
92 sub aesdeclast { aescommon(0xdf,@_); }
94 # Inline version of internal aesni_[en|de]crypt1
# Arguments: $p      - "enc" or "dec", spliced into the aes${p}/aes${p}last calls;
#            $inout  - register holding the block (defaults to $inout0);
#            $ivec   - optional register xored into the block before round 1.
# The emitted code advances $key as it walks the key schedule.
96 sub aesni_inline_generate1
97 { my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
100 &$movekey ($rndkey0,&QWP(0,$key));
101 &$movekey ($rndkey1,&QWP(16,$key));
# With $ivec supplied, round key 0 is folded into $ivec first (modifying
# it), so $inout ends up as inout ^ ivec ^ rndkey0; otherwise only round
# key 0 is applied.
102 &xorps ($ivec,$rndkey0) if (defined($ivec));
103 &lea ($key,&DWP(32,$key));
104 &xorps ($inout,$ivec) if (defined($ivec));
105 &xorps ($inout,$rndkey0) if (!defined($ivec));
# $sn makes the loop label unique for each inlined copy.
# NOTE(review): its definition/increment is not visible here - confirm.
106 &set_label("${p}1_loop_$sn");
107 eval"&aes${p} ($inout,$rndkey1)";
109 &$movekey ($rndkey1,&QWP(0,$key));
110 &lea ($key,&DWP(16,$key));
# NOTE(review): the instruction that sets the flags tested by this jnz is
# not visible here (lea does not modify flags) - confirm.
111 &jnz (&label("${p}1_loop_$sn"));
112 eval"&aes${p}last ($inout,$rndkey1)";
# Emits the out-of-line function _aesni_${p}rypt1 with all rounds unrolled.
# Arguments: $p     - "enc" or "dec";
#            $inout - register holding the block (defaults to $inout0).
# Round keys alternate between $rndkey0 and $rndkey1 so the next key is
# loaded while the current round executes.
115 sub aesni_generate1 # fully unrolled loop
116 { my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
118 &function_begin_B("_aesni_${p}rypt1");
# This first key load uses movups directly rather than $movekey; both are
# movups with the current setup.
119 &movups ($rndkey0,&QWP(0,$key));
120 &$movekey ($rndkey1,&QWP(0x10,$key));
121 &xorps ($inout,$rndkey0);
122 &$movekey ($rndkey0,&QWP(0x20,$key));
123 &lea ($key,&DWP(0x30,$key));
# NOTE(review): the comparison feeding jb/je is not visible here; judging by
# the label names it selects the entry point by key size - confirm.
125 &jb (&label("${p}128"));
126 &lea ($key,&DWP(0x20,$key));
127 &je (&label("${p}192"));
128 &lea ($key,&DWP(0x20,$key));
# Fall-through path: two extra rounds before joining the ${p}192 entry.
129 eval"&aes${p} ($inout,$rndkey1)";
130 &$movekey ($rndkey1,&QWP(-0x40,$key));
131 eval"&aes${p} ($inout,$rndkey0)";
132 &$movekey ($rndkey0,&QWP(-0x30,$key));
# ${p}192 entry: two more rounds before joining the ${p}128 entry.
133 &set_label("${p}192");
134 eval"&aes${p} ($inout,$rndkey1)";
135 &$movekey ($rndkey1,&QWP(-0x20,$key));
136 eval"&aes${p} ($inout,$rndkey0)";
137 &$movekey ($rndkey0,&QWP(-0x10,$key));
# ${p}128 entry: the common tail of nine aes${p} rounds plus aes${p}last.
138 &set_label("${p}128");
139 eval"&aes${p} ($inout,$rndkey1)";
140 &$movekey ($rndkey1,&QWP(0,$key));
141 eval"&aes${p} ($inout,$rndkey0)";
142 &$movekey ($rndkey0,&QWP(0x10,$key));
143 eval"&aes${p} ($inout,$rndkey1)";
144 &$movekey ($rndkey1,&QWP(0x20,$key));
145 eval"&aes${p} ($inout,$rndkey0)";
146 &$movekey ($rndkey0,&QWP(0x30,$key));
147 eval"&aes${p} ($inout,$rndkey1)";
148 &$movekey ($rndkey1,&QWP(0x40,$key));
149 eval"&aes${p} ($inout,$rndkey0)";
150 &$movekey ($rndkey0,&QWP(0x50,$key));
151 eval"&aes${p} ($inout,$rndkey1)";
152 &$movekey ($rndkey1,&QWP(0x60,$key));
153 eval"&aes${p} ($inout,$rndkey0)";
154 &$movekey ($rndkey0,&QWP(0x70,$key));
155 eval"&aes${p} ($inout,$rndkey1)";
156 eval"&aes${p}last ($inout,$rndkey0)";
158 &function_end_B("_aesni_${p}rypt1");
161 # void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block encrypt: loads 16 bytes from inp, encrypts them with the
# schedule at key (round count read from offset 240) and stores to out.
# The out-of-line _aesni_encrypt1 is only generated when $inline is off.
162 &aesni_generate1("enc") if (!$inline);
163 &function_begin_B("${PREFIX}_encrypt");
164 &mov ("eax",&wparam(0));
165 &mov ($key,&wparam(2));
166 &movups ($inout0,&QWP(0,"eax"));
167 &mov ($rounds,&DWP(240,$key));
# eax is reused: first the input pointer, from here on the output pointer.
168 &mov ("eax",&wparam(1));
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
170 { &aesni_inline_generate1("enc"); }
172 { &call ("_aesni_encrypt1"); }
173 &movups (&QWP(0,"eax"),$inout0);
175 &function_end_B("${PREFIX}_encrypt");
177 # void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
# Single-block decrypt: loads 16 bytes from inp, decrypts them with the
# schedule at key (round count read from offset 240) and stores to out.
# The out-of-line _aesni_decrypt1 is only generated when $inline is off.
178 &aesni_generate1("dec") if(!$inline);
179 &function_begin_B("${PREFIX}_decrypt");
180 &mov ("eax",&wparam(0));
181 &mov ($key,&wparam(2));
182 &movups ($inout0,&QWP(0,"eax"));
183 &mov ($rounds,&DWP(240,$key));
# eax is reused: first the input pointer, from here on the output pointer.
184 &mov ("eax",&wparam(1));
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
186 { &aesni_inline_generate1("dec"); }
188 { &call ("_aesni_decrypt1"); }
189 &movups (&QWP(0,"eax"),$inout0);
191 &function_end_B("${PREFIX}_decrypt");
193 # _aesni_[en|de]cryptN are private interfaces, N denotes interleave
194 # factor. Why were 3x subroutines originally used in loops? Even though
195 # aes[enc|dec] latency was originally 6, it could be scheduled only
196 # every *2nd* cycle. Thus 3x interleave was the one providing optimal
197 # utilization, i.e. when subroutine's throughput is virtually same as
198 # of non-interleaved subroutine [for number of input blocks up to 3].
199 # This is why it makes no sense to implement 2x subroutine.
200 # aes[enc|dec] latency in next processor generation is 8, but the
201 # instructions can be scheduled every cycle. Optimal interleave for
202 # new processor is therefore 8x, but it's unfeasible to accommodate it
203 # in XMM registers addressable in 32-bit mode and therefore 6x is
# Emits _aesni_${p}rypt3: processes $inout0..$inout2 in parallel with the
# key schedule at $key.  $p ("enc"/"dec") is spliced into the aes${p} calls.
# NOTE(review): the enclosing `sub` header is not visible here.
209 &function_begin_B("_aesni_${p}rypt3");
210 &$movekey ($rndkey0,&QWP(0,$key));
212 &$movekey ($rndkey1,&QWP(16,$key));
213 &lea ($key,&DWP(32,$key));
# Round-0 whitening of all three blocks.
214 &xorps ($inout0,$rndkey0);
215 &pxor ($inout1,$rndkey0);
216 &pxor ($inout2,$rndkey0);
217 &$movekey ($rndkey0,&QWP(0,$key));
# Each iteration applies two rounds: one with $rndkey1, one with $rndkey0,
# reloading each key register right after its last use and advancing $key
# by 32 bytes.
219 &set_label("${p}3_loop");
220 eval"&aes${p} ($inout0,$rndkey1)";
221 eval"&aes${p} ($inout1,$rndkey1)";
223 eval"&aes${p} ($inout2,$rndkey1)";
224 &$movekey ($rndkey1,&QWP(16,$key));
225 eval"&aes${p} ($inout0,$rndkey0)";
226 eval"&aes${p} ($inout1,$rndkey0)";
227 &lea ($key,&DWP(32,$key));
228 eval"&aes${p} ($inout2,$rndkey0)";
229 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the flag-setting instruction for this jnz is not visible here.
230 &jnz (&label("${p}3_loop"));
# Final pair: one regular round with $rndkey1, then the last round with $rndkey0.
231 eval"&aes${p} ($inout0,$rndkey1)";
232 eval"&aes${p} ($inout1,$rndkey1)";
233 eval"&aes${p} ($inout2,$rndkey1)";
234 eval"&aes${p}last ($inout0,$rndkey0)";
235 eval"&aes${p}last ($inout1,$rndkey0)";
236 eval"&aes${p}last ($inout2,$rndkey0)";
238 &function_end_B("_aesni_${p}rypt3");
241 # 4x interleave is implemented to improve small block performance,
242 # most notably [and naturally] 4 block by ~30%. One can argue that one
243 # should have implemented 5x as well, but improvement would be <20%,
244 # so it's not worth it...
# Emits _aesni_${p}rypt4: same structure as the 3x routine, but processes
# $inout0..$inout3 in parallel.
# NOTE(review): the enclosing `sub` header is not visible here.
248 &function_begin_B("_aesni_${p}rypt4");
249 &$movekey ($rndkey0,&QWP(0,$key));
250 &$movekey ($rndkey1,&QWP(16,$key));
252 &lea ($key,&DWP(32,$key));
# Round-0 whitening of all four blocks.
253 &xorps ($inout0,$rndkey0);
254 &pxor ($inout1,$rndkey0);
255 &pxor ($inout2,$rndkey0);
256 &pxor ($inout3,$rndkey0);
257 &$movekey ($rndkey0,&QWP(0,$key));
# Two rounds per iteration, alternating $rndkey1 and $rndkey0.
259 &set_label("${p}4_loop");
260 eval"&aes${p} ($inout0,$rndkey1)";
261 eval"&aes${p} ($inout1,$rndkey1)";
263 eval"&aes${p} ($inout2,$rndkey1)";
264 eval"&aes${p} ($inout3,$rndkey1)";
265 &$movekey ($rndkey1,&QWP(16,$key));
266 eval"&aes${p} ($inout0,$rndkey0)";
267 eval"&aes${p} ($inout1,$rndkey0)";
268 &lea ($key,&DWP(32,$key));
269 eval"&aes${p} ($inout2,$rndkey0)";
270 eval"&aes${p} ($inout3,$rndkey0)";
271 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the flag-setting instruction for this jnz is not visible here.
272 &jnz (&label("${p}4_loop"));
# Final pair: one regular round with $rndkey1, then the last round with $rndkey0.
274 eval"&aes${p} ($inout0,$rndkey1)";
275 eval"&aes${p} ($inout1,$rndkey1)";
276 eval"&aes${p} ($inout2,$rndkey1)";
277 eval"&aes${p} ($inout3,$rndkey1)";
278 eval"&aes${p}last ($inout0,$rndkey0)";
279 eval"&aes${p}last ($inout1,$rndkey0)";
280 eval"&aes${p}last ($inout2,$rndkey0)";
281 eval"&aes${p}last ($inout3,$rndkey0)";
283 &function_end_B("_aesni_${p}rypt4");
# Emits _aesni_${p}rypt6: processes $inout0..$inout5 in parallel.  It also
# exposes a second entry point, _aesni_${p}rypt6_enter, in the middle of the
# round loop; the ctr32 and xts code inline their own prologue and call
# that label directly.
# NOTE(review): the enclosing `sub` header is not visible here.
289 &function_begin_B("_aesni_${p}rypt6");
290 &static_label("_aesni_${p}rypt6_enter");
291 &$movekey ($rndkey0,&QWP(0,$key));
293 &$movekey ($rndkey1,&QWP(16,$key));
294 &lea ($key,&DWP(32,$key));
# Prologue: round-0 whitening interleaved with the first aes${p} round, so
# each block starts round 1 as soon as its own whitening is done.
295 &xorps ($inout0,$rndkey0);
296 &pxor ($inout1,$rndkey0); # pxor does better here
297 eval"&aes${p} ($inout0,$rndkey1)";
298 &pxor ($inout2,$rndkey0);
299 eval"&aes${p} ($inout1,$rndkey1)";
300 &pxor ($inout3,$rndkey0);
302 eval"&aes${p} ($inout2,$rndkey1)";
303 &pxor ($inout4,$rndkey0);
304 eval"&aes${p} ($inout3,$rndkey1)";
305 &pxor ($inout5,$rndkey0);
306 eval"&aes${p} ($inout4,$rndkey1)";
307 &$movekey ($rndkey0,&QWP(0,$key));
308 eval"&aes${p} ($inout5,$rndkey1)";
# The $rndkey1 round is already done, so skip the first half of the loop.
309 &jmp (&label("_aesni_${p}rypt6_enter"));
# Loop label and entry label are both requested with 16-byte alignment.
311 &set_label("${p}6_loop",16);
312 eval"&aes${p} ($inout0,$rndkey1)";
313 eval"&aes${p} ($inout1,$rndkey1)";
315 eval"&aes${p} ($inout2,$rndkey1)";
316 eval"&aes${p} ($inout3,$rndkey1)";
317 eval"&aes${p} ($inout4,$rndkey1)";
318 eval"&aes${p} ($inout5,$rndkey1)";
# Entry contract: the $rndkey1 round has been applied, $rndkey0 holds the
# next round key and $key points at it.
319 &set_label("_aesni_${p}rypt6_enter",16);
320 &$movekey ($rndkey1,&QWP(16,$key));
321 eval"&aes${p} ($inout0,$rndkey0)";
322 eval"&aes${p} ($inout1,$rndkey0)";
323 &lea ($key,&DWP(32,$key));
324 eval"&aes${p} ($inout2,$rndkey0)";
325 eval"&aes${p} ($inout3,$rndkey0)";
326 eval"&aes${p} ($inout4,$rndkey0)";
327 eval"&aes${p} ($inout5,$rndkey0)";
328 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the flag-setting instruction for this jnz is not visible here.
329 &jnz (&label("${p}6_loop"));
# Final pair: one regular round with $rndkey1, then the last round with $rndkey0.
331 eval"&aes${p} ($inout0,$rndkey1)";
332 eval"&aes${p} ($inout1,$rndkey1)";
333 eval"&aes${p} ($inout2,$rndkey1)";
334 eval"&aes${p} ($inout3,$rndkey1)";
335 eval"&aes${p} ($inout4,$rndkey1)";
336 eval"&aes${p} ($inout5,$rndkey1)";
337 eval"&aes${p}last ($inout0,$rndkey0)";
338 eval"&aes${p}last ($inout1,$rndkey0)";
339 eval"&aes${p}last ($inout2,$rndkey0)";
340 eval"&aes${p}last ($inout3,$rndkey0)";
341 eval"&aes${p}last ($inout4,$rndkey0)";
342 eval"&aes${p}last ($inout5,$rndkey0)";
344 &function_end_B("_aesni_${p}rypt6");
# Instantiate the interleaved helpers.  The decrypt variants are always
# generated; the encrypt variants only when $PREFIX is "aesni".
346 &aesni_generate3("enc") if ($PREFIX eq "aesni");
347 &aesni_generate3("dec");
348 &aesni_generate4("enc") if ($PREFIX eq "aesni");
349 &aesni_generate4("dec");
350 &aesni_generate6("enc") if ($PREFIX eq "aesni");
351 &aesni_generate6("dec");
# Everything from here on is emitted only when $PREFIX is "aesni".
353 if ($PREFIX eq "aesni") {
354 ######################################################################
355 # void aesni_ecb_encrypt (const void *in, void *out,
356 # size_t length, const AES_KEY *key,
# The fifth argument (wparam(4)) is the direction flag: zero branches to
# ecb_decrypt, non-zero falls through to the encrypt path.
358 &function_begin("aesni_ecb_encrypt");
359 &mov ($inp,&wparam(0));
360 &mov ($out,&wparam(1));
361 &mov ($len,&wparam(2));
362 &mov ($key,&wparam(3));
363 &mov ($rounds_,&wparam(4));
# NOTE(review): the instruction setting the flags for this early-exit jz is
# not visible here.
365 &jz (&label("ecb_ret"));
366 &mov ($rounds,&DWP(240,$key));
367 &test ($rounds_,$rounds_);
368 &jz (&label("ecb_decrypt"));
# The 6x helper advances $key and consumes $rounds, so both are kept in
# backup registers and restored after every call.
370 &mov ($key_,$key); # backup $key
371 &mov ($rounds_,$rounds); # backup $rounds
# NOTE(review): the length comparison feeding this jb is not visible here.
373 &jb (&label("ecb_enc_tail"));
# Preload the first six input blocks and enter the loop at the encrypt call.
375 &movdqu ($inout0,&QWP(0,$inp));
376 &movdqu ($inout1,&QWP(0x10,$inp));
377 &movdqu ($inout2,&QWP(0x20,$inp));
378 &movdqu ($inout3,&QWP(0x30,$inp));
379 &movdqu ($inout4,&QWP(0x40,$inp));
380 &movdqu ($inout5,&QWP(0x50,$inp));
381 &lea ($inp,&DWP(0x60,$inp));
383 &jmp (&label("ecb_enc_loop6_enter"));
# Loop body: store the six results of the previous iteration while loading
# the next six inputs into the same registers.
385 &set_label("ecb_enc_loop6",16);
386 &movups (&QWP(0,$out),$inout0);
387 &movdqu ($inout0,&QWP(0,$inp));
388 &movups (&QWP(0x10,$out),$inout1);
389 &movdqu ($inout1,&QWP(0x10,$inp));
390 &movups (&QWP(0x20,$out),$inout2);
391 &movdqu ($inout2,&QWP(0x20,$inp));
392 &movups (&QWP(0x30,$out),$inout3);
393 &movdqu ($inout3,&QWP(0x30,$inp));
394 &movups (&QWP(0x40,$out),$inout4);
395 &movdqu ($inout4,&QWP(0x40,$inp));
396 &movups (&QWP(0x50,$out),$inout5);
397 &lea ($out,&DWP(0x60,$out));
398 &movdqu ($inout5,&QWP(0x50,$inp));
399 &lea ($inp,&DWP(0x60,$inp));
400 &set_label("ecb_enc_loop6_enter");
402 &call ("_aesni_encrypt6");
404 &mov ($key,$key_); # restore $key
405 &mov ($rounds,$rounds_); # restore $rounds
# NOTE(review): the length update feeding this jnc is not visible here.
407 &jnc (&label("ecb_enc_loop6"));
# Store the results of the last full 6-block iteration.
409 &movups (&QWP(0,$out),$inout0);
410 &movups (&QWP(0x10,$out),$inout1);
411 &movups (&QWP(0x20,$out),$inout2);
412 &movups (&QWP(0x30,$out),$inout3);
413 &movups (&QWP(0x40,$out),$inout4);
414 &movups (&QWP(0x50,$out),$inout5);
415 &lea ($out,&DWP(0x60,$out));
# NOTE(review): the remaining-length adjustment feeding this jz is not
# visible here; falls through into the tail when input remains.
417 &jz (&label("ecb_ret"));
# ECB encrypt tail: handles the blocks left after the 6x loop.  Inputs are
# loaded progressively and control is dispatched to the one/two/three/four
# handlers; the fall-through path handles five blocks.
# NOTE(review): the comparisons feeding the jb/je pairs are not visible here.
419 &set_label("ecb_enc_tail");
420 &movups ($inout0,&QWP(0,$inp));
422 &jb (&label("ecb_enc_one"));
423 &movups ($inout1,&QWP(0x10,$inp));
424 &je (&label("ecb_enc_two"));
425 &movups ($inout2,&QWP(0x20,$inp));
427 &jb (&label("ecb_enc_three"));
428 &movups ($inout3,&QWP(0x30,$inp));
429 &je (&label("ecb_enc_four"));
430 &movups ($inout4,&QWP(0x40,$inp));
# Five blocks: run the 6x routine with the unused sixth lane zeroed and
# store only the first five results.
431 &xorps ($inout5,$inout5);
432 &call ("_aesni_encrypt6");
433 &movups (&QWP(0,$out),$inout0);
434 &movups (&QWP(0x10,$out),$inout1);
435 &movups (&QWP(0x20,$out),$inout2);
436 &movups (&QWP(0x30,$out),$inout3);
437 &movups (&QWP(0x40,$out),$inout4);
# Uses the `&` call form like every other instruction emitter in this file.
438 &jmp (&label("ecb_ret"));
# One block.
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
440 &set_label("ecb_enc_one",16);
442 { &aesni_inline_generate1("enc"); }
444 { &call ("_aesni_encrypt1"); }
445 &movups (&QWP(0,$out),$inout0);
446 &jmp (&label("ecb_ret"));
# Two blocks: run the 3x routine with the unused third lane zeroed.
448 &set_label("ecb_enc_two",16);
449 &xorps ($inout2,$inout2);
450 &call ("_aesni_encrypt3");
451 &movups (&QWP(0,$out),$inout0);
452 &movups (&QWP(0x10,$out),$inout1);
453 &jmp (&label("ecb_ret"));
# Three blocks.
455 &set_label("ecb_enc_three",16);
456 &call ("_aesni_encrypt3");
457 &movups (&QWP(0,$out),$inout0);
458 &movups (&QWP(0x10,$out),$inout1);
459 &movups (&QWP(0x20,$out),$inout2);
460 &jmp (&label("ecb_ret"));
# Four blocks.
462 &set_label("ecb_enc_four",16);
463 &call ("_aesni_encrypt4");
464 &movups (&QWP(0,$out),$inout0);
465 &movups (&QWP(0x10,$out),$inout1);
466 &movups (&QWP(0x20,$out),$inout2);
467 &movups (&QWP(0x30,$out),$inout3);
468 &jmp (&label("ecb_ret"));
469 ######################################################################
# ECB decrypt path: mirrors the encrypt loop, but calls _aesni_decrypt6.
470 &set_label("ecb_decrypt",16);
471 &mov ($key_,$key); # backup $key
472 &mov ($rounds_,$rounds); # backup $rounds
# NOTE(review): the length comparison feeding this jb is not visible here.
474 &jb (&label("ecb_dec_tail"));
# Preload the first six input blocks and enter the loop at the decrypt call.
476 &movdqu ($inout0,&QWP(0,$inp));
477 &movdqu ($inout1,&QWP(0x10,$inp));
478 &movdqu ($inout2,&QWP(0x20,$inp));
479 &movdqu ($inout3,&QWP(0x30,$inp));
480 &movdqu ($inout4,&QWP(0x40,$inp));
481 &movdqu ($inout5,&QWP(0x50,$inp));
482 &lea ($inp,&DWP(0x60,$inp));
484 &jmp (&label("ecb_dec_loop6_enter"));
# Loop body: store the previous six results while loading the next six inputs.
486 &set_label("ecb_dec_loop6",16);
487 &movups (&QWP(0,$out),$inout0);
488 &movdqu ($inout0,&QWP(0,$inp));
489 &movups (&QWP(0x10,$out),$inout1);
490 &movdqu ($inout1,&QWP(0x10,$inp));
491 &movups (&QWP(0x20,$out),$inout2);
492 &movdqu ($inout2,&QWP(0x20,$inp));
493 &movups (&QWP(0x30,$out),$inout3);
494 &movdqu ($inout3,&QWP(0x30,$inp));
495 &movups (&QWP(0x40,$out),$inout4);
496 &movdqu ($inout4,&QWP(0x40,$inp));
497 &movups (&QWP(0x50,$out),$inout5);
498 &lea ($out,&DWP(0x60,$out));
499 &movdqu ($inout5,&QWP(0x50,$inp));
500 &lea ($inp,&DWP(0x60,$inp));
501 &set_label("ecb_dec_loop6_enter");
503 &call ("_aesni_decrypt6");
505 &mov ($key,$key_); # restore $key
506 &mov ($rounds,$rounds_); # restore $rounds
# NOTE(review): the length update feeding this jnc is not visible here.
508 &jnc (&label("ecb_dec_loop6"));
# Store the results of the last full 6-block iteration.
510 &movups (&QWP(0,$out),$inout0);
511 &movups (&QWP(0x10,$out),$inout1);
512 &movups (&QWP(0x20,$out),$inout2);
513 &movups (&QWP(0x30,$out),$inout3);
514 &movups (&QWP(0x40,$out),$inout4);
515 &movups (&QWP(0x50,$out),$inout5);
516 &lea ($out,&DWP(0x60,$out));
# NOTE(review): the remaining-length adjustment feeding this jz is not
# visible here; falls through into the tail when input remains.
518 &jz (&label("ecb_ret"));
# ECB decrypt tail: handles the blocks left after the 6x loop; the
# fall-through path handles five blocks.
# NOTE(review): the comparisons feeding the jb/je pairs are not visible here.
520 &set_label("ecb_dec_tail");
521 &movups ($inout0,&QWP(0,$inp));
523 &jb (&label("ecb_dec_one"));
524 &movups ($inout1,&QWP(0x10,$inp));
525 &je (&label("ecb_dec_two"));
526 &movups ($inout2,&QWP(0x20,$inp));
528 &jb (&label("ecb_dec_three"));
529 &movups ($inout3,&QWP(0x30,$inp));
530 &je (&label("ecb_dec_four"));
531 &movups ($inout4,&QWP(0x40,$inp));
# Five blocks: run the 6x routine with the unused sixth lane zeroed and
# store only the first five results.
532 &xorps ($inout5,$inout5);
533 &call ("_aesni_decrypt6");
534 &movups (&QWP(0,$out),$inout0);
535 &movups (&QWP(0x10,$out),$inout1);
536 &movups (&QWP(0x20,$out),$inout2);
537 &movups (&QWP(0x30,$out),$inout3);
538 &movups (&QWP(0x40,$out),$inout4);
539 &jmp (&label("ecb_ret"));
# One block.
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
541 &set_label("ecb_dec_one",16);
543 { &aesni_inline_generate1("dec"); }
545 { &call ("_aesni_decrypt1"); }
546 &movups (&QWP(0,$out),$inout0);
547 &jmp (&label("ecb_ret"));
# Two blocks: run the 3x routine with the unused third lane zeroed.
549 &set_label("ecb_dec_two",16);
550 &xorps ($inout2,$inout2);
551 &call ("_aesni_decrypt3");
552 &movups (&QWP(0,$out),$inout0);
553 &movups (&QWP(0x10,$out),$inout1);
554 &jmp (&label("ecb_ret"));
# Three blocks.
556 &set_label("ecb_dec_three",16);
557 &call ("_aesni_decrypt3");
558 &movups (&QWP(0,$out),$inout0);
559 &movups (&QWP(0x10,$out),$inout1);
560 &movups (&QWP(0x20,$out),$inout2);
561 &jmp (&label("ecb_ret"));
# Four blocks; falls straight through into ecb_ret.
563 &set_label("ecb_dec_four",16);
564 &call ("_aesni_decrypt4");
565 &movups (&QWP(0,$out),$inout0);
566 &movups (&QWP(0x10,$out),$inout1);
567 &movups (&QWP(0x20,$out),$inout2);
568 &movups (&QWP(0x30,$out),$inout3);
# Common exit for both directions.
570 &set_label("ecb_ret");
571 &function_end("aesni_ecb_encrypt");
573 ######################################################################
574 # void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
575 # size_t blocks, const AES_KEY *key,
576 # const char *ivec,char *cmac);
578 # Handles only complete blocks, operates on 64-bit counter and
579 # does not update *ivec! Nor does it finalize CMAC value
580 # (see engine/eng_aesni.c for details)
# CCM encrypt: each iteration encrypts the counter block and the running
# CMAC in a 2x interleaved round loop, xors the encrypted counter into the
# input to produce the output block, and advances the counter.
583 &function_begin("aesni_ccm64_encrypt_blocks");
584 &mov ($inp,&wparam(0));
585 &mov ($out,&wparam(1));
586 &mov ($len,&wparam(2));
587 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers.
588 &mov ($rounds_,&wparam(4));
589 &mov ($rounds,&wparam(5));
592 &and ("esp",-16); # align stack
# 48(esp) is reloaded into esp at the end of the function.
# NOTE(review): $key_ presumably holds the caller's esp at this point; the
# instructions that capture it and reserve the frame are not visible here.
593 &mov (&DWP(48,"esp"),$key_);
595 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
596 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
598 # compose byte-swap control mask for pshufb on stack
599 &mov (&DWP(0,"esp"),0x0c0d0e0f);
600 &mov (&DWP(4,"esp"),0x08090a0b);
601 &mov (&DWP(8,"esp"),0x04050607);
602 &mov (&DWP(12,"esp"),0x00010203);
604 # compose counter increment vector on stack
# NOTE(review): the instructions loading the increment value into $rounds
# and clearing $key_ are not visible here - confirm the stored values.
607 &mov (&DWP(16,"esp"),$rounds);
608 &mov (&DWP(20,"esp"),$key_);
609 &mov (&DWP(24,"esp"),$key_);
610 &mov (&DWP(28,"esp"),$key_);
# $inout3 keeps the byte-swap mask for the whole function.
612 &movdqa ($inout3,&QWP(0,"esp"));
613 &pshufb ($ivec,$inout3); # keep iv in reverse order
615 &mov ($rounds,&DWP(240,$key));
617 &mov ($rounds_,$rounds);
618 &movdqa ($inout0,$ivec);
# Per-block loop.
620 &set_label("ccm64_enc_outer");
621 &movups ($in0,&QWP(0,$inp));
# Swap the counter copy back before it is encrypted.
622 &pshufb ($inout0,$inout3);
624 &mov ($rounds,$rounds_);
626 &$movekey ($rndkey0,&QWP(0,$key));
628 &$movekey ($rndkey1,&QWP(16,$key));
629 &xorps ($in0,$rndkey0);
630 &lea ($key,&DWP(32,$key));
631 &xorps ($inout0,$rndkey0);
632 &xorps ($cmac,$in0); # cmac^=inp
633 &$movekey ($rndkey0,&QWP(0,$key));
# 2x interleave: counter block and CMAC state share every round key.
635 &set_label("ccm64_enc2_loop");
636 &aesenc ($inout0,$rndkey1);
638 &aesenc ($cmac,$rndkey1);
639 &$movekey ($rndkey1,&QWP(16,$key));
640 &aesenc ($inout0,$rndkey0);
641 &lea ($key,&DWP(32,$key));
642 &aesenc ($cmac,$rndkey0);
643 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the flag-setting instruction for this jnz is not visible here.
644 &jnz (&label("ccm64_enc2_loop"));
645 &aesenc ($inout0,$rndkey1);
646 &aesenc ($cmac,$rndkey1);
647 &aesenclast ($inout0,$rndkey0);
648 &aesenclast ($cmac,$rndkey0);
# Advance the byte-reversed counter by the increment vector at 16(esp).
650 &paddq ($ivec,&QWP(16,"esp"));
652 &lea ($inp,&DWP(16,$inp));
653 &xorps ($in0,$inout0); # inp^=E(ivec)
654 &movdqa ($inout0,$ivec);
655 &movups (&QWP(0,$out),$in0);
656 &lea ($out,&DWP(16,$out));
# NOTE(review): the block-count decrement feeding this jnz is not visible here.
657 &jnz (&label("ccm64_enc_outer"));
# Restore the saved stack pointer and write the CMAC back through the sixth
# argument.
659 &mov ("esp",&DWP(48,"esp"));
660 &mov ($out,&wparam(5));
661 &movups (&QWP(0,$out),$cmac);
662 &function_end("aesni_ccm64_encrypt_blocks");
# CCM decrypt: the first counter block is encrypted up front; each loop
# iteration then produces one output block, and (unless it was the last)
# encrypts the next counter block together with the CMAC update in a 2x
# interleaved round loop.
664 &function_begin("aesni_ccm64_decrypt_blocks");
665 &mov ($inp,&wparam(0));
666 &mov ($out,&wparam(1));
667 &mov ($len,&wparam(2));
668 &mov ($key,&wparam(3));
# $rounds_ and $rounds temporarily hold the ivec and cmac pointers.
669 &mov ($rounds_,&wparam(4));
670 &mov ($rounds,&wparam(5));
673 &and ("esp",-16); # align stack
# 48(esp) is reloaded into esp at the end of the function.
# NOTE(review): $key_ presumably holds the caller's esp at this point; the
# instructions that capture it and reserve the frame are not visible here.
674 &mov (&DWP(48,"esp"),$key_);
676 &movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
677 &movdqu ($cmac,&QWP(0,$rounds)); # load cmac
679 # compose byte-swap control mask for pshufb on stack
680 &mov (&DWP(0,"esp"),0x0c0d0e0f);
681 &mov (&DWP(4,"esp"),0x08090a0b);
682 &mov (&DWP(8,"esp"),0x04050607);
683 &mov (&DWP(12,"esp"),0x00010203);
685 # compose counter increment vector on stack
# NOTE(review): the instructions loading the increment value into $rounds
# and clearing $key_ are not visible here - confirm the stored values.
688 &mov (&DWP(16,"esp"),$rounds);
689 &mov (&DWP(20,"esp"),$key_);
690 &mov (&DWP(24,"esp"),$key_);
691 &mov (&DWP(28,"esp"),$key_);
693 &movdqa ($inout3,&QWP(0,"esp")); # bswap mask
# $inout0 takes the ivec in its original byte order for the first encryption.
694 &movdqa ($inout0,$ivec);
695 &pshufb ($ivec,$inout3); # keep iv in reverse order
697 &mov ($rounds,&DWP(240,$key));
699 &mov ($rounds_,$rounds);
# Encrypt the first counter block.
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
702 { &aesni_inline_generate1("enc"); }
704 { &call ("_aesni_encrypt1"); }
# Per-block loop.
706 &set_label("ccm64_dec_outer");
707 &paddq ($ivec,&QWP(16,"esp"));
708 &movups ($in0,&QWP(0,$inp)); # load inp
# Xor the input with the encrypted counter; the result is stored to $out below.
709 &xorps ($in0,$inout0);
710 &movdqa ($inout0,$ivec);
# QWP is used as the lea operand here, whereas the other lea calls in this
# file use DWP.
711 &lea ($inp,&QWP(16,$inp));
712 &pshufb ($inout0,$inout3);
714 &mov ($rounds,$rounds_);
715 &movups (&QWP(0,$out),$in0);
716 &lea ($out,&DWP(16,$out));
# NOTE(review): the block-count decrement feeding this jz is not visible here.
719 &jz (&label("ccm64_dec_break"));
721 &$movekey ($rndkey0,&QWP(0,$key));
723 &$movekey ($rndkey1,&QWP(16,$key));
724 &xorps ($in0,$rndkey0);
725 &lea ($key,&DWP(32,$key));
726 &xorps ($inout0,$rndkey0);
727 &xorps ($cmac,$in0); # cmac^=out
728 &$movekey ($rndkey0,&QWP(0,$key));
# 2x interleave: next counter block and CMAC state share every round key.
730 &set_label("ccm64_dec2_loop");
731 &aesenc ($inout0,$rndkey1);
733 &aesenc ($cmac,$rndkey1);
734 &$movekey ($rndkey1,&QWP(16,$key));
735 &aesenc ($inout0,$rndkey0);
736 &lea ($key,&DWP(32,$key));
737 &aesenc ($cmac,$rndkey0);
738 &$movekey ($rndkey0,&QWP(0,$key));
# NOTE(review): the flag-setting instruction for this jnz is not visible here.
739 &jnz (&label("ccm64_dec2_loop"));
740 &aesenc ($inout0,$rndkey1);
741 &aesenc ($cmac,$rndkey1);
742 &aesenclast ($inout0,$rndkey0);
743 &aesenclast ($cmac,$rndkey0);
744 &jmp (&label("ccm64_dec_outer"));
# Last block: only the CMAC still needs an encryption pass, with the final
# output block ($in0) folded in as the "ivec" argument.
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
746 &set_label("ccm64_dec_break",16);
748 { &aesni_inline_generate1("enc",$cmac,$in0); }
750 { &call ("_aesni_encrypt1",$cmac); }
# Restore the saved stack pointer and write the CMAC back through the sixth
# argument.
752 &mov ("esp",&DWP(48,"esp"));
753 &mov ($out,&wparam(5));
754 &movups (&QWP(0,$out),$cmac);
755 &function_end("aesni_ccm64_decrypt_blocks");
758 ######################################################################
759 # void aesni_ctr32_encrypt_blocks (const void *in, void *out,
760 # size_t blocks, const AES_KEY *key,
763 # Handles only complete blocks, operates on 32-bit counter and
764 # does not update *ivec! (see engine/eng_aesni.c for details)
768 # 16 vector addend: 0,6,6,6
769 # 32 counter-less ivec
770 # 48 1st triplet of counter vector
771 # 64 2nd triplet of counter vector
# CTR32 prologue: builds the stack frame described in the layout comment
# above (byte-swap mask at 0, increment at 16, counter-less ivec at 32, two
# counter triplets at 48 and 64) and prepares the first counter blocks.
774 &function_begin("aesni_ctr32_encrypt_blocks");
775 &mov ($inp,&wparam(0));
776 &mov ($out,&wparam(1));
777 &mov ($len,&wparam(2));
778 &mov ($key,&wparam(3));
# $rounds_ temporarily holds the ivec pointer.
779 &mov ($rounds_,&wparam(4));
782 &and ("esp",-16); # align stack
# 80(esp) is reloaded into esp at ctr32_ret.
# NOTE(review): $key_ presumably holds the caller's esp at this point; the
# instructions that capture it and reserve the frame are not visible here.
783 &mov (&DWP(80,"esp"),$key_);
# NOTE(review): the length comparison feeding this je is not visible here.
786 &je (&label("ctr32_one_shortcut"));
788 &movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
790 # compose byte-swap control mask for pshufb on stack
791 &mov (&DWP(0,"esp"),0x0c0d0e0f);
792 &mov (&DWP(4,"esp"),0x08090a0b);
793 &mov (&DWP(8,"esp"),0x04050607);
794 &mov (&DWP(12,"esp"),0x00010203);
796 # compose counter increment vector on stack
# NOTE(review): the instructions loading the increment into $rounds and
# clearing $key_ are not visible here - confirm the stored values.
799 &mov (&DWP(16,"esp"),$rounds);
800 &mov (&DWP(20,"esp"),$rounds);
801 &mov (&DWP(24,"esp"),$rounds);
802 &mov (&DWP(28,"esp"),$key_);
804 &pextrd ($rounds_,$inout5,3); # pull 32-bit counter
805 &pinsrd ($inout5,$key_,3); # wipe 32-bit counter
807 &mov ($rounds,&DWP(240,$key)); # key->rounds
809 # compose 2 vectors of 3x32-bit counters
# $rndkey1 collects the first triplet (from $rounds_), $rndkey0 the second
# (from $key_, which starts at $rounds_+3).
# NOTE(review): the instructions adjusting $rounds_/$key_ between the
# pinsrd pairs are not visible here.
811 &pxor ($rndkey1,$rndkey1);
812 &pxor ($rndkey0,$rndkey0);
813 &movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
814 &pinsrd ($rndkey1,$rounds_,0);
815 &lea ($key_,&DWP(3,$rounds_));
816 &pinsrd ($rndkey0,$key_,0);
818 &pinsrd ($rndkey1,$rounds_,1);
820 &pinsrd ($rndkey0,$key_,1);
822 &pinsrd ($rndkey1,$rounds_,2);
824 &pinsrd ($rndkey0,$key_,2);
825 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
826 &pshufb ($rndkey1,$inout0); # byte swap
827 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
828 &pshufb ($rndkey0,$inout0); # byte swap
830 &pshufd ($inout0,$rndkey1,3<<6); # place counter to upper dword
831 &pshufd ($inout1,$rndkey1,2<<6);
# NOTE(review): the length comparison feeding this jb is not visible here.
833 &jb (&label("ctr32_tail"));
834 &movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec
836 &mov ($key_,$key); # backup $key
837 &mov ($rounds_,$rounds); # backup $rounds
839 &jmp (&label("ctr32_loop6"));
# CTR32 main loop: builds six counter blocks, encrypts them through the
# shared 6x round loop, xors the key stream into six input blocks and
# prepares the counters for the next iteration.
841 &set_label("ctr32_loop6",16);
# Spread the remaining counters into the upper dword of each block and
# merge in the counter-less ivec.
842 &pshufd ($inout2,$rndkey1,1<<6);
843 &movdqa ($rndkey1,&QWP(32,"esp")); # pull counter-less ivec
844 &pshufd ($inout3,$rndkey0,3<<6);
845 &por ($inout0,$rndkey1); # merge counter-less ivec
846 &pshufd ($inout4,$rndkey0,2<<6);
847 &por ($inout1,$rndkey1);
848 &pshufd ($inout5,$rndkey0,1<<6);
849 &por ($inout2,$rndkey1);
850 &por ($inout3,$rndkey1);
851 &por ($inout4,$rndkey1);
852 &por ($inout5,$rndkey1);
854 # inlining _aesni_encrypt6's prologue gives ~4% improvement...
# Keys are read via the backup pointer $key_, and $key is re-derived from
# it, so $key needs no separate restore in this loop.
855 &$movekey ($rndkey0,&QWP(0,$key_));
856 &$movekey ($rndkey1,&QWP(16,$key_));
857 &lea ($key,&DWP(32,$key_));
859 &pxor ($inout0,$rndkey0);
860 &pxor ($inout1,$rndkey0);
861 &aesenc ($inout0,$rndkey1);
862 &pxor ($inout2,$rndkey0);
863 &aesenc ($inout1,$rndkey1);
864 &pxor ($inout3,$rndkey0);
865 &aesenc ($inout2,$rndkey1);
866 &pxor ($inout4,$rndkey0);
867 &aesenc ($inout3,$rndkey1);
868 &pxor ($inout5,$rndkey0);
869 &aesenc ($inout4,$rndkey1);
870 &$movekey ($rndkey0,&QWP(0,$key));
871 &aesenc ($inout5,$rndkey1);
# Join the 6x routine at its mid-loop entry point.
873 &call (&label("_aesni_encrypt6_enter"));
# Xor the key stream with the input and store; the next counter vectors are
# computed in between, reusing $rndkey0/$rndkey1 and $inout0..2 as scratch
# once their results have been stored.
875 &movups ($rndkey1,&QWP(0,$inp));
876 &movups ($rndkey0,&QWP(0x10,$inp));
877 &xorps ($inout0,$rndkey1);
878 &movups ($rndkey1,&QWP(0x20,$inp));
879 &xorps ($inout1,$rndkey0);
880 &movups (&QWP(0,$out),$inout0);
881 &movdqa ($rndkey0,&QWP(16,"esp")); # load increment
882 &xorps ($inout2,$rndkey1);
883 &movdqa ($rndkey1,&QWP(48,"esp")); # load 1st triplet
884 &movups (&QWP(0x10,$out),$inout1);
885 &movups (&QWP(0x20,$out),$inout2);
887 &paddd ($rndkey1,$rndkey0); # 1st triplet increment
888 &paddd ($rndkey0,&QWP(64,"esp")); # 2nd triplet increment
889 &movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
891 &movups ($inout1,&QWP(0x30,$inp));
892 &movups ($inout2,&QWP(0x40,$inp));
893 &xorps ($inout3,$inout1);
894 &movups ($inout1,&QWP(0x50,$inp));
895 &lea ($inp,&DWP(0x60,$inp));
896 &movdqa (&QWP(48,"esp"),$rndkey1); # save 1st triplet
897 &pshufb ($rndkey1,$inout0); # byte swap
898 &xorps ($inout4,$inout2);
899 &movups (&QWP(0x30,$out),$inout3);
900 &xorps ($inout5,$inout1);
901 &movdqa (&QWP(64,"esp"),$rndkey0); # save 2nd triplet
902 &pshufb ($rndkey0,$inout0); # byte swap
903 &movups (&QWP(0x40,$out),$inout4);
904 &pshufd ($inout0,$rndkey1,3<<6);
905 &movups (&QWP(0x50,$out),$inout5);
906 &lea ($out,&DWP(0x60,$out));
908 &mov ($rounds,$rounds_);
909 &pshufd ($inout1,$rndkey1,2<<6);
# NOTE(review): the length update feeding this jnc is not visible here.
911 &jnc (&label("ctr32_loop6"));
# NOTE(review): the remaining-length adjustment feeding this jz, and the
# matching restore of $key, are not visible here.
914 &jz (&label("ctr32_ret"));
# $rounds = 2*$rounds + 1.
916 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
917 &movdqa ($inout5,&QWP(32,"esp")); # pull count-less ivec
# CTR32 tail: builds only as many counter blocks as are still needed and
# dispatches to the one/two/three/four handlers; the fall-through path
# handles five blocks.  $inout5 holds the counter-less ivec here.
# NOTE(review): the comparisons feeding the jb/je pairs are not visible here.
919 &set_label("ctr32_tail");
920 &por ($inout0,$inout5);
922 &jb (&label("ctr32_one"));
924 &pshufd ($inout2,$rndkey1,1<<6);
925 &por ($inout1,$inout5);
926 &je (&label("ctr32_two"));
928 &pshufd ($inout3,$rndkey0,3<<6);
929 &por ($inout2,$inout5);
931 &jb (&label("ctr32_three"));
933 &pshufd ($inout4,$rndkey0,2<<6);
934 &por ($inout3,$inout5);
935 &je (&label("ctr32_four"));
# Five blocks: the sixth lane ($inout5) still holds the counter-less ivec;
# its encrypted value is not used below.
937 &por ($inout4,$inout5);
938 &call ("_aesni_encrypt6");
# Xor the key stream with five input blocks, using $rndkey0/$rndkey1 as
# scratch for the loads.
939 &movups ($rndkey1,&QWP(0,$inp));
940 &movups ($rndkey0,&QWP(0x10,$inp));
941 &xorps ($inout0,$rndkey1);
942 &movups ($rndkey1,&QWP(0x20,$inp));
943 &xorps ($inout1,$rndkey0);
944 &movups ($rndkey0,&QWP(0x30,$inp));
945 &xorps ($inout2,$rndkey1);
946 &movups ($rndkey1,&QWP(0x40,$inp));
947 &xorps ($inout3,$rndkey0);
948 &movups (&QWP(0,$out),$inout0);
949 &xorps ($inout4,$rndkey1);
950 &movups (&QWP(0x10,$out),$inout1);
951 &movups (&QWP(0x20,$out),$inout2);
952 &movups (&QWP(0x30,$out),$inout3);
953 &movups (&QWP(0x40,$out),$inout4);
954 &jmp (&label("ctr32_ret"));
# Shortcut for a single block taken straight from the prologue: the ivec is
# used as the counter block as-is, skipping all counter-vector setup.
956 &set_label("ctr32_one_shortcut",16);
957 &movups ($inout0,&QWP(0,$rounds_)); # load ivec
958 &mov ($rounds,&DWP(240,$key));
# One block: encrypt the counter block and xor it into the input.
# NOTE(review): the two brace blocks below look like the arms of an
# if ($inline) / else whose keyword lines are not visible here - confirm.
960 &set_label("ctr32_one");
962 { &aesni_inline_generate1("enc"); }
964 { &call ("_aesni_encrypt1"); }
965 &movups ($in0,&QWP(0,$inp));
966 &xorps ($in0,$inout0);
967 &movups (&QWP(0,$out),$in0);
968 &jmp (&label("ctr32_ret"));
# Two blocks: the 3x routine is used and only two results are consumed.
# Registers not used as key-stream lanes serve as scratch for input loads.
970 &set_label("ctr32_two",16);
971 &call ("_aesni_encrypt3");
972 &movups ($inout3,&QWP(0,$inp));
973 &movups ($inout4,&QWP(0x10,$inp));
974 &xorps ($inout0,$inout3);
975 &xorps ($inout1,$inout4);
976 &movups (&QWP(0,$out),$inout0);
977 &movups (&QWP(0x10,$out),$inout1);
978 &jmp (&label("ctr32_ret"));
# Three blocks.
980 &set_label("ctr32_three",16);
981 &call ("_aesni_encrypt3");
982 &movups ($inout3,&QWP(0,$inp));
983 &movups ($inout4,&QWP(0x10,$inp));
984 &xorps ($inout0,$inout3);
985 &movups ($inout5,&QWP(0x20,$inp));
986 &xorps ($inout1,$inout4);
987 &movups (&QWP(0,$out),$inout0);
988 &xorps ($inout2,$inout5);
989 &movups (&QWP(0x10,$out),$inout1);
990 &movups (&QWP(0x20,$out),$inout2);
991 &jmp (&label("ctr32_ret"));
# Four blocks; falls straight through into ctr32_ret.
993 &set_label("ctr32_four",16);
994 &call ("_aesni_encrypt4");
995 &movups ($inout4,&QWP(0,$inp));
996 &movups ($inout5,&QWP(0x10,$inp));
997 &movups ($rndkey1,&QWP(0x20,$inp));
998 &xorps ($inout0,$inout4);
999 &movups ($rndkey0,&QWP(0x30,$inp));
1000 &xorps ($inout1,$inout5);
1001 &movups (&QWP(0,$out),$inout0);
1002 &xorps ($inout2,$rndkey1);
1003 &movups (&QWP(0x10,$out),$inout1);
1004 &xorps ($inout3,$rndkey0);
1005 &movups (&QWP(0x20,$out),$inout2);
1006 &movups (&QWP(0x30,$out),$inout3);
# Common exit: reload esp from the slot written in the prologue.
1008 &set_label("ctr32_ret");
1009 &mov ("esp",&DWP(80,"esp"));
1010 &function_end("aesni_ctr32_encrypt_blocks");
1012 ######################################################################
1013 # void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
1014 # const AES_KEY *key1, const AES_KEY *key2,
1015 # const unsigned char iv[16]);
1017 { my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
# aesni_xts_encrypt
#
# Arguments are fetched with wparam(0..5): inp, out, len, key1, key2 and
# the 16-byte clear-text tweak.  The tweak block is first encrypted with
# key2; the result is the initial tweak used with key1 for the data.
#
# Stack frame after the 16-byte alignment of %esp:
#   16*0 .. 16*5   per-block tweaks for the current batch
#   16*6           constant dwords {0x87,0,1,0} used by the tweak update
#   16*7+0         saved $len (later reused for $len%16)
#   16*7+4         saved $key_, reloaded into %esp at xts_enc_ret
#
# Tweak update idiom (repeated throughout): pcmpgtd of a zeroed register
# against the tweak turns every dword's sign bit into an all-ones mask,
# pshufd 0x13 moves those masks into place, pand with the 16*6 constant
# keeps only the carry into the upper qword and the 0x87 residue, paddq
# doubles both 64-bit halves, and pxor folds carry and residue back in.
1019 &function_begin("aesni_xts_encrypt");
1020 &mov ($key,&wparam(4)); # key2
1021 &mov ($inp,&wparam(5)); # clear-text tweak
1023 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1024 &movups ($inout0,&QWP(0,$inp));
# single-block encryption of the tweak with key2
1026 { &aesni_inline_generate1("enc"); }
1028 { &call ("_aesni_encrypt1"); }
# from here on $key/$rounds refer to key1, the data key
1030 &mov ($inp,&wparam(0));
1031 &mov ($out,&wparam(1));
1032 &mov ($len,&wparam(2));
1033 &mov ($key,&wparam(3)); # key1
1036 &sub ("esp",16*7+8);
1037 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1038 &and ("esp",-16); # align stack
1040 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1041 &mov (&DWP(16*6+4,"esp"),0);
1042 &mov (&DWP(16*6+8,"esp"),1);
1043 &mov (&DWP(16*6+12,"esp"),0);
1044 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1045 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
# $inout0 holds the encrypted tweak; prime $twtmp/$twmask for the first update
1047 &movdqa ($tweak,$inout0);
1048 &pxor ($twtmp,$twtmp);
1049 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1050 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1053 &mov ($key_,$key); # backup $key
1054 &mov ($rounds_,$rounds); # backup $rounds
1056 &jc (&label("xts_enc_short"));
1059 &mov ($rounds_,$rounds);
1060 &jmp (&label("xts_enc_loop6"));
# Main loop: six blocks per iteration.  The Perl-level loop over $i emits
# the tweak-update sequence repeatedly, storing successive tweaks on the
# stack; the fifth and sixth tweaks are handled by the code that follows.
1062 &set_label("xts_enc_loop6",16);
1063 for ($i=0;$i<4;$i++) {
1064 &pshufd ($twres,$twtmp,0x13);
1065 &pxor ($twtmp,$twtmp);
1066 &movdqa (&QWP(16*$i,"esp"),$tweak);
1067 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1068 &pand ($twres,$twmask); # isolate carry and residue
1069 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1070 &pxor ($tweak,$twres);
1072 &pshufd ($inout5,$twtmp,0x13);
1073 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1074 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1075 &$movekey ($rndkey0,&QWP(0,$key_));
1076 &pand ($inout5,$twmask); # isolate carry and residue
1077 &movups ($inout0,&QWP(0,$inp)); # load input
1078 &pxor ($inout5,$tweak);
1080 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1081 &movdqu ($inout1,&QWP(16*1,$inp));
1082 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1083 &movdqu ($inout2,&QWP(16*2,$inp));
1084 &pxor ($inout1,$rndkey0);
1085 &movdqu ($inout3,&QWP(16*3,$inp));
1086 &pxor ($inout2,$rndkey0);
1087 &movdqu ($inout4,&QWP(16*4,$inp));
1088 &pxor ($inout3,$rndkey0);
1089 &movdqu ($rndkey1,&QWP(16*5,$inp));
1090 &pxor ($inout4,$rndkey0);
1091 &lea ($inp,&DWP(16*6,$inp));
1092 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1093 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1094 &pxor ($inout5,$rndkey1);
# first AES round for all six blocks, interleaved with the remaining tweak xors
1096 &$movekey ($rndkey1,&QWP(16,$key_));
1097 &lea ($key,&DWP(32,$key_));
1098 &pxor ($inout1,&QWP(16*1,"esp"));
1099 &aesenc ($inout0,$rndkey1);
1100 &pxor ($inout2,&QWP(16*2,"esp"));
1101 &aesenc ($inout1,$rndkey1);
1102 &pxor ($inout3,&QWP(16*3,"esp"));
1104 &aesenc ($inout2,$rndkey1);
1105 &pxor ($inout4,&QWP(16*4,"esp"));
1106 &aesenc ($inout3,$rndkey1);
1107 &pxor ($inout5,$rndkey0);
1108 &aesenc ($inout4,$rndkey1);
1109 &$movekey ($rndkey0,&QWP(0,$key));
1110 &aesenc ($inout5,$rndkey1);
1111 &call (&label("_aesni_encrypt6_enter"));
# xor the six results with their tweaks, store them, and derive the next tweak
1113 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1114 &pxor ($twtmp,$twtmp);
1115 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1116 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1117 &xorps ($inout1,&QWP(16*1,"esp"));
1118 &movups (&QWP(16*0,$out),$inout0); # write output
1119 &xorps ($inout2,&QWP(16*2,"esp"));
1120 &movups (&QWP(16*1,$out),$inout1);
1121 &xorps ($inout3,&QWP(16*3,"esp"));
1122 &movups (&QWP(16*2,$out),$inout2);
1123 &xorps ($inout4,&QWP(16*4,"esp"));
1124 &movups (&QWP(16*3,$out),$inout3);
1125 &xorps ($inout5,$tweak);
1126 &movups (&QWP(16*4,$out),$inout4);
1127 &pshufd ($twres,$twtmp,0x13);
1128 &movups (&QWP(16*5,$out),$inout5);
1129 &lea ($out,&DWP(16*6,$out));
1130 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1132 &pxor ($twtmp,$twtmp);
1133 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1134 &pand ($twres,$twmask); # isolate carry and residue
1135 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1136 &mov ($rounds,$rounds_); # restore $rounds
1137 &pxor ($tweak,$twres);
1140 &jnc (&label("xts_enc_loop6"));
1142 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1143 &mov ($key,$key_); # restore $key
1144 &mov ($rounds_,$rounds);
# Tail: dispatches to xts_enc_one/two/three/four or falls through to the
# five-block path, computing one further tweak before each branch and
# parking earlier tweaks in $inout3/$inout4/$inout5 or on the stack.
1146 &set_label("xts_enc_short");
1148 &jz (&label("xts_enc_done6x"));
1150 &movdqa ($inout3,$tweak); # put aside previous tweak
1152 &jb (&label("xts_enc_one"));
1154 &pshufd ($twres,$twtmp,0x13);
1155 &pxor ($twtmp,$twtmp);
1156 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1157 &pand ($twres,$twmask); # isolate carry and residue
1158 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1159 &pxor ($tweak,$twres);
1160 &je (&label("xts_enc_two"));
1162 &pshufd ($twres,$twtmp,0x13);
1163 &pxor ($twtmp,$twtmp);
1164 &movdqa ($inout4,$tweak); # put aside previous tweak
1165 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1166 &pand ($twres,$twmask); # isolate carry and residue
1167 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1168 &pxor ($tweak,$twres);
1170 &jb (&label("xts_enc_three"));
1172 &pshufd ($twres,$twtmp,0x13);
1173 &pxor ($twtmp,$twtmp);
1174 &movdqa ($inout5,$tweak); # put aside previous tweak
1175 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1176 &pand ($twres,$twmask); # isolate carry and residue
1177 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1178 &pxor ($tweak,$twres);
1179 &movdqa (&QWP(16*0,"esp"),$inout3);
1180 &movdqa (&QWP(16*1,"esp"),$inout4);
1181 &je (&label("xts_enc_four"));
# five-block path: spill the remaining tweaks and compute the fifth in $inout5
1183 &movdqa (&QWP(16*2,"esp"),$inout5);
1184 &pshufd ($inout5,$twtmp,0x13);
1185 &movdqa (&QWP(16*3,"esp"),$tweak);
1186 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1187 &pand ($inout5,$twmask); # isolate carry and residue
1188 &pxor ($inout5,$tweak);
1190 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1191 &movdqu ($inout1,&QWP(16*1,$inp));
1192 &movdqu ($inout2,&QWP(16*2,$inp));
1193 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1194 &movdqu ($inout3,&QWP(16*3,$inp));
1195 &pxor ($inout1,&QWP(16*1,"esp"));
1196 &movdqu ($inout4,&QWP(16*4,$inp));
1197 &pxor ($inout2,&QWP(16*2,"esp"));
1198 &lea ($inp,&DWP(16*5,$inp));
1199 &pxor ($inout3,&QWP(16*3,"esp"));
1200 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1201 &pxor ($inout4,$inout5);
1203 &call ("_aesni_encrypt6");
1205 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1206 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1207 &xorps ($inout1,&QWP(16*1,"esp"));
1208 &xorps ($inout2,&QWP(16*2,"esp"));
1209 &movups (&QWP(16*0,$out),$inout0); # write output
1210 &xorps ($inout3,&QWP(16*3,"esp"));
1211 &movups (&QWP(16*1,$out),$inout1);
1212 &xorps ($inout4,$tweak);
1213 &movups (&QWP(16*2,$out),$inout2);
1214 &movups (&QWP(16*3,$out),$inout3);
1215 &movups (&QWP(16*4,$out),$inout4);
1216 &lea ($out,&DWP(16*5,$out));
1217 &jmp (&label("xts_enc_done"));
# one remaining block, tweak in $inout3
1219 &set_label("xts_enc_one",16);
1220 &movups ($inout0,&QWP(16*0,$inp)); # load input
1221 &lea ($inp,&DWP(16*1,$inp));
1222 &xorps ($inout0,$inout3); # input^=tweak
1224 { &aesni_inline_generate1("enc"); }
1226 { &call ("_aesni_encrypt1"); }
1227 &xorps ($inout0,$inout3); # output^=tweak
1228 &movups (&QWP(16*0,$out),$inout0); # write output
1229 &lea ($out,&DWP(16*1,$out));
1231 &movdqa ($tweak,$inout3); # last tweak
1232 &jmp (&label("xts_enc_done"));
# two remaining blocks, tweaks in $inout3/$inout4; the three-block helper
# is used with a zeroed third lane whose result is not stored
1234 &set_label("xts_enc_two",16);
1235 &movaps ($inout4,$tweak); # put aside last tweak
1237 &movups ($inout0,&QWP(16*0,$inp)); # load input
1238 &movups ($inout1,&QWP(16*1,$inp));
1239 &lea ($inp,&DWP(16*2,$inp));
1240 &xorps ($inout0,$inout3); # input^=tweak
1241 &xorps ($inout1,$inout4);
1242 &xorps ($inout2,$inout2);
1244 &call ("_aesni_encrypt3");
1246 &xorps ($inout0,$inout3); # output^=tweak
1247 &xorps ($inout1,$inout4);
1248 &movups (&QWP(16*0,$out),$inout0); # write output
1249 &movups (&QWP(16*1,$out),$inout1);
1250 &lea ($out,&DWP(16*2,$out));
1252 &movdqa ($tweak,$inout4); # last tweak
1253 &jmp (&label("xts_enc_done"));
# three remaining blocks, tweaks in $inout3/$inout4/$inout5
1255 &set_label("xts_enc_three",16);
1256 &movaps ($inout5,$tweak); # put aside last tweak
1257 &movups ($inout0,&QWP(16*0,$inp)); # load input
1258 &movups ($inout1,&QWP(16*1,$inp));
1259 &movups ($inout2,&QWP(16*2,$inp));
1260 &lea ($inp,&DWP(16*3,$inp));
1261 &xorps ($inout0,$inout3); # input^=tweak
1262 &xorps ($inout1,$inout4);
1263 &xorps ($inout2,$inout5);
1265 &call ("_aesni_encrypt3");
1267 &xorps ($inout0,$inout3); # output^=tweak
1268 &xorps ($inout1,$inout4);
1269 &xorps ($inout2,$inout5);
1270 &movups (&QWP(16*0,$out),$inout0); # write output
1271 &movups (&QWP(16*1,$out),$inout1);
1272 &movups (&QWP(16*2,$out),$inout2);
1273 &lea ($out,&DWP(16*3,$out));
1275 &movdqa ($tweak,$inout5); # last tweak
1276 &jmp (&label("xts_enc_done"));
# four remaining blocks: first two tweaks on the stack, third in $inout5,
# fourth in $inout4
1278 &set_label("xts_enc_four",16);
1279 &movaps ($inout4,$tweak); # put aside last tweak
1281 &movups ($inout0,&QWP(16*0,$inp)); # load input
1282 &movups ($inout1,&QWP(16*1,$inp));
1283 &movups ($inout2,&QWP(16*2,$inp));
1284 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1285 &movups ($inout3,&QWP(16*3,$inp));
1286 &lea ($inp,&DWP(16*4,$inp));
1287 &xorps ($inout1,&QWP(16*1,"esp"));
1288 &xorps ($inout2,$inout5);
1289 &xorps ($inout3,$inout4);
1291 &call ("_aesni_encrypt4");
1293 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1294 &xorps ($inout1,&QWP(16*1,"esp"));
1295 &xorps ($inout2,$inout5);
1296 &movups (&QWP(16*0,$out),$inout0); # write output
1297 &xorps ($inout3,$inout4);
1298 &movups (&QWP(16*1,$out),$inout1);
1299 &movups (&QWP(16*2,$out),$inout2);
1300 &movups (&QWP(16*3,$out),$inout3);
1301 &lea ($out,&DWP(16*4,$out));
1303 &movdqa ($tweak,$inout4); # last tweak
1304 &jmp (&label("xts_enc_done"));
# reached from xts_enc_short when no whole blocks remain; the tweak for a
# possible partial block is already in $tweak and is copied to $inout3
1306 &set_label("xts_enc_done6x",16); # $tweak is pre-calculated
1307 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1309 &jz (&label("xts_enc_ret"));
1310 &movdqa ($inout3,$tweak);
1311 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1312 &jmp (&label("xts_enc_steal"));
# reached from the tail paths with the last used tweak in $tweak; one more
# tweak update produces the stealing tweak in $inout3
1314 &set_label("xts_enc_done",16);
1315 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1316 &pxor ($twtmp,$twtmp);
1318 &jz (&label("xts_enc_ret"));
1320 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1321 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1322 &pshufd ($inout3,$twtmp,0x13);
1323 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1324 &pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
1325 &pxor ($inout3,$tweak);
# Ciphertext stealing: byte by byte, the trailing input byte replaces a
# byte of the previously written block at -16($out), and the displaced
# output byte is written at 0($out).  $rounds and $key serve as byte
# scratch here and are restored afterwards.
1327 &set_label("xts_enc_steal");
1328 &movz ($rounds,&BP(0,$inp));
1329 &movz ($key,&BP(-16,$out));
1330 &lea ($inp,&DWP(1,$inp));
1331 &mov (&BP(-16,$out),&LB($rounds));
1332 &mov (&BP(0,$out),&LB($key));
1333 &lea ($out,&DWP(1,$out));
1335 &jnz (&label("xts_enc_steal"));
# re-encrypt the patched block in place using the tweak kept in $inout3
1337 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1338 &mov ($key,$key_); # restore $key
1339 &mov ($rounds,$rounds_); # restore $rounds
1341 &movups ($inout0,&QWP(-16,$out)); # load input
1342 &xorps ($inout0,$inout3); # input^=tweak
1344 { &aesni_inline_generate1("enc"); }
1346 { &call ("_aesni_encrypt1"); }
1347 &xorps ($inout0,$inout3); # output^=tweak
1348 &movups (&QWP(-16,$out),$inout0); # write output
1350 &set_label("xts_enc_ret");
1351 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1352 &function_end("aesni_xts_encrypt");
# aesni_xts_decrypt
#
# Arguments are fetched with wparam(0..5): inp, out, len, key1, key2 and
# the 16-byte clear-text tweak.  As in the encrypt routine, the tweak
# block is *encrypted* with key2; only the data path uses aesdec with key1.
#
# Stack frame after the 16-byte alignment of %esp:
#   16*0 .. 16*5   per-block tweaks for the current batch
#   16*6           constant dwords {0x87,0,1,0} used by the tweak update
#   16*7+0         saved $len (later reused for $len%16)
#   16*7+4         saved $key_, reloaded into %esp at xts_dec_ret
#
# Tweak update idiom (repeated throughout): pcmpgtd of a zeroed register
# against the tweak turns every dword's sign bit into an all-ones mask,
# pshufd 0x13 moves those masks into place, pand with the 16*6 constant
# keeps only the carry into the upper qword and the 0x87 residue, paddq
# doubles both 64-bit halves, and pxor folds carry and residue back in.
1354 &function_begin("aesni_xts_decrypt");
1355 &mov ($key,&wparam(4)); # key2
1356 &mov ($inp,&wparam(5)); # clear-text tweak
1358 &mov ($rounds,&DWP(240,$key)); # key2->rounds
1359 &movups ($inout0,&QWP(0,$inp));
# single-block encryption of the tweak with key2
1361 { &aesni_inline_generate1("enc"); }
1363 { &call ("_aesni_encrypt1"); }
# from here on $key/$rounds refer to key1, the data key
1365 &mov ($inp,&wparam(0));
1366 &mov ($out,&wparam(1));
1367 &mov ($len,&wparam(2));
1368 &mov ($key,&wparam(3)); # key1
1371 &sub ("esp",16*7+8);
1372 &and ("esp",-16); # align stack
# $rounds_ is used as a temporary for the length adjustment below
1374 &xor ($rounds_,$rounds_); # if(len%16) len-=16;
1376 &setnz (&LB($rounds_));
1378 &sub ($len,$rounds_);
1380 &mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
1381 &mov (&DWP(16*6+4,"esp"),0);
1382 &mov (&DWP(16*6+8,"esp"),1);
1383 &mov (&DWP(16*6+12,"esp"),0);
1384 &mov (&DWP(16*7+0,"esp"),$len); # save original $len
1385 &mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
1387 &mov ($rounds,&DWP(240,$key)); # key1->rounds
1388 &mov ($key_,$key); # backup $key
1389 &mov ($rounds_,$rounds); # backup $rounds
# $inout0 holds the encrypted tweak; prime $twtmp/$twmask for the first update
1391 &movdqa ($tweak,$inout0);
1392 &pxor ($twtmp,$twtmp);
1393 &movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
1394 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1398 &jc (&label("xts_dec_short"));
1401 &mov ($rounds_,$rounds);
1402 &jmp (&label("xts_dec_loop6"));
# Main loop: six blocks per iteration.  The Perl-level loop over $i emits
# the tweak-update sequence repeatedly, storing successive tweaks on the
# stack; the fifth and sixth tweaks are handled by the code that follows.
1404 &set_label("xts_dec_loop6",16);
1405 for ($i=0;$i<4;$i++) {
1406 &pshufd ($twres,$twtmp,0x13);
1407 &pxor ($twtmp,$twtmp);
1408 &movdqa (&QWP(16*$i,"esp"),$tweak);
1409 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1410 &pand ($twres,$twmask); # isolate carry and residue
1411 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1412 &pxor ($tweak,$twres);
1414 &pshufd ($inout5,$twtmp,0x13);
1415 &movdqa (&QWP(16*$i++,"esp"),$tweak);
1416 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1417 &$movekey ($rndkey0,&QWP(0,$key_));
1418 &pand ($inout5,$twmask); # isolate carry and residue
1419 &movups ($inout0,&QWP(0,$inp)); # load input
1420 &pxor ($inout5,$tweak);
1422 # inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
1423 &movdqu ($inout1,&QWP(16*1,$inp));
1424 &xorps ($inout0,$rndkey0); # input^=rndkey[0]
1425 &movdqu ($inout2,&QWP(16*2,$inp));
1426 &pxor ($inout1,$rndkey0);
1427 &movdqu ($inout3,&QWP(16*3,$inp));
1428 &pxor ($inout2,$rndkey0);
1429 &movdqu ($inout4,&QWP(16*4,$inp));
1430 &pxor ($inout3,$rndkey0);
1431 &movdqu ($rndkey1,&QWP(16*5,$inp));
1432 &pxor ($inout4,$rndkey0);
1433 &lea ($inp,&DWP(16*6,$inp));
1434 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1435 &movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
1436 &pxor ($inout5,$rndkey1);
# first AES round for all six blocks, interleaved with the remaining tweak xors
1438 &$movekey ($rndkey1,&QWP(16,$key_));
1439 &lea ($key,&DWP(32,$key_));
1440 &pxor ($inout1,&QWP(16*1,"esp"));
1441 &aesdec ($inout0,$rndkey1);
1442 &pxor ($inout2,&QWP(16*2,"esp"));
1443 &aesdec ($inout1,$rndkey1);
1444 &pxor ($inout3,&QWP(16*3,"esp"));
1446 &aesdec ($inout2,$rndkey1);
1447 &pxor ($inout4,&QWP(16*4,"esp"));
1448 &aesdec ($inout3,$rndkey1);
1449 &pxor ($inout5,$rndkey0);
1450 &aesdec ($inout4,$rndkey1);
1451 &$movekey ($rndkey0,&QWP(0,$key));
1452 &aesdec ($inout5,$rndkey1);
1453 &call (&label("_aesni_decrypt6_enter"));
# xor the six results with their tweaks, store them, and derive the next tweak
1455 &movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
1456 &pxor ($twtmp,$twtmp);
1457 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1458 &pcmpgtd ($twtmp,$tweak); # broadcast upper bits
1459 &xorps ($inout1,&QWP(16*1,"esp"));
1460 &movups (&QWP(16*0,$out),$inout0); # write output
1461 &xorps ($inout2,&QWP(16*2,"esp"));
1462 &movups (&QWP(16*1,$out),$inout1);
1463 &xorps ($inout3,&QWP(16*3,"esp"));
1464 &movups (&QWP(16*2,$out),$inout2);
1465 &xorps ($inout4,&QWP(16*4,"esp"));
1466 &movups (&QWP(16*3,$out),$inout3);
1467 &xorps ($inout5,$tweak);
1468 &movups (&QWP(16*4,$out),$inout4);
1469 &pshufd ($twres,$twtmp,0x13);
1470 &movups (&QWP(16*5,$out),$inout5);
1471 &lea ($out,&DWP(16*6,$out));
1472 &movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
1474 &pxor ($twtmp,$twtmp);
1475 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1476 &pand ($twres,$twmask); # isolate carry and residue
1477 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1478 &mov ($rounds,$rounds_); # restore $rounds
1479 &pxor ($tweak,$twres);
1482 &jnc (&label("xts_dec_loop6"));
1484 &lea ($rounds,&DWP(1,"",$rounds,2)); # restore $rounds
1485 &mov ($key,$key_); # restore $key
1486 &mov ($rounds_,$rounds);
# Tail: dispatches to xts_dec_one/two/three/four or falls through to the
# five-block path, computing one further tweak before each branch and
# parking earlier tweaks in $inout3/$inout4/$inout5 or on the stack.
1488 &set_label("xts_dec_short");
1490 &jz (&label("xts_dec_done6x"));
1492 &movdqa ($inout3,$tweak); # put aside previous tweak
1494 &jb (&label("xts_dec_one"));
1496 &pshufd ($twres,$twtmp,0x13);
1497 &pxor ($twtmp,$twtmp);
1498 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1499 &pand ($twres,$twmask); # isolate carry and residue
1500 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1501 &pxor ($tweak,$twres);
1502 &je (&label("xts_dec_two"));
1504 &pshufd ($twres,$twtmp,0x13);
1505 &pxor ($twtmp,$twtmp);
1506 &movdqa ($inout4,$tweak); # put aside previous tweak
1507 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1508 &pand ($twres,$twmask); # isolate carry and residue
1509 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1510 &pxor ($tweak,$twres);
1512 &jb (&label("xts_dec_three"));
1514 &pshufd ($twres,$twtmp,0x13);
1515 &pxor ($twtmp,$twtmp);
1516 &movdqa ($inout5,$tweak); # put aside previous tweak
1517 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1518 &pand ($twres,$twmask); # isolate carry and residue
1519 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1520 &pxor ($tweak,$twres);
1521 &movdqa (&QWP(16*0,"esp"),$inout3);
1522 &movdqa (&QWP(16*1,"esp"),$inout4);
1523 &je (&label("xts_dec_four"));
# five-block path: spill the remaining tweaks and compute the fifth in $inout5
1525 &movdqa (&QWP(16*2,"esp"),$inout5);
1526 &pshufd ($inout5,$twtmp,0x13);
1527 &movdqa (&QWP(16*3,"esp"),$tweak);
1528 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1529 &pand ($inout5,$twmask); # isolate carry and residue
1530 &pxor ($inout5,$tweak);
1532 &movdqu ($inout0,&QWP(16*0,$inp)); # load input
1533 &movdqu ($inout1,&QWP(16*1,$inp));
1534 &movdqu ($inout2,&QWP(16*2,$inp));
1535 &pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
1536 &movdqu ($inout3,&QWP(16*3,$inp));
1537 &pxor ($inout1,&QWP(16*1,"esp"));
1538 &movdqu ($inout4,&QWP(16*4,$inp));
1539 &pxor ($inout2,&QWP(16*2,"esp"));
1540 &lea ($inp,&DWP(16*5,$inp));
1541 &pxor ($inout3,&QWP(16*3,"esp"));
1542 &movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
1543 &pxor ($inout4,$inout5);
1545 &call ("_aesni_decrypt6");
1547 &movaps ($tweak,&QWP(16*4,"esp")); # last tweak
1548 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1549 &xorps ($inout1,&QWP(16*1,"esp"));
1550 &xorps ($inout2,&QWP(16*2,"esp"));
1551 &movups (&QWP(16*0,$out),$inout0); # write output
1552 &xorps ($inout3,&QWP(16*3,"esp"));
1553 &movups (&QWP(16*1,$out),$inout1);
1554 &xorps ($inout4,$tweak);
1555 &movups (&QWP(16*2,$out),$inout2);
1556 &movups (&QWP(16*3,$out),$inout3);
1557 &movups (&QWP(16*4,$out),$inout4);
1558 &lea ($out,&DWP(16*5,$out));
1559 &jmp (&label("xts_dec_done"));
# one remaining block, tweak in $inout3
1561 &set_label("xts_dec_one",16);
1562 &movups ($inout0,&QWP(16*0,$inp)); # load input
1563 &lea ($inp,&DWP(16*1,$inp));
1564 &xorps ($inout0,$inout3); # input^=tweak
1566 { &aesni_inline_generate1("dec"); }
1568 { &call ("_aesni_decrypt1"); }
1569 &xorps ($inout0,$inout3); # output^=tweak
1570 &movups (&QWP(16*0,$out),$inout0); # write output
1571 &lea ($out,&DWP(16*1,$out));
1573 &movdqa ($tweak,$inout3); # last tweak
1574 &jmp (&label("xts_dec_done"));
# two remaining blocks, tweaks in $inout3/$inout4; the three-block helper
# is called and only its first two results are stored
1576 &set_label("xts_dec_two",16);
1577 &movaps ($inout4,$tweak); # put aside last tweak
1579 &movups ($inout0,&QWP(16*0,$inp)); # load input
1580 &movups ($inout1,&QWP(16*1,$inp));
1581 &lea ($inp,&DWP(16*2,$inp));
1582 &xorps ($inout0,$inout3); # input^=tweak
1583 &xorps ($inout1,$inout4);
1585 &call ("_aesni_decrypt3");
1587 &xorps ($inout0,$inout3); # output^=tweak
1588 &xorps ($inout1,$inout4);
1589 &movups (&QWP(16*0,$out),$inout0); # write output
1590 &movups (&QWP(16*1,$out),$inout1);
1591 &lea ($out,&DWP(16*2,$out));
1593 &movdqa ($tweak,$inout4); # last tweak
1594 &jmp (&label("xts_dec_done"));
# three remaining blocks, tweaks in $inout3/$inout4/$inout5
1596 &set_label("xts_dec_three",16);
1597 &movaps ($inout5,$tweak); # put aside last tweak
1598 &movups ($inout0,&QWP(16*0,$inp)); # load input
1599 &movups ($inout1,&QWP(16*1,$inp));
1600 &movups ($inout2,&QWP(16*2,$inp));
1601 &lea ($inp,&DWP(16*3,$inp));
1602 &xorps ($inout0,$inout3); # input^=tweak
1603 &xorps ($inout1,$inout4);
1604 &xorps ($inout2,$inout5);
1606 &call ("_aesni_decrypt3");
1608 &xorps ($inout0,$inout3); # output^=tweak
1609 &xorps ($inout1,$inout4);
1610 &xorps ($inout2,$inout5);
1611 &movups (&QWP(16*0,$out),$inout0); # write output
1612 &movups (&QWP(16*1,$out),$inout1);
1613 &movups (&QWP(16*2,$out),$inout2);
1614 &lea ($out,&DWP(16*3,$out));
1616 &movdqa ($tweak,$inout5); # last tweak
1617 &jmp (&label("xts_dec_done"));
# four remaining blocks: first two tweaks on the stack, third in $inout5,
# fourth in $inout4
1619 &set_label("xts_dec_four",16);
1620 &movaps ($inout4,$tweak); # put aside last tweak
1622 &movups ($inout0,&QWP(16*0,$inp)); # load input
1623 &movups ($inout1,&QWP(16*1,$inp));
1624 &movups ($inout2,&QWP(16*2,$inp));
1625 &xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
1626 &movups ($inout3,&QWP(16*3,$inp));
1627 &lea ($inp,&DWP(16*4,$inp));
1628 &xorps ($inout1,&QWP(16*1,"esp"));
1629 &xorps ($inout2,$inout5);
1630 &xorps ($inout3,$inout4);
1632 &call ("_aesni_decrypt4");
1634 &xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
1635 &xorps ($inout1,&QWP(16*1,"esp"));
1636 &xorps ($inout2,$inout5);
1637 &movups (&QWP(16*0,$out),$inout0); # write output
1638 &xorps ($inout3,$inout4);
1639 &movups (&QWP(16*1,$out),$inout1);
1640 &movups (&QWP(16*2,$out),$inout2);
1641 &movups (&QWP(16*3,$out),$inout3);
1642 &lea ($out,&DWP(16*4,$out));
1644 &movdqa ($tweak,$inout4); # last tweak
1645 &jmp (&label("xts_dec_done"));
# reached from xts_dec_short when no whole blocks remain in this pass; the
# current tweak is already in $tweak, so only one more update is needed
1647 &set_label("xts_dec_done6x",16); # $tweak is pre-calculated
1648 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1650 &jz (&label("xts_dec_ret"));
1651 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1652 &jmp (&label("xts_dec_only_one_more"));
# reached from the tail paths with the last used tweak in $tweak; advance
# it once here, then once more at xts_dec_only_one_more
1654 &set_label("xts_dec_done",16);
1655 &mov ($len,&DWP(16*7+0,"esp")); # restore original $len
1656 &pxor ($twtmp,$twtmp);
1658 &jz (&label("xts_dec_ret"));
1660 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1661 &mov (&DWP(16*7+0,"esp"),$len); # save $len%16
1662 &pshufd ($twres,$twtmp,0x13);
1663 &pxor ($twtmp,$twtmp);
1664 &movdqa ($twmask,&QWP(16*6,"esp"));
1665 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1666 &pand ($twres,$twmask); # isolate carry and residue
1667 &pcmpgtd($twtmp,$tweak); # broadcast upper bits
1668 &pxor ($tweak,$twres);
# Two tweaks are needed for the final blocks: the current one is kept in
# $inout4 and its successor is computed into $inout3.  The block at $inp
# is decrypted with the *later* tweak ($inout3) first; the block rebuilt
# by the stealing loop is decrypted afterwards with the earlier one.
1670 &set_label("xts_dec_only_one_more");
1671 &pshufd ($inout3,$twtmp,0x13);
1672 &movdqa ($inout4,$tweak); # put aside previous tweak
1673 &paddq ($tweak,$tweak); # &psllq($tweak,1);
1674 &pand ($inout3,$twmask); # isolate carry and residue
1675 &pxor ($inout3,$tweak);
1677 &mov ($key,$key_); # restore $key
1678 &mov ($rounds,$rounds_); # restore $rounds
1680 &movups ($inout0,&QWP(0,$inp)); # load input
1681 &xorps ($inout0,$inout3); # input^=tweak
1683 { &aesni_inline_generate1("dec"); }
1685 { &call ("_aesni_decrypt1"); }
1686 &xorps ($inout0,$inout3); # output^=tweak
1687 &movups (&QWP(0,$out),$inout0); # write output
# Ciphertext stealing: byte by byte, the trailing input byte at 16($inp)
# replaces a byte of the block just written at 0($out), and the displaced
# byte is written at 16($out).  $rounds and $key serve as byte scratch
# here and are restored afterwards.
1689 &set_label("xts_dec_steal");
1690 &movz ($rounds,&BP(16,$inp));
1691 &movz ($key,&BP(0,$out));
1692 &lea ($inp,&DWP(1,$inp));
1693 &mov (&BP(0,$out),&LB($rounds));
1694 &mov (&BP(16,$out),&LB($key));
1695 &lea ($out,&DWP(1,$out));
1697 &jnz (&label("xts_dec_steal"));
# decrypt the patched block in place using the earlier tweak kept in $inout4
1699 &sub ($out,&DWP(16*7+0,"esp")); # rewind $out
1700 &mov ($key,$key_); # restore $key
1701 &mov ($rounds,$rounds_); # restore $rounds
1703 &movups ($inout0,&QWP(0,$out)); # load input
1704 &xorps ($inout0,$inout4); # input^=tweak
1706 { &aesni_inline_generate1("dec"); }
1708 { &call ("_aesni_decrypt1"); }
1709 &xorps ($inout0,$inout4); # output^=tweak
1710 &movups (&QWP(0,$out),$inout0); # write output
1712 &set_label("xts_dec_ret");
1713 &mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
1714 &function_end("aesni_xts_decrypt");
1718 ######################################################################
1719 # void $PREFIX_cbc_encrypt (const void *inp, void *out,
1720 # size_t length, const AES_KEY *key,
1721 # unsigned char *ivp,const int enc);
# ${PREFIX}_cbc_encrypt
#
# Arguments are fetched with wparam(0..5): inp, out, length, key, ivp, enc.
# wparam(5) is compared with 0 and the code branches to cbc_decrypt when
# it is zero; otherwise the encrypt path is taken.
#
# A 16-byte-aligned copy of %esp is built in $rounds_ and swapped into
# %esp.  On that scratch area:
#   0(%esp)    IV / last-block spill used by the decrypt paths
#   16(%esp)   previous %esp, reloaded at cbc_ret
# At cbc_ret the running IV in $ivec is written back through ivp.
1722 &function_begin("${PREFIX}_cbc_encrypt");
1723 &mov ($inp,&wparam(0));
1724 &mov ($rounds_,"esp");
1725 &mov ($out,&wparam(1));
1727 &mov ($len,&wparam(2));
1728 &and ($rounds_,-16);
1729 &mov ($key,&wparam(3));
1730 &mov ($key_,&wparam(4));
1732 &jz (&label("cbc_abort"));
# the flags set by this cmp are consumed by the je at the end of the
# prologue; none of the xchg/mov/movups in between modify flags
1734 &cmp (&wparam(5),0);
1735 &xchg ($rounds_,"esp"); # alloca
1736 &movups ($ivec,&QWP(0,$key_)); # load IV
1737 &mov ($rounds,&DWP(240,$key));
1738 &mov ($key_,$key); # backup $key
1739 &mov (&DWP(16,"esp"),$rounds_); # save original %esp
1740 &mov ($rounds_,$rounds); # backup $rounds
1741 &je (&label("cbc_decrypt"));
# ---- encrypt path: one block at a time, chaining through $inout0 ----
1743 &movaps ($inout0,$ivec);
1745 &jb (&label("cbc_enc_tail"));
1747 &jmp (&label("cbc_enc_loop"));
1749 &set_label("cbc_enc_loop",16);
1750 &movups ($ivec,&QWP(0,$inp)); # input actually
1751 &lea ($inp,&DWP(16,$inp));
1753 { &aesni_inline_generate1("enc",$inout0,$ivec); }
1755 { &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
1756 &mov ($rounds,$rounds_); # restore $rounds
1757 &mov ($key,$key_); # restore $key
1758 &movups (&QWP(0,$out),$inout0); # store output
1759 &lea ($out,&DWP(16,$out));
1761 &jnc (&label("cbc_enc_loop"));
1763 &jnz (&label("cbc_enc_tail"));
1764 &movaps ($ivec,$inout0);
1765 &jmp (&label("cbc_ret"));
# partial final block: copy the remaining bytes to the output buffer,
# zero-fill, then run the loop once more over that block in place
1767 &set_label("cbc_enc_tail");
1768 &mov ("ecx",$len); # zaps $rounds
1769 &data_word(0xA4F3F689); # rep movsb
1770 &mov ("ecx",16); # zero tail
1772 &xor ("eax","eax"); # zaps $len
1773 &data_word(0xAAF3F689); # rep stosb
1774 &lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
1775 &mov ($rounds,$rounds_); # restore $rounds
1776 &mov ($inp,$out); # $inp and $out are the same
1777 &mov ($key,$key_); # restore $key
1778 &jmp (&label("cbc_enc_loop"));
1779 ######################################################################
# ---- decrypt path: six blocks per iteration while enough input remains ----
1780 &set_label("cbc_decrypt",16);
1782 &jbe (&label("cbc_dec_tail"));
1783 &movaps (&QWP(0,"esp"),$ivec); # save IV
1785 &jmp (&label("cbc_dec_loop6_enter"));
# loop re-entry: the sixth plaintext block of the previous iteration is
# stored here, and the last ciphertext block becomes the saved IV
1787 &set_label("cbc_dec_loop6",16);
1788 &movaps (&QWP(0,"esp"),$rndkey0); # save IV
1789 &movups (&QWP(0,$out),$inout5);
1790 &lea ($out,&DWP(0x10,$out));
1791 &set_label("cbc_dec_loop6_enter");
1792 &movdqu ($inout0,&QWP(0,$inp));
1793 &movdqu ($inout1,&QWP(0x10,$inp));
1794 &movdqu ($inout2,&QWP(0x20,$inp));
1795 &movdqu ($inout3,&QWP(0x30,$inp));
1796 &movdqu ($inout4,&QWP(0x40,$inp));
1797 &movdqu ($inout5,&QWP(0x50,$inp));
1799 &call ("_aesni_decrypt6");
# xor each decrypted block with the preceding ciphertext block, which is
# reloaded from $inp; the first block uses the IV saved at 0(%esp)
1801 &movups ($rndkey1,&QWP(0,$inp));
1802 &movups ($rndkey0,&QWP(0x10,$inp));
1803 &xorps ($inout0,&QWP(0,"esp")); # ^=IV
1804 &xorps ($inout1,$rndkey1);
1805 &movups ($rndkey1,&QWP(0x20,$inp));
1806 &xorps ($inout2,$rndkey0);
1807 &movups ($rndkey0,&QWP(0x30,$inp));
1808 &xorps ($inout3,$rndkey1);
1809 &movups ($rndkey1,&QWP(0x40,$inp));
1810 &xorps ($inout4,$rndkey0);
1811 &movups ($rndkey0,&QWP(0x50,$inp)); # IV
1812 &xorps ($inout5,$rndkey1);
1813 &movups (&QWP(0,$out),$inout0);
1814 &movups (&QWP(0x10,$out),$inout1);
1815 &lea ($inp,&DWP(0x60,$inp));
1816 &movups (&QWP(0x20,$out),$inout2);
1817 &mov ($rounds,$rounds_); # restore $rounds
1818 &movups (&QWP(0x30,$out),$inout3);
1819 &mov ($key,$key_); # restore $key
1820 &movups (&QWP(0x40,$out),$inout4);
1821 &lea ($out,&DWP(0x50,$out));
1823 &ja (&label("cbc_dec_loop6"));
1825 &movaps ($inout0,$inout5);
1826 &movaps ($ivec,$rndkey0);
1828 &jle (&label("cbc_dec_tail_collected"));
1829 &movups (&QWP(0,$out),$inout0);
1830 &lea ($out,&DWP(0x10,$out));
# decrypt tail: load blocks one at a time and dispatch to
# cbc_dec_one/two/three/four, or fall through to the five-block case;
# $in0/$in1 keep copies of the first two ciphertext blocks for chaining
1831 &set_label("cbc_dec_tail");
1832 &movups ($inout0,&QWP(0,$inp));
1833 &movaps ($in0,$inout0);
1835 &jbe (&label("cbc_dec_one"));
1837 &movups ($inout1,&QWP(0x10,$inp));
1838 &movaps ($in1,$inout1);
1840 &jbe (&label("cbc_dec_two"));
1842 &movups ($inout2,&QWP(0x20,$inp));
1844 &jbe (&label("cbc_dec_three"));
1846 &movups ($inout3,&QWP(0x30,$inp));
1848 &jbe (&label("cbc_dec_four"));
# five blocks: use the six-block helper with a zeroed sixth lane
1850 &movups ($inout4,&QWP(0x40,$inp));
1851 &movaps (&QWP(0,"esp"),$ivec); # save IV
1852 &movups ($inout0,&QWP(0,$inp));
1853 &xorps ($inout5,$inout5);
1854 &call ("_aesni_decrypt6");
1855 &movups ($rndkey1,&QWP(0,$inp));
1856 &movups ($rndkey0,&QWP(0x10,$inp));
1857 &xorps ($inout0,&QWP(0,"esp")); # ^= IV
1858 &xorps ($inout1,$rndkey1);
1859 &movups ($rndkey1,&QWP(0x20,$inp));
1860 &xorps ($inout2,$rndkey0);
1861 &movups ($rndkey0,&QWP(0x30,$inp));
1862 &xorps ($inout3,$rndkey1);
1863 &movups ($ivec,&QWP(0x40,$inp)); # IV
1864 &xorps ($inout4,$rndkey0);
1865 &movups (&QWP(0,$out),$inout0);
1866 &movups (&QWP(0x10,$out),$inout1);
1867 &movups (&QWP(0x20,$out),$inout2);
1868 &movups (&QWP(0x30,$out),$inout3);
1869 &lea ($out,&DWP(0x40,$out));
1870 &movaps ($inout0,$inout4);
1872 &jmp (&label("cbc_dec_tail_collected"));
# Each tail case below leaves the final plaintext block in $inout0 (it is
# stored at cbc_dec_tail_collected) and the next IV in $ivec.
1874 &set_label("cbc_dec_one",16);
1876 { &aesni_inline_generate1("dec"); }
1878 { &call ("_aesni_decrypt1"); }
1879 &xorps ($inout0,$ivec);
1880 &movaps ($ivec,$in0);
1882 &jmp (&label("cbc_dec_tail_collected"));
# two blocks: three-block helper with a zeroed third lane
1884 &set_label("cbc_dec_two",16);
1885 &xorps ($inout2,$inout2);
1886 &call ("_aesni_decrypt3");
1887 &xorps ($inout0,$ivec);
1888 &xorps ($inout1,$in0);
1889 &movups (&QWP(0,$out),$inout0);
1890 &movaps ($inout0,$inout1);
1891 &lea ($out,&DWP(0x10,$out));
1892 &movaps ($ivec,$in1);
1894 &jmp (&label("cbc_dec_tail_collected"));
1896 &set_label("cbc_dec_three",16);
1897 &call ("_aesni_decrypt3");
1898 &xorps ($inout0,$ivec);
1899 &xorps ($inout1,$in0);
1900 &xorps ($inout2,$in1);
1901 &movups (&QWP(0,$out),$inout0);
1902 &movaps ($inout0,$inout2);
1903 &movups (&QWP(0x10,$out),$inout1);
1904 &lea ($out,&DWP(0x20,$out));
1905 &movups ($ivec,&QWP(0x20,$inp));
1907 &jmp (&label("cbc_dec_tail_collected"));
# four blocks: ciphertext blocks 2 and 3 are reloaded from $inp for chaining
1909 &set_label("cbc_dec_four",16);
1910 &call ("_aesni_decrypt4");
1911 &movups ($rndkey1,&QWP(0x10,$inp));
1912 &movups ($rndkey0,&QWP(0x20,$inp));
1913 &xorps ($inout0,$ivec);
1914 &movups ($ivec,&QWP(0x30,$inp));
1915 &xorps ($inout1,$in0);
1916 &movups (&QWP(0,$out),$inout0);
1917 &xorps ($inout2,$rndkey1);
1918 &movups (&QWP(0x10,$out),$inout1);
1919 &xorps ($inout3,$rndkey0);
1920 &movups (&QWP(0x20,$out),$inout2);
1921 &lea ($out,&DWP(0x30,$out));
1922 &movaps ($inout0,$inout3);
# final block is in $inout0: store it whole, or take the partial path
1925 &set_label("cbc_dec_tail_collected");
1927 &jnz (&label("cbc_dec_tail_partial"));
1928 &movups (&QWP(0,$out),$inout0);
1929 &jmp (&label("cbc_ret"));
# partial final block: spill it to 0(%esp) and emit it with rep movsb
1931 &set_label("cbc_dec_tail_partial",16);
1932 &movaps (&QWP(0,"esp"),$inout0);
1936 &data_word(0xA4F3F689); # rep movsb
# common exit: switch back to the caller's stack and write the IV back
1938 &set_label("cbc_ret");
1939 &mov ("esp",&DWP(16,"esp")); # pull original %esp
1940 &mov ($key_,&wparam(4));
1941 &movups (&QWP(0,$key_),$ivec); # output IV
1942 &set_label("cbc_abort");
1943 &function_end("${PREFIX}_cbc_encrypt");
1945 ######################################################################
1946 # Mechanical port from aesni-x86_64.pl.
1948 # _aesni_set_encrypt_key is private interface,
1950 # "eax" const unsigned char *userKey
1957 &function_begin_B("_aesni_set_encrypt_key");
1958 &test ("eax","eax");
1959 &jz (&label("bad_pointer"));
1961 &jz (&label("bad_pointer"));
1963 &movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
1964 &xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
1965 &lea ($key,&DWP(16,$key));
1967 &je (&label("14rounds"));
1969 &je (&label("12rounds"));
1971 &jne (&label("bad_keybits"));
1973 &set_label("10rounds",16);
1975 &$movekey (&QWP(-16,$key),"xmm0"); # round 0
1976 &aeskeygenassist("xmm1","xmm0",0x01); # round 1
1977 &call (&label("key_128_cold"));
1978 &aeskeygenassist("xmm1","xmm0",0x2); # round 2
1979 &call (&label("key_128"));
1980 &aeskeygenassist("xmm1","xmm0",0x04); # round 3
1981 &call (&label("key_128"));
1982 &aeskeygenassist("xmm1","xmm0",0x08); # round 4
1983 &call (&label("key_128"));
1984 &aeskeygenassist("xmm1","xmm0",0x10); # round 5
1985 &call (&label("key_128"));
1986 &aeskeygenassist("xmm1","xmm0",0x20); # round 6
1987 &call (&label("key_128"));
1988 &aeskeygenassist("xmm1","xmm0",0x40); # round 7
1989 &call (&label("key_128"));
1990 &aeskeygenassist("xmm1","xmm0",0x80); # round 8
1991 &call (&label("key_128"));
1992 &aeskeygenassist("xmm1","xmm0",0x1b); # round 9
1993 &call (&label("key_128"));
1994 &aeskeygenassist("xmm1","xmm0",0x36); # round 10
1995 &call (&label("key_128"));
1996 &$movekey (&QWP(0,$key),"xmm0");
1997 &mov (&DWP(80,$key),$rounds);
2001 &set_label("key_128",16);
2002 &$movekey (&QWP(0,$key),"xmm0");
2003 &lea ($key,&DWP(16,$key));
2004 &set_label("key_128_cold");
2005 &shufps ("xmm4","xmm0",0b00010000);
2006 &xorps ("xmm0","xmm4");
2007 &shufps ("xmm4","xmm0",0b10001100);
2008 &xorps ("xmm0","xmm4");
2009 &shufps ("xmm1","xmm1",0b11111111); # critical path
2010 &xorps ("xmm0","xmm1");
2013 &set_label("12rounds",16);
2014 &movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
2016 &$movekey (&QWP(-16,$key),"xmm0") # round 0
2017 &aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
2018 &call (&label("key_192a_cold"));
2019 &aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
2020 &call (&label("key_192b"));
2021 &aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
2022 &call (&label("key_192a"));
2023 &aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
2024 &call (&label("key_192b"));
2025 &aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
2026 &call (&label("key_192a"));
2027 &aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
2028 &call (&label("key_192b"));
2029 &aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
2030 &call (&label("key_192a"));
2031 &aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
2032 &call (&label("key_192b"));
2033 &$movekey (&QWP(0,$key),"xmm0");
2034 &mov (&DWP(48,$key),$rounds);
# Round-key helpers for the 192-bit schedule. State is carried in xmm0
# (four key dwords) and xmm2 (the remaining key dwords); xmm1 holds the
# caller's aeskeygenassist result and xmm5 a saved copy of xmm2.
#
# key_192a: write out xmm0 as a complete round key and advance $key by 16.
2038 &set_label("key_192a",16);
2039 &$movekey (&QWP(0,$key),"xmm0");
2040 &lea ($key,&DWP(16,$key));
# key_192a_cold: entry without the store; snapshot xmm2 into xmm5 so that
# key_192b can later combine it with the next xmm0.
2041 &set_label("key_192a_cold",16);
2042 &movaps ("xmm5","xmm2");
# key_192b_warm: shared expansion step, also reached by the jmp at the end
# of key_192b.
2043 &set_label("key_192b_warm");
# Fold the dwords of xmm0 into each other through xmm4 while taking a
# working copy of xmm2 in xmm3.
# NOTE(review): relies on the low dword of xmm4 being zero on entry; that
# setup is not visible here -- confirm.
2044 &shufps ("xmm4","xmm0",0b00010000);
2045 &movdqa ("xmm3","xmm2");
2046 &xorps ("xmm0","xmm4");
2047 &shufps ("xmm4","xmm0",0b10001100);
# NOTE(review): the embedded numbering skips one line here; an instruction
# that adjusts xmm3 before it is XORed into xmm2 below may be elided --
# confirm against the complete file.
2049 &xorps ("xmm0","xmm4");
# Broadcast dword 1 of the aeskeygenassist result and fold it into xmm0;
# then broadcast the new top dword of xmm0 and fold it into xmm2.
2050 &pshufd ("xmm1","xmm1",0b01010101); # critical path
2051 &pxor ("xmm2","xmm3");
2052 &pxor ("xmm0","xmm1");
2053 &pshufd ("xmm3","xmm0",0b11111111);
2054 &pxor ("xmm2","xmm3");
# NOTE(review): the helper's return is not visible in this listing --
# confirm.
#
# key_192b: emit two round keys at once, then fall into the shared
# expansion step. The first is the low half of xmm5 followed by the low
# half of xmm0; the second is the high half of xmm0 followed by the low
# half of xmm2. $key advances by 32.
2057 &set_label("key_192b",16);
2058 &movaps ("xmm3","xmm0");
2059 &shufps ("xmm5","xmm0",0b01000100);
2060 &$movekey (&QWP(0,$key),"xmm5");
2061 &shufps ("xmm3","xmm2",0b01001110);
2062 &$movekey (&QWP(16,$key),"xmm3");
2063 &lea ($key,&DWP(32,$key));
2064 &jmp (&label("key_192b_warm"));
# 256-bit key path. On entry xmm0 already holds the first 16 bytes of the
# user key and eax points at the user key; this section loads the second
# 16 bytes into xmm2, stores both halves as round keys 0 and 1, and then
# alternates the key_256a / key_256b helpers to derive the rest.
# NOTE(review): the value placed in $rounds before it is stored at the end
# of this section is set on a line that is not visible in this listing --
# confirm against the complete file.
2066 &set_label("14rounds",16);
2067 &movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
# Advance $key first so the two stores below use negative offsets.
2069 &lea ($key,&DWP(16,$key));
2070 &$movekey (&QWP(-32,$key),"xmm0"); # round 0
2071 &$movekey (&QWP(-16,$key),"xmm2"); # round 1
# Even rounds feed xmm2 to aeskeygenassist and update xmm0 (key_256a);
# odd rounds feed xmm0 and update xmm2 (key_256b). The first step enters
# through the "cold" label, which skips key_256a's store/advance.
2072 &aeskeygenassist("xmm1","xmm2",0x01); # round 2
2073 &call (&label("key_256a_cold"));
2074 &aeskeygenassist("xmm1","xmm0",0x01); # round 3
2075 &call (&label("key_256b"));
2076 &aeskeygenassist("xmm1","xmm2",0x02); # round 4
2077 &call (&label("key_256a"));
2078 &aeskeygenassist("xmm1","xmm0",0x02); # round 5
2079 &call (&label("key_256b"));
2080 &aeskeygenassist("xmm1","xmm2",0x04); # round 6
2081 &call (&label("key_256a"));
2082 &aeskeygenassist("xmm1","xmm0",0x04); # round 7
2083 &call (&label("key_256b"));
2084 &aeskeygenassist("xmm1","xmm2",0x08); # round 8
2085 &call (&label("key_256a"));
2086 &aeskeygenassist("xmm1","xmm0",0x08); # round 9
2087 &call (&label("key_256b"));
2088 &aeskeygenassist("xmm1","xmm2",0x10); # round 10
2089 &call (&label("key_256a"));
2090 &aeskeygenassist("xmm1","xmm0",0x10); # round 11
2091 &call (&label("key_256b"));
2092 &aeskeygenassist("xmm1","xmm2",0x20); # round 12
2093 &call (&label("key_256a"));
2094 &aeskeygenassist("xmm1","xmm0",0x20); # round 13
2095 &call (&label("key_256b"));
2096 &aeskeygenassist("xmm1","xmm2",0x40); # round 14
2097 &call (&label("key_256a"));
# Flush the last round key and record $rounds 16 bytes past the current
# $key position.
2098 &$movekey (&QWP(0,$key),"xmm0");
2099 &mov (&DWP(16,$key),$rounds);
# NOTE(review): the success return of this path is not visible in this
# listing (the embedded numbering skips after this point) -- confirm.
# Round-key helpers for the 256-bit schedule. xmm0 and xmm2 hold the two
# most recent round keys; xmm1 holds the caller's aeskeygenassist result.
#
# key_256a: write out xmm2, advance $key by 16, then derive the next value
# of xmm0.
2103 &set_label("key_256a",16);
2104 &$movekey (&QWP(0,$key),"xmm2");
2105 &lea ($key,&DWP(16,$key));
# key_256a_cold: entry without the store/advance.
2106 &set_label("key_256a_cold");
# XOR every dword of xmm0 with all lower-numbered dwords of xmm0.
# NOTE(review): relies on the low dword of xmm4 being zero on entry; that
# setup is not visible here -- confirm.
2107 &shufps ("xmm4","xmm0",0b00010000);
2108 &xorps ("xmm0","xmm4");
2109 &shufps ("xmm4","xmm0",0b10001100);
2110 &xorps ("xmm0","xmm4");
# Broadcast dword 3 of the aeskeygenassist result and fold it into xmm0.
2111 &shufps ("xmm1","xmm1",0b11111111); # critical path
2112 &xorps ("xmm0","xmm1");
# NOTE(review): the helper's return is not visible in this listing --
# confirm.
#
# key_256b: write out xmm0, advance $key by 16, then derive the next value
# of xmm2 the same way, except that dword 2 (not 3) of the aeskeygenassist
# result is broadcast.
2115 &set_label("key_256b",16);
2116 &$movekey (&QWP(0,$key),"xmm0");
2117 &lea ($key,&DWP(16,$key));
2119 &shufps ("xmm4","xmm2",0b00010000);
2120 &xorps ("xmm2","xmm4");
2121 &shufps ("xmm4","xmm2",0b10001100);
2122 &xorps ("xmm2","xmm4");
2123 &shufps ("xmm1","xmm1",0b10101010); # critical path
2124 &xorps ("xmm2","xmm1");
# NOTE(review): the helper's return is not visible in this listing --
# confirm.
# Error exits of _aesni_set_encrypt_key (second set_label argument, 4, is
# presumably the label alignment).
# NOTE(review): the instructions under both labels are not visible in this
# listing; presumably each loads a distinct non-zero return code into eax
# and returns -- confirm against the complete file.
2127 &set_label("bad_pointer",4);
2130 &set_label("bad_keybits",4);
# End of the private _aesni_set_encrypt_key routine.
2133 &function_end_B("_aesni_set_encrypt_key");
2135 # int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# Public wrapper: moves the three stack arguments into the registers the
# private routine expects (argument 0 -> eax, argument 1 -> $rounds,
# argument 2 -> $key) and calls _aesni_set_encrypt_key.
2137 &function_begin_B("${PREFIX}_set_encrypt_key");
2138 &mov ("eax",&wparam(0));
2139 &mov ($rounds,&wparam(1));
2140 &mov ($key,&wparam(2));
2141 &call ("_aesni_set_encrypt_key");
# NOTE(review): the return instruction is not visible in this listing;
# presumably eax from the callee is passed through unchanged -- confirm.
2143 &function_end_B("${PREFIX}_set_encrypt_key");
2145 # int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# Public entry point: builds the encryption key schedule with
# _aesni_set_encrypt_key and then converts it in place into a decryption
# schedule by reversing the order of the round keys and applying aesimc to
# every round key except the first and last.
# Arguments are taken from the stack: argument 0 -> eax, argument 1 ->
# $rounds, argument 2 -> $key. A non-zero eax from the callee is passed
# through as the return value without touching the schedule; on success
# eax is cleared to zero.
2147 &function_begin_B("${PREFIX}_set_decrypt_key");
2148 &mov ("eax",&wparam(0));
2149 &mov ($rounds,&wparam(1));
2150 &mov ($key,&wparam(2));
2151 &call ("_aesni_set_encrypt_key");
# Reload $key (the callee advanced it) and scale $rounds to a byte offset.
# Each emitter call must be its own Perl statement; without the terminating
# semicolon the shl call would be fused with the following test call into a
# single bitwise-AND expression.
2152 &mov ($key,&wparam(2));
2153 &shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
2154 &test ("eax","eax");
2155 &jnz (&label("dec_key_ret"));
2156 &lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
# Exchange the first and last round keys unchanged, then move both
# pointers one slot towards the middle.
2158 &$movekey ("xmm0",&QWP(0,$key)); # just swap
2159 &$movekey ("xmm1",&QWP(0,"eax"));
2160 &$movekey (&QWP(0,"eax"),"xmm0");
2161 &$movekey (&QWP(0,$key),"xmm1");
2162 &lea ($key,&DWP(16,$key));
2163 &lea ("eax",&DWP(-16,"eax"));
# Loop: take one round key from each end, run aesimc on both, and store
# each into the slot the other came from. The pointers are stepped before
# the stores, hence the +16 / -16 displacements.
2165 &set_label("dec_key_inverse");
2166 &$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
2167 &$movekey ("xmm1",&QWP(0,"eax"));
2168 &aesimc ("xmm0","xmm0");
2169 &aesimc ("xmm1","xmm1");
2170 &lea ($key,&DWP(16,$key));
2171 &lea ("eax",&DWP(-16,"eax"));
2172 &$movekey (&QWP(16,"eax"),"xmm0");
2173 &$movekey (&QWP(-16,$key),"xmm1");
# NOTE(review): the instruction that sets the flags for the branch below is
# not visible in this listing (the embedded numbering skips one line);
# presumably it compares the two pointers -- confirm.
2175 &ja (&label("dec_key_inverse"));
# The pointers have met: the middle round key only needs aesimc in place.
2177 &$movekey ("xmm0",&QWP(0,$key)); # inverse middle
2178 &aesimc ("xmm0","xmm0");
2179 &$movekey (&QWP(0,$key),"xmm0");
2181 &xor ("eax","eax"); # return success
2182 &set_label("dec_key_ret");
# NOTE(review): the return instruction after this label is not visible in
# this listing -- confirm.
2184 &function_end_B("${PREFIX}_set_decrypt_key");
# Module identification string, passed to the asciz emitter (presumably
# written into the generated assembly as a NUL-terminated string -- confirm).
2185 &asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");