3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer AES-NI procedures process several independent buffers
11 # in parallel by interleaving independent instructions.
13 # Cycles per byte for interleave factor 4:
16 # ---------------------------
17 # Westmere 5.00/4=1.25 5.13/4=1.28
18 # Atom 15.0/4=3.75 15.7/4=3.93
19 # Sandy Bridge 5.06/4=1.27 5.15/4=1.29
20 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
21 # Haswell 4.44/4=1.11 4.44/4=1.11
22 # Bulldozer 5.75/4=1.44 5.76/4=1.44
24 # Cycles per byte for interleave factor 8 (not implemented for
25 # pre-AVX processors, where higher interleave factor incidentally
26 # doesn't result in improvement):
29 # ---------------------------
30 # Sandy Bridge 5.06/8=0.64 7.05/8=0.88(*)
31 # Ivy Bridge 5.06/8=0.64 7.02/8=0.88(*)
32 # Haswell 5.00/8=0.63 5.00/8=0.63
33 # Bulldozer 5.75/8=0.72 5.77/8=0.72
35 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
# A flavour that contains a dot is treated as the output file name:
# it is moved to $output and the flavour is left undefined.
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 is selected for nasm/masm/mingw64 flavours, or whenever the
# output file name ends in .asm.
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# $dir receives the directory part of this script's own path ($0),
# including the trailing slash or backslash.
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the x86_64-xlate.pl translator next to this script first,
# then under ../../perlasm/ relative to it; abort if neither file exists.
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
# Probe the GNU assembler reached through $ENV{CC}.  $avx is the sum of
# the two comparisons, i.e. 0, 1 or 2 (one point for version >= 2.19,
# a second for version >= 2.22).
# NOTE(review): presumably $avx expresses how much AVX support the
# assembler offers -- confirm against the places that test $avx.
# NOTE(review): the captured version is compared as a decimal number,
# so a string such as "2.9" compares greater than "2.19".
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22);
# Win64 only, and only while $avx is still false: probe NASM when the
# flavour or $ENV{ASM} mentions nasm (thresholds 2.09 and 2.10).
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10);
# Win64 only, and only while $avx is still false: probe ml64 when the
# flavour mentions masm or $ENV{ASM} mentions ml64 (major version
# thresholds 10 and 11).
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11);
# Pipe everything written to OUT through the x86_64-xlate.pl translator,
# run with the same perl interpreter ($^X) and given the flavour and the
# output file name.  Abort instead of carrying on silently when the pipe
# cannot be created.
66 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
69 # void aesni_multi_cbc_encrypt (
70 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
72 # int num); /* 1 or 2 */
# Register names used by the 4-lane routines that follow.
74 $inp="%rdi"; # 1st arg
75 $key="%rsi"; # 2nd arg
# One input pointer (%r8..%r11) and one output pointer (%r12..%r15)
# per lane.
78 @inptr=map("%r$_",(8..11));
79 @outptr=map("%r$_",(12..15));
# Two round-key registers, used alternately by successive rounds.
81 ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
# Four cipher-state registers (%xmm2..%xmm5) and four registers for the
# data loaded from the input buffers (%xmm6..%xmm9).
82 @out=map("%xmm$_",(2..5));
83 @inp=map("%xmm$_",(6..9));
# %xmm10..%xmm12: per-lane block counters, a scratch copy used while
# updating them, and the register that holds the 0-round key.
84 ($counters,$mask,$zero)=map("%xmm$_",(10..12));
# General-purpose helpers: round count, the constant 1 (also borrowed
# for block counts during setup), the sink pointer and the running
# byte offset into the buffers.
86 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
91 .extern OPENSSL_ia32cap_P
93 .globl aesni_multi_cbc_encrypt
94 .type aesni_multi_cbc_encrypt,\@function,3
96 aesni_multi_cbc_encrypt:
98 $code.=<<___ if ($avx);
101 mov OPENSSL_ia32cap_P+4(%rip),%ecx
102 test \$`1<<28`,%ecx # AVX bit
103 jnz _avx_cbc_enc_shortcut
117 $code.=<<___ if ($win64);
120 movaps %xmm7,0x10(%rsp)
121 movaps %xmm8,0x20(%rsp)
122 movaps %xmm9,0x30(%rsp)
123 movaps %xmm10,0x40(%rsp)
124 movaps %xmm11,0x50(%rsp)
125 movaps %xmm12,0x60(%rsp)
131 # +16 input sink [original %rsp and $num]
136 mov %rax,16(%rsp) # original %rsp
139 movdqu ($key),$zero # 0-round key
140 lea 0x78($key),$key # size optimization
144 mov $num,24(%rsp) # original $num
147 for($i=0;$i<4;$i++) {
149 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
150 mov `40*$i+0-40*2`($inp),@inptr[$i]
152 mov `40*$i+8-40*2`($inp),@outptr[$i]
153 cmovg $one,$num # find maximum
155 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
156 mov $one,`32+4*$i`(%rsp) # initialize counters
157 cmovle %rsp,@inptr[$i] # cancel input
164 movups 0x10-0x78($key),$rndkey1
166 movups 0x20-0x78($key),$rndkey0
168 mov 0xf0-0x78($key),$rounds
170 movdqu (@inptr[0]),@inp[0] # load inputs
172 movdqu (@inptr[1]),@inp[1]
174 movdqu (@inptr[2]),@inp[2]
176 movdqu (@inptr[3]),@inp[3]
179 movdqa 32(%rsp),$counters # load counters
186 lea 16(%rsp),$sink # sink pointer
187 mov \$1,$one # constant of 1
190 aesenc $rndkey1,@out[0]
191 aesenc $rndkey1,@out[1]
192 aesenc $rndkey1,@out[2]
193 aesenc $rndkey1,@out[3]
194 movups 0x30-0x78($key),$rndkey1
196 for($i=0;$i<4;$i++) {
197 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
199 cmp `32+4*$i`(%rsp),$one
200 aesenc $rndkey,@out[0]
201 aesenc $rndkey,@out[1]
202 cmovge $sink,@inptr[$i] # cancel input
203 aesenc $rndkey,@out[2]
204 cmovg $sink,@outptr[$i] # sink output
205 aesenc $rndkey,@out[3]
206 movups `0x40+16*$i-0x78`($key),$rndkey
210 movdqa $counters,$mask
211 aesenc $rndkey0,@out[0]
212 aesenc $rndkey0,@out[1]
213 aesenc $rndkey0,@out[2]
214 aesenc $rndkey0,@out[3]
215 movups 0x80-0x78($key),$rndkey0
218 aesenc $rndkey1,@out[0]
220 movdqu -0x78($key),$zero # reload 0-round key
221 aesenc $rndkey1,@out[1]
222 paddd $mask,$counters # decrement counters
223 movdqa $counters,32(%rsp) # update counters
224 aesenc $rndkey1,@out[2]
225 aesenc $rndkey1,@out[3]
226 movups 0x90-0x78($key),$rndkey1
230 aesenc $rndkey0,@out[0]
231 aesenc $rndkey0,@out[1]
232 aesenc $rndkey0,@out[2]
233 aesenc $rndkey0,@out[3]
234 movups 0xa0-0x78($key),$rndkey0
238 aesenc $rndkey1,@out[0]
239 aesenc $rndkey1,@out[1]
240 aesenc $rndkey1,@out[2]
241 aesenc $rndkey1,@out[3]
242 movups 0xb0-0x78($key),$rndkey1
244 aesenc $rndkey0,@out[0]
245 aesenc $rndkey0,@out[1]
246 aesenc $rndkey0,@out[2]
247 aesenc $rndkey0,@out[3]
248 movups 0xc0-0x78($key),$rndkey0
252 aesenc $rndkey1,@out[0]
253 aesenc $rndkey1,@out[1]
254 aesenc $rndkey1,@out[2]
255 aesenc $rndkey1,@out[3]
256 movups 0xd0-0x78($key),$rndkey1
258 aesenc $rndkey0,@out[0]
259 aesenc $rndkey0,@out[1]
260 aesenc $rndkey0,@out[2]
261 aesenc $rndkey0,@out[3]
262 movups 0xe0-0x78($key),$rndkey0
265 aesenc $rndkey1,@out[0]
266 aesenc $rndkey1,@out[1]
267 aesenc $rndkey1,@out[2]
268 movdqu (@inptr[0],$offset),@inp[0]
269 aesenc $rndkey1,@out[3]
270 movdqu 0x10-0x78($key),$rndkey1
272 aesenclast $rndkey0,@out[0]
273 movdqu (@inptr[1],$offset),@inp[1]
275 aesenclast $rndkey0,@out[1]
276 movdqu (@inptr[2],$offset),@inp[2]
278 aesenclast $rndkey0,@out[2]
279 movdqu (@inptr[3],$offset),@inp[3]
281 aesenclast $rndkey0,@out[3]
282 movdqu 0x20-0x78($key),$rndkey0
285 movups @out[0],-16(@outptr[0],$offset)
287 movups @out[1],-16(@outptr[1],$offset)
289 movups @out[2],-16(@outptr[2],$offset)
291 movups @out[3],-16(@outptr[3],$offset)
297 mov 16(%rsp),%rax # original %rsp
300 #pxor @inp[0],@out[0]
301 #pxor @inp[1],@out[1]
302 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
303 #pxor @inp[2],@out[2]
304 #movdqu @out[1],`40*1+24-40*2`($inp)
305 #pxor @inp[3],@out[3]
306 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
307 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
309 lea `40*4`($inp),$inp
311 jnz .Lenc4x_loop_grande
315 $code.=<<___ if ($win64);
316 movaps -0xa8(%rax),%xmm6
317 movaps -0x98(%rax),%xmm7
318 movaps -0x88(%rax),%xmm8
319 movaps -0x78(%rax),%xmm9
320 movaps -0x68(%rax),%xmm10
321 movaps -0x58(%rax),%xmm11
322 movaps -0x48(%rax),%xmm12
333 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
335 .globl aesni_multi_cbc_decrypt
336 .type aesni_multi_cbc_decrypt,\@function,3
338 aesni_multi_cbc_decrypt:
340 $code.=<<___ if ($avx);
343 mov OPENSSL_ia32cap_P+4(%rip),%ecx
344 test \$`1<<28`,%ecx # AVX bit
345 jnz _avx_cbc_dec_shortcut
359 $code.=<<___ if ($win64);
362 movaps %xmm7,0x10(%rsp)
363 movaps %xmm8,0x20(%rsp)
364 movaps %xmm9,0x30(%rsp)
365 movaps %xmm10,0x40(%rsp)
366 movaps %xmm11,0x50(%rsp)
367 movaps %xmm12,0x60(%rsp)
373 # +16 input sink [original %rsp and $num]
378 mov %rax,16(%rsp) # original %rsp
381 movdqu ($key),$zero # 0-round key
382 lea 0x78($key),$key # size optimization
386 mov $num,24(%rsp) # original $num
389 for($i=0;$i<4;$i++) {
391 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
392 mov `40*$i+0-40*2`($inp),@inptr[$i]
394 mov `40*$i+8-40*2`($inp),@outptr[$i]
395 cmovg $one,$num # find maximum
397 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
398 mov $one,`32+4*$i`(%rsp) # initialize counters
399 cmovle %rsp,@inptr[$i] # cancel input
406 movups 0x10-0x78($key),$rndkey1
407 movups 0x20-0x78($key),$rndkey0
408 mov 0xf0-0x78($key),$rounds
409 movdqu (@inptr[0]),@out[0] # load inputs
410 movdqu (@inptr[1]),@out[1]
412 movdqu (@inptr[2]),@out[2]
414 movdqu (@inptr[3]),@out[3]
417 movdqa 32(%rsp),$counters # load counters
424 lea 16(%rsp),$sink # sink pointer
425 mov \$1,$one # constant of 1
428 aesdec $rndkey1,@out[0]
429 aesdec $rndkey1,@out[1]
430 aesdec $rndkey1,@out[2]
431 aesdec $rndkey1,@out[3]
432 movups 0x30-0x78($key),$rndkey1
434 for($i=0;$i<4;$i++) {
435 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
437 cmp `32+4*$i`(%rsp),$one
438 aesdec $rndkey,@out[0]
439 aesdec $rndkey,@out[1]
440 cmovge $sink,@inptr[$i] # cancel input
441 aesdec $rndkey,@out[2]
442 cmovg $sink,@outptr[$i] # sink output
443 aesdec $rndkey,@out[3]
444 movups `0x40+16*$i-0x78`($key),$rndkey
448 movdqa $counters,$mask
449 aesdec $rndkey0,@out[0]
450 aesdec $rndkey0,@out[1]
451 aesdec $rndkey0,@out[2]
452 aesdec $rndkey0,@out[3]
453 movups 0x80-0x78($key),$rndkey0
456 aesdec $rndkey1,@out[0]
458 movdqu -0x78($key),$zero # reload 0-round key
459 aesdec $rndkey1,@out[1]
460 paddd $mask,$counters # decrement counters
461 movdqa $counters,32(%rsp) # update counters
462 aesdec $rndkey1,@out[2]
463 aesdec $rndkey1,@out[3]
464 movups 0x90-0x78($key),$rndkey1
468 aesdec $rndkey0,@out[0]
469 aesdec $rndkey0,@out[1]
470 aesdec $rndkey0,@out[2]
471 aesdec $rndkey0,@out[3]
472 movups 0xa0-0x78($key),$rndkey0
476 aesdec $rndkey1,@out[0]
477 aesdec $rndkey1,@out[1]
478 aesdec $rndkey1,@out[2]
479 aesdec $rndkey1,@out[3]
480 movups 0xb0-0x78($key),$rndkey1
482 aesdec $rndkey0,@out[0]
483 aesdec $rndkey0,@out[1]
484 aesdec $rndkey0,@out[2]
485 aesdec $rndkey0,@out[3]
486 movups 0xc0-0x78($key),$rndkey0
490 aesdec $rndkey1,@out[0]
491 aesdec $rndkey1,@out[1]
492 aesdec $rndkey1,@out[2]
493 aesdec $rndkey1,@out[3]
494 movups 0xd0-0x78($key),$rndkey1
496 aesdec $rndkey0,@out[0]
497 aesdec $rndkey0,@out[1]
498 aesdec $rndkey0,@out[2]
499 aesdec $rndkey0,@out[3]
500 movups 0xe0-0x78($key),$rndkey0
503 aesdec $rndkey1,@out[0]
504 aesdec $rndkey1,@out[1]
505 aesdec $rndkey1,@out[2]
506 pxor $rndkey0,@inp[0]
507 pxor $rndkey0,@inp[1]
508 aesdec $rndkey1,@out[3]
509 movdqu 0x10-0x78($key),$rndkey1
510 pxor $rndkey0,@inp[2]
511 pxor $rndkey0,@inp[3]
512 movdqu 0x20-0x78($key),$rndkey0
514 aesdeclast @inp[0],@out[0]
515 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
516 aesdeclast @inp[1],@out[1]
517 movdqu -16(@inptr[1],$offset),@inp[1]
518 aesdeclast @inp[2],@out[2]
519 movdqu -16(@inptr[2],$offset),@inp[2]
520 aesdeclast @inp[3],@out[3]
521 movdqu -16(@inptr[3],$offset),@inp[3]
523 movups @out[0],-16(@outptr[0],$offset)
524 movdqu (@inptr[0],$offset),@out[0]
525 movups @out[1],-16(@outptr[1],$offset)
526 movdqu (@inptr[1],$offset),@out[1]
528 movups @out[2],-16(@outptr[2],$offset)
529 movdqu (@inptr[2],$offset),@out[2]
531 movups @out[3],-16(@outptr[3],$offset)
532 movdqu (@inptr[3],$offset),@out[3]
539 mov 16(%rsp),%rax # original %rsp
542 lea `40*4`($inp),$inp
544 jnz .Ldec4x_loop_grande
548 $code.=<<___ if ($win64);
549 movaps -0xa8(%rax),%xmm6
550 movaps -0x98(%rax),%xmm7
551 movaps -0x88(%rax),%xmm8
552 movaps -0x78(%rax),%xmm9
553 movaps -0x68(%rax),%xmm10
554 movaps -0x58(%rax),%xmm11
555 movaps -0x48(%rax),%xmm12
566 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# Register names for the 8-lane AVX routines.  These are lexical (my)
# variables, so within their scope they take precedence over the
# package-level @out, @inp, $counters and $zero assigned earlier.
# One pointer register per lane: %r8..%r15.
570 my @ptr=map("%r$_",(8..15));
# Eight cipher-state registers (%xmm2..%xmm9); only four input registers
# (%xmm10..%xmm13), which the round loops index as $i%4.
573 my @out=map("%xmm$_",(2..9));
574 my @inp=map("%xmm$_",(10..13));
575 my ($counters,$zero)=("%xmm14","%xmm15");
578 .type aesni_multi_cbc_encrypt_avx,\@function,3
580 aesni_multi_cbc_encrypt_avx:
581 _avx_cbc_enc_shortcut:
590 $code.=<<___ if ($win64);
593 movaps %xmm7,0x10(%rsp)
594 movaps %xmm8,0x20(%rsp)
595 movaps %xmm9,0x30(%rsp)
596 movaps %xmm10,0x40(%rsp)
597 movaps %xmm11,0x50(%rsp)
598 movaps %xmm12,-0x78(%rax)
599 movaps %xmm13,-0x68(%rax)
600 movaps %xmm14,-0x58(%rax)
601 movaps %xmm15,-0x48(%rax)
607 # +16 input sink [original %rsp and $num]
609 # +64 distances between inputs and outputs
610 # +128 off-load area for @inp[0..3]
614 mov %rax,16(%rsp) # original %rsp
618 vmovdqu ($key),$zero # 0-round key
619 lea 0x78($key),$key # size optimization
624 #mov $num,24(%rsp) # original $num
627 for($i=0;$i<8;$i++) {
628 my $temp = $i ? $offload : $offset;
630 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
631 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
633 mov `40*$i+8-40*4`($inp),$temp # output pointer
634 cmovg $one,$num # find maximum
636 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
637 mov $one,`32+4*$i`(%rsp) # initialize counters
638 cmovle %rsp,@ptr[$i] # cancel input
639 sub @ptr[$i],$temp # distance between input and output
640 mov $temp,`64+8*$i`(%rsp) # initialize distances
647 vmovups 0x10-0x78($key),$rndkey1
648 vmovups 0x20-0x78($key),$rndkey0
649 mov 0xf0-0x78($key),$rounds
651 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
652 lea 128(%rsp),$offload # offload area
653 vpxor (@ptr[1]),$zero,@inp[1]
654 vpxor (@ptr[2]),$zero,@inp[2]
655 vpxor (@ptr[3]),$zero,@inp[3]
656 vpxor @inp[0],@out[0],@out[0]
657 vpxor (@ptr[4]),$zero,@inp[0]
658 vpxor @inp[1],@out[1],@out[1]
659 vpxor (@ptr[5]),$zero,@inp[1]
660 vpxor @inp[2],@out[2],@out[2]
661 vpxor (@ptr[6]),$zero,@inp[2]
662 vpxor @inp[3],@out[3],@out[3]
663 vpxor (@ptr[7]),$zero,@inp[3]
664 vpxor @inp[0],@out[4],@out[4]
665 mov \$1,$one # constant of 1
666 vpxor @inp[1],@out[5],@out[5]
667 vpxor @inp[2],@out[6],@out[6]
668 vpxor @inp[3],@out[7],@out[7]
674 for($i=0;$i<8;$i++) {
675 my $rndkey=($i&1)?$rndkey0:$rndkey1;
677 vaesenc $rndkey,@out[0],@out[0]
678 cmp 32+4*$i(%rsp),$one
680 $code.=<<___ if ($i);
681 mov 64+8*$i(%rsp),$offset
684 vaesenc $rndkey,@out[1],@out[1]
685 vaesenc $rndkey,@out[2],@out[2]
686 vaesenc $rndkey,@out[3],@out[3]
687 lea (@ptr[$i],$offset),$offset
688 cmovge %rsp,@ptr[$i] # cancel input
689 vaesenc $rndkey,@out[4],@out[4]
690 cmovg %rsp,$offset # sink output
691 vaesenc $rndkey,@out[5],@out[5]
693 vaesenc $rndkey,@out[6],@out[6]
694 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
695 mov $offset,64+8*$i(%rsp)
696 vaesenc $rndkey,@out[7],@out[7]
697 vmovups `16*(3+$i)-0x78`($key),$rndkey
698 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
700 $code.=<<___ if ($i<4);	# only lanes 0..3 off-load; ';' added to match the decrypt loop
701 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
705 vmovdqu 32(%rsp),$counters
709 vaesenc $rndkey1,@out[0],@out[0]
710 vaesenc $rndkey1,@out[1],@out[1]
711 vaesenc $rndkey1,@out[2],@out[2]
712 vaesenc $rndkey1,@out[3],@out[3]
713 vaesenc $rndkey1,@out[4],@out[4]
714 vaesenc $rndkey1,@out[5],@out[5]
715 vaesenc $rndkey1,@out[6],@out[6]
716 vaesenc $rndkey1,@out[7],@out[7]
717 vmovups 0xb0-0x78($key),$rndkey1
719 vaesenc $rndkey0,@out[0],@out[0]
720 vaesenc $rndkey0,@out[1],@out[1]
721 vaesenc $rndkey0,@out[2],@out[2]
722 vaesenc $rndkey0,@out[3],@out[3]
723 vaesenc $rndkey0,@out[4],@out[4]
724 vaesenc $rndkey0,@out[5],@out[5]
725 vaesenc $rndkey0,@out[6],@out[6]
726 vaesenc $rndkey0,@out[7],@out[7]
727 vmovups 0xc0-0x78($key),$rndkey0
730 vaesenc $rndkey1,@out[0],@out[0]
731 vaesenc $rndkey1,@out[1],@out[1]
732 vaesenc $rndkey1,@out[2],@out[2]
733 vaesenc $rndkey1,@out[3],@out[3]
734 vaesenc $rndkey1,@out[4],@out[4]
735 vaesenc $rndkey1,@out[5],@out[5]
736 vaesenc $rndkey1,@out[6],@out[6]
737 vaesenc $rndkey1,@out[7],@out[7]
738 vmovups 0xd0-0x78($key),$rndkey1
740 vaesenc $rndkey0,@out[0],@out[0]
741 vaesenc $rndkey0,@out[1],@out[1]
742 vaesenc $rndkey0,@out[2],@out[2]
743 vaesenc $rndkey0,@out[3],@out[3]
744 vaesenc $rndkey0,@out[4],@out[4]
745 vaesenc $rndkey0,@out[5],@out[5]
746 vaesenc $rndkey0,@out[6],@out[6]
747 vaesenc $rndkey0,@out[7],@out[7]
748 vmovups 0xe0-0x78($key),$rndkey0
751 vaesenc $rndkey1,@out[0],@out[0]
752 vpxor $zero,$zero,$zero
753 vaesenc $rndkey1,@out[1],@out[1]
754 vaesenc $rndkey1,@out[2],@out[2]
755 vpcmpgtd $zero,$counters,$zero
756 vaesenc $rndkey1,@out[3],@out[3]
757 vaesenc $rndkey1,@out[4],@out[4]
758 vpaddd $counters,$zero,$zero # decrement counters
759 vmovdqu 48(%rsp),$counters
760 vaesenc $rndkey1,@out[5],@out[5]
761 mov 64(%rsp),$offset # pre-load 1st offset
762 vaesenc $rndkey1,@out[6],@out[6]
763 vaesenc $rndkey1,@out[7],@out[7]
764 vmovups 0x10-0x78($key),$rndkey1
766 vaesenclast $rndkey0,@out[0],@out[0]
767 vmovdqa $zero,32(%rsp) # update counters
768 vpxor $zero,$zero,$zero
769 vaesenclast $rndkey0,@out[1],@out[1]
770 vaesenclast $rndkey0,@out[2],@out[2]
771 vpcmpgtd $zero,$counters,$zero
772 vaesenclast $rndkey0,@out[3],@out[3]
773 vaesenclast $rndkey0,@out[4],@out[4]
774 vpaddd $zero,$counters,$counters # decrement counters
775 vmovdqu -0x78($key),$zero # 0-round
776 vaesenclast $rndkey0,@out[5],@out[5]
777 vaesenclast $rndkey0,@out[6],@out[6]
778 vmovdqa $counters,48(%rsp) # update counters
779 vaesenclast $rndkey0,@out[7],@out[7]
780 vmovups 0x20-0x78($key),$rndkey0
782 vmovups @out[0],-16(@ptr[0]) # write output
783 sub $offset,@ptr[0] # switch to input
784 vpxor 0x00($offload),@out[0],@out[0]
785 vmovups @out[1],-16(@ptr[1])
786 sub `64+1*8`(%rsp),@ptr[1]
787 vpxor 0x10($offload),@out[1],@out[1]
788 vmovups @out[2],-16(@ptr[2])
789 sub `64+2*8`(%rsp),@ptr[2]
790 vpxor 0x20($offload),@out[2],@out[2]
791 vmovups @out[3],-16(@ptr[3])
792 sub `64+3*8`(%rsp),@ptr[3]
793 vpxor 0x30($offload),@out[3],@out[3]
794 vmovups @out[4],-16(@ptr[4])
795 sub `64+4*8`(%rsp),@ptr[4]
796 vpxor @inp[0],@out[4],@out[4]
797 vmovups @out[5],-16(@ptr[5])
798 sub `64+5*8`(%rsp),@ptr[5]
799 vpxor @inp[1],@out[5],@out[5]
800 vmovups @out[6],-16(@ptr[6])
801 sub `64+6*8`(%rsp),@ptr[6]
802 vpxor @inp[2],@out[6],@out[6]
803 vmovups @out[7],-16(@ptr[7])
804 sub `64+7*8`(%rsp),@ptr[7]
805 vpxor @inp[3],@out[7],@out[7]
810 mov 16(%rsp),%rax # original %rsp
812 #lea `40*8`($inp),$inp
814 #jnz .Lenc8x_loop_grande
819 $code.=<<___ if ($win64);
820 movaps -0xd8(%rax),%xmm6
821 movaps -0xc8(%rax),%xmm7
822 movaps -0xb8(%rax),%xmm8
823 movaps -0xa8(%rax),%xmm9
824 movaps -0x98(%rax),%xmm10
825 movaps -0x88(%rax),%xmm11
826 movaps -0x78(%rax),%xmm12
827 movaps -0x68(%rax),%xmm13
828 movaps -0x58(%rax),%xmm14
829 movaps -0x48(%rax),%xmm15
840 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
842 .type aesni_multi_cbc_decrypt_avx,\@function,3
844 aesni_multi_cbc_decrypt_avx:
845 _avx_cbc_dec_shortcut:
854 $code.=<<___ if ($win64);
857 movaps %xmm7,0x10(%rsp)
858 movaps %xmm8,0x20(%rsp)
859 movaps %xmm9,0x30(%rsp)
860 movaps %xmm10,0x40(%rsp)
861 movaps %xmm11,0x50(%rsp)
862 movaps %xmm12,-0x78(%rax)
863 movaps %xmm13,-0x68(%rax)
864 movaps %xmm14,-0x58(%rax)
865 movaps %xmm15,-0x48(%rax)
871 # +16 input sink [original %rsp and $num]
873 # +64 distances between inputs and outputs
874 # +128 off-load area for @inp[0..3]
875 # +192 IV/input offload
880 mov %rax,16(%rsp) # original %rsp
884 vmovdqu ($key),$zero # 0-round key
885 lea 0x78($key),$key # size optimization
890 #mov $num,24(%rsp) # original $num
893 for($i=0;$i<8;$i++) {
894 my $temp = $i ? $offload : $offset;
896 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
897 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
899 mov `40*$i+8-40*4`($inp),$temp # output pointer
900 cmovg $one,$num # find maximum
902 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
903 mov $one,`32+4*$i`(%rsp) # initialize counters
904 cmovle %rsp,@ptr[$i] # cancel input
905 sub @ptr[$i],$temp # distance between input and output
906 mov $temp,`64+8*$i`(%rsp) # initialize distances
907 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
914 vmovups 0x10-0x78($key),$rndkey1
915 vmovups 0x20-0x78($key),$rndkey0
916 mov 0xf0-0x78($key),$rounds
917 lea 192+128(%rsp),$offload # offload area
919 vmovdqu (@ptr[0]),@out[0] # load inputs
920 vmovdqu (@ptr[1]),@out[1]
921 vmovdqu (@ptr[2]),@out[2]
922 vmovdqu (@ptr[3]),@out[3]
923 vmovdqu (@ptr[4]),@out[4]
924 vmovdqu (@ptr[5]),@out[5]
925 vmovdqu (@ptr[6]),@out[6]
926 vmovdqu (@ptr[7]),@out[7]
927 vmovdqu @out[0],0x00($offload) # offload inputs
928 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
929 vmovdqu @out[1],0x10($offload)
930 vpxor $zero,@out[1],@out[1]
931 vmovdqu @out[2],0x20($offload)
932 vpxor $zero,@out[2],@out[2]
933 vmovdqu @out[3],0x30($offload)
934 vpxor $zero,@out[3],@out[3]
935 vmovdqu @out[4],0x40($offload)
936 vpxor $zero,@out[4],@out[4]
937 vmovdqu @out[5],0x50($offload)
938 vpxor $zero,@out[5],@out[5]
939 vmovdqu @out[6],0x60($offload)
940 vpxor $zero,@out[6],@out[6]
941 vmovdqu @out[7],0x70($offload)
942 vpxor $zero,@out[7],@out[7]
944 mov \$1,$one # constant of 1
950 for($i=0;$i<8;$i++) {
951 my $rndkey=($i&1)?$rndkey0:$rndkey1;
953 vaesdec $rndkey,@out[0],@out[0]
954 cmp 32+4*$i(%rsp),$one
956 $code.=<<___ if ($i);
957 mov 64+8*$i(%rsp),$offset
960 vaesdec $rndkey,@out[1],@out[1]
961 vaesdec $rndkey,@out[2],@out[2]
962 vaesdec $rndkey,@out[3],@out[3]
963 lea (@ptr[$i],$offset),$offset
964 cmovge %rsp,@ptr[$i] # cancel input
965 vaesdec $rndkey,@out[4],@out[4]
966 cmovg %rsp,$offset # sink output
967 vaesdec $rndkey,@out[5],@out[5]
969 vaesdec $rndkey,@out[6],@out[6]
970 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
971 mov $offset,64+8*$i(%rsp)
972 vaesdec $rndkey,@out[7],@out[7]
973 vmovups `16*(3+$i)-0x78`($key),$rndkey
974 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
976 $code.=<<___ if ($i<4);
977 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
981 vmovdqu 32(%rsp),$counters
985 vaesdec $rndkey1,@out[0],@out[0]
986 vaesdec $rndkey1,@out[1],@out[1]
987 vaesdec $rndkey1,@out[2],@out[2]
988 vaesdec $rndkey1,@out[3],@out[3]
989 vaesdec $rndkey1,@out[4],@out[4]
990 vaesdec $rndkey1,@out[5],@out[5]
991 vaesdec $rndkey1,@out[6],@out[6]
992 vaesdec $rndkey1,@out[7],@out[7]
993 vmovups 0xb0-0x78($key),$rndkey1
995 vaesdec $rndkey0,@out[0],@out[0]
996 vaesdec $rndkey0,@out[1],@out[1]
997 vaesdec $rndkey0,@out[2],@out[2]
998 vaesdec $rndkey0,@out[3],@out[3]
999 vaesdec $rndkey0,@out[4],@out[4]
1000 vaesdec $rndkey0,@out[5],@out[5]
1001 vaesdec $rndkey0,@out[6],@out[6]
1002 vaesdec $rndkey0,@out[7],@out[7]
1003 vmovups 0xc0-0x78($key),$rndkey0
1006 vaesdec $rndkey1,@out[0],@out[0]
1007 vaesdec $rndkey1,@out[1],@out[1]
1008 vaesdec $rndkey1,@out[2],@out[2]
1009 vaesdec $rndkey1,@out[3],@out[3]
1010 vaesdec $rndkey1,@out[4],@out[4]
1011 vaesdec $rndkey1,@out[5],@out[5]
1012 vaesdec $rndkey1,@out[6],@out[6]
1013 vaesdec $rndkey1,@out[7],@out[7]
1014 vmovups 0xd0-0x78($key),$rndkey1
1016 vaesdec $rndkey0,@out[0],@out[0]
1017 vaesdec $rndkey0,@out[1],@out[1]
1018 vaesdec $rndkey0,@out[2],@out[2]
1019 vaesdec $rndkey0,@out[3],@out[3]
1020 vaesdec $rndkey0,@out[4],@out[4]
1021 vaesdec $rndkey0,@out[5],@out[5]
1022 vaesdec $rndkey0,@out[6],@out[6]
1023 vaesdec $rndkey0,@out[7],@out[7]
1024 vmovups 0xe0-0x78($key),$rndkey0
1027 vaesdec $rndkey1,@out[0],@out[0]
1028 vpxor $zero,$zero,$zero
1029 vaesdec $rndkey1,@out[1],@out[1]
1030 vaesdec $rndkey1,@out[2],@out[2]
1031 vpcmpgtd $zero,$counters,$zero
1032 vaesdec $rndkey1,@out[3],@out[3]
1033 vaesdec $rndkey1,@out[4],@out[4]
1034 vpaddd $counters,$zero,$zero # decrement counters
1035 vmovdqu 48(%rsp),$counters
1036 vaesdec $rndkey1,@out[5],@out[5]
1037 mov 64(%rsp),$offset # pre-load 1st offset
1038 vaesdec $rndkey1,@out[6],@out[6]
1039 vaesdec $rndkey1,@out[7],@out[7]
1040 vmovups 0x10-0x78($key),$rndkey1
1042 vaesdeclast $rndkey0,@out[0],@out[0]
1043 vmovdqa $zero,32(%rsp) # update counters
1044 vpxor $zero,$zero,$zero
1045 vaesdeclast $rndkey0,@out[1],@out[1]
1046 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1047 vaesdeclast $rndkey0,@out[2],@out[2]
1048 vpxor 0x10($offload),@out[1],@out[1]
1049 vpcmpgtd $zero,$counters,$zero
1050 vaesdeclast $rndkey0,@out[3],@out[3]
1051 vpxor 0x20($offload),@out[2],@out[2]
1052 vaesdeclast $rndkey0,@out[4],@out[4]
1053 vpxor 0x30($offload),@out[3],@out[3]
1054 vpaddd $zero,$counters,$counters # decrement counters
1055 vmovdqu -0x78($key),$zero # 0-round
1056 vaesdeclast $rndkey0,@out[5],@out[5]
1057 vpxor 0x40($offload),@out[4],@out[4]
1058 vaesdeclast $rndkey0,@out[6],@out[6]
1059 vpxor 0x50($offload),@out[5],@out[5]
1060 vmovdqa $counters,48(%rsp) # update counters
1061 vaesdeclast $rndkey0,@out[7],@out[7]
1062 vpxor 0x60($offload),@out[6],@out[6]
1063 vmovups 0x20-0x78($key),$rndkey0
1065 vmovups @out[0],-16(@ptr[0]) # write output
1066 sub $offset,@ptr[0] # switch to input
1067 vmovdqu 128+0(%rsp),@out[0]
1068 vpxor 0x70($offload),@out[7],@out[7]
1069 vmovups @out[1],-16(@ptr[1])
1070 sub `64+1*8`(%rsp),@ptr[1]
1071 vmovdqu @out[0],0x00($offload)
1072 vpxor $zero,@out[0],@out[0]
1073 vmovdqu 128+16(%rsp),@out[1]
1074 vmovups @out[2],-16(@ptr[2])
1075 sub `64+2*8`(%rsp),@ptr[2]
1076 vmovdqu @out[1],0x10($offload)
1077 vpxor $zero,@out[1],@out[1]
1078 vmovdqu 128+32(%rsp),@out[2]
1079 vmovups @out[3],-16(@ptr[3])
1080 sub `64+3*8`(%rsp),@ptr[3]
1081 vmovdqu @out[2],0x20($offload)
1082 vpxor $zero,@out[2],@out[2]
1083 vmovdqu 128+48(%rsp),@out[3]
1084 vmovups @out[4],-16(@ptr[4])
1085 sub `64+4*8`(%rsp),@ptr[4]
1086 vmovdqu @out[3],0x30($offload)
1087 vpxor $zero,@out[3],@out[3]
1088 vmovdqu @inp[0],0x40($offload)
1089 vpxor @inp[0],$zero,@out[4]
1090 vmovups @out[5],-16(@ptr[5])
1091 sub `64+5*8`(%rsp),@ptr[5]
1092 vmovdqu @inp[1],0x50($offload)
1093 vpxor @inp[1],$zero,@out[5]
1094 vmovups @out[6],-16(@ptr[6])
1095 sub `64+6*8`(%rsp),@ptr[6]
1096 vmovdqu @inp[2],0x60($offload)
1097 vpxor @inp[2],$zero,@out[6]
1098 vmovups @out[7],-16(@ptr[7])
1099 sub `64+7*8`(%rsp),@ptr[7]
1100 vmovdqu @inp[3],0x70($offload)
1101 vpxor @inp[3],$zero,@out[7]
1107 mov 16(%rsp),%rax # original %rsp
1109 #lea `40*8`($inp),$inp
1111 #jnz .Ldec8x_loop_grande
1116 $code.=<<___ if ($win64);
1117 movaps -0xd8(%rax),%xmm6
1118 movaps -0xc8(%rax),%xmm7
1119 movaps -0xb8(%rax),%xmm8
1120 movaps -0xa8(%rax),%xmm9
1121 movaps -0x98(%rax),%xmm10
1122 movaps -0x88(%rax),%xmm11
1123 movaps -0x78(%rax),%xmm12
1124 movaps -0x68(%rax),%xmm13
1125 movaps -0x58(%rax),%xmm14
1126 movaps -0x48(%rax),%xmm15
1137 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
# The first argument is an array reference; assigning it to the glob
# makes @opcode an alias for the caller's array.
1142 local *opcode=shift;
# 0x04 and 0x01 are the REX.R and REX.B bits: they extend the ModR/M
# reg field ($dst) and r/m field ($src) for registers numbered 8 and up.
1146 $rex|=0x04 if($dst>=8);
1147 $rex|=0x01 if($src>=8);
# A prefix byte (0x40 | bits) is appended only when a bit was set.
1148 push @opcode,$rex|0x40 if($rex);
# Form 1: aeskeygenassist $imm,%xmmN,%xmmM.  Appends an optional REX
# prefix, the opcode bytes 0x0f,0x3a,0xdf, a register-direct ModR/M byte
# and the immediate, and returns the whole thing as a .byte directive.
1155 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1156 rex(\@opcode,$4,$3);
1157 push @opcode,0x0f,0x3a,0xdf;
1158 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
# $c is assigned outside the visible lines -- presumably the captured
# immediate; confirm.  A value with a leading 0 (hex or octal notation)
# goes through oct(), anything else is used as is.
1160 push @opcode,$c=~/^0/?oct($c):$c;
1161 return ".byte\t".join(',',@opcode);
# Form 2: register-to-register aesenc/aesenclast/aesdec/aesdeclast.
# Any other aes* mnemonic makes the routine return undef.
1163 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1166 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1167 "aesdec" => 0xde, "aesdeclast" => 0xdf
1169 return undef if (!defined($opcodelet{$1}));
1170 rex(\@opcode,$3,$2);
1171 push @opcode,0x0f,0x38,$opcodelet{$1};
1172 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1173 return ".byte\t".join(',',@opcode);
# Form 3: the same four mnemonics with a disp(%rsp) memory source.
# The REX prefix is written out directly as 0x44 for xmm8 and above;
# ModR/M 0x44|reg<<3 selects an 8-bit displacement with a SIB byte, and
# SIB 0x24 names %rsp as the base with no index.
1175 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
1177 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1178 "aesdec" => 0xde, "aesdeclast" => 0xdf
1180 return undef if (!defined($opcodelet{$1}));
1182 push @opcode,0x44 if ($3>=8);
1183 push @opcode,0x0f,0x38,$opcodelet{$1};
1184 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
# $off is assigned outside the visible lines -- presumably the captured
# displacement; confirm.  Only its low 8 bits are emitted.
1185 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
1186 return ".byte\t".join(',',@opcode);
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl (this is how the offset arithmetic in
# the heredocs is resolved).
1191 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# On each line, hand the text from an "aes" mnemonic through the last
# %xmm operand to aesni() and substitute whatever it returns; the rest
# of the line is dropped.  The \b means mnemonics with a leading letter,
# such as vaesenc, do not match.
1192 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;