3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer AES-NI procedures process several independent buffers
11 # in parallel by interleaving independent instructions.
13 # Cycles per byte for interleave factor 4:
16 # ---------------------------
17 # Westmere 5.00/4=1.25 5.13/4=1.28
18 # Atom 15.0/4=3.75 ?15.7/4=3.93
19 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
20 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
21 # Haswell 4.44/4=1.11 4.44/4=1.11
22 # Bulldozer 5.75/4=1.44 5.76/4=1.44
24 # Cycles per byte for interleave factor 8 (not implemented for
25 # pre-AVX processors, where higher interleave factor incidentally
26 # doesn't result in improvement):
29 # ---------------------------
30 # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
31 # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
32 # Haswell 5.00/8=0.63 5.00/8=0.63
33 # Bulldozer 5.75/8=0.72 5.77/8=0.72
35 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
# A "flavour" that contains a dot is really the output file name: move it to
# $output and leave $flavour undefined.
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 conventions are selected for nasm/masm/mingw64 flavours or when the
# output file name ends in .asm.
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# $dir is the directory part of this script's own path (including the
# trailing separator), taken from $0.
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for the translator next to this script first, then in
# ../../perlasm/ relative to it; give up if neither file exists.
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
# Ask the assembler behind $ENV{CC} for its version. When it reports itself
# as GNU assembler, $avx is the number of thresholds (2.19, 2.22) that the
# version reaches, i.e. 0, 1 or 2.
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22);
# Only when $avx is still false on Win64 and NASM is requested (via flavour
# or $ENV{ASM}): same scheme with thresholds 2.09 and 2.10.
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10);
# Only when $avx is still false on Win64 and MASM/ml64 is requested: the
# major version reported by ml64 is compared against 10 and 11.
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11);
# Start the translator ($xlate) under the running perl ($^X) as a child
# process fed through the OUT handle, passing it the flavour and output name.
# A failed pipe open would otherwise go unnoticed here, so abort with the
# system error instead.
66 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
69 # void aesni_multi_cbc_encrypt (
70 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
72 # int num); /* 1 or 2 */
# Register allocation for the 4-lane code path that follows.
74 $inp="%rdi"; # 1st arg
75 $key="%rsi"; # 2nd arg
# One input-pointer register and one output-pointer register per lane.
78 @inptr=map("%r$_",(8..11));
79 @outptr=map("%r$_",(12..15));
# Two round-key registers; the round code reloads them alternately.
81 ($rndkey0,$rndkey1)=("%xmm0","%xmm1");
# Per-lane block being processed (@out) and per-lane loaded input (@inp).
82 @out=map("%xmm$_",(2..5));
83 @inp=map("%xmm$_",(6..9));
# Packed per-lane block counters, a scratch mask, and the 0-round key.
84 ($counters,$mask,$zero)=map("%xmm$_",(10..12));
# Round count, the constant 1, the sink pointer, and the index register
# used as the running offset into the input/output buffers.
86 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
91 .extern OPENSSL_ia32cap_P
93 .globl aesni_multi_cbc_encrypt
94 .type aesni_multi_cbc_encrypt,\@function,3
96 aesni_multi_cbc_encrypt:
98 $code.=<<___ if ($avx);
101 mov OPENSSL_ia32cap_P+4(%rip),%ecx
102 test \$`1<<28`,%ecx # AVX bit
103 jnz _avx_cbc_enc_shortcut
117 $code.=<<___ if ($win64);
120 movaps %xmm7,0x10(%rsp)
121 movaps %xmm8,0x20(%rsp)
122 movaps %xmm9,0x30(%rsp)
123 movaps %xmm10,0x40(%rsp)
124 movaps %xmm11,0x50(%rsp)
125 movaps %xmm12,0x60(%rsp)
131 # +16 input sink [original %rsp and $num]
136 mov %rax,16(%rsp) # original %rsp
139 movdqu ($key),$zero # 0-round key
140 lea 0x78($key),$key # size optimization
144 mov $num,24(%rsp) # original $num
147 for($i=0;$i<4;$i++) {
149 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
150 mov `40*$i+0-40*2`($inp),@inptr[$i]
152 mov `40*$i+8-40*2`($inp),@outptr[$i]
153 cmovg $one,$num # find maximum
155 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
156 mov $one,`32+4*$i`(%rsp) # initialize counters
157 cmovle %rsp,@inptr[$i] # cancel input
164 movups 0x10-0x78($key),$rndkey1
166 movups 0x20-0x78($key),$rndkey0
168 mov 0xf0-0x78($key),$rounds
170 movdqu (@inptr[0]),@inp[0] # load inputs
172 movdqu (@inptr[1]),@inp[1]
174 movdqu (@inptr[2]),@inp[2]
176 movdqu (@inptr[3]),@inp[3]
179 movdqa 32(%rsp),$counters # load counters
186 lea 16(%rsp),$sink # sink pointer
187 mov \$1,$one # constant of 1
# Round with the key held in $rndkey1 for all four lanes, interleaved with
# one input prefetch per lane; the next round key is loaded at the end.
190 aesenc $rndkey1,@out[0]
191 prefetcht0 31(@inptr[0],$offset) # prefetch input
192 prefetcht0 31(@inptr[1],$offset)
193 aesenc $rndkey1,@out[1]
194 prefetcht0 31(@inptr[2],$offset)
195 prefetcht0 31(@inptr[3],$offset) # lane 3 (was a repeated prefetch of lane 2)
196 aesenc $rndkey1,@out[2]
197 aesenc $rndkey1,@out[3]
198 movups 0x30-0x78($key),$rndkey1
200 for($i=0;$i<4;$i++) {
201 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
203 cmp `32+4*$i`(%rsp),$one
204 aesenc $rndkey,@out[0]
205 aesenc $rndkey,@out[1]
206 aesenc $rndkey,@out[2]
207 cmovge $sink,@inptr[$i] # cancel input
208 cmovg $sink,@outptr[$i] # sink output
209 aesenc $rndkey,@out[3]
210 movups `0x40+16*$i-0x78`($key),$rndkey
214 movdqa $counters,$mask
215 aesenc $rndkey0,@out[0]
216 prefetcht0 15(@outptr[0],$offset) # prefetch output
217 prefetcht0 15(@outptr[1],$offset)
218 aesenc $rndkey0,@out[1]
219 prefetcht0 15(@outptr[2],$offset)
220 prefetcht0 15(@outptr[3],$offset)
221 aesenc $rndkey0,@out[2]
222 aesenc $rndkey0,@out[3]
223 movups 0x80-0x78($key),$rndkey0
226 aesenc $rndkey1,@out[0]
228 movdqu -0x78($key),$zero # reload 0-round key
229 aesenc $rndkey1,@out[1]
230 paddd $mask,$counters # decrement counters
231 movdqa $counters,32(%rsp) # update counters
232 aesenc $rndkey1,@out[2]
233 aesenc $rndkey1,@out[3]
234 movups 0x90-0x78($key),$rndkey1
238 aesenc $rndkey0,@out[0]
239 aesenc $rndkey0,@out[1]
240 aesenc $rndkey0,@out[2]
241 aesenc $rndkey0,@out[3]
242 movups 0xa0-0x78($key),$rndkey0
246 aesenc $rndkey1,@out[0]
247 aesenc $rndkey1,@out[1]
248 aesenc $rndkey1,@out[2]
249 aesenc $rndkey1,@out[3]
250 movups 0xb0-0x78($key),$rndkey1
252 aesenc $rndkey0,@out[0]
253 aesenc $rndkey0,@out[1]
254 aesenc $rndkey0,@out[2]
255 aesenc $rndkey0,@out[3]
256 movups 0xc0-0x78($key),$rndkey0
260 aesenc $rndkey1,@out[0]
261 aesenc $rndkey1,@out[1]
262 aesenc $rndkey1,@out[2]
263 aesenc $rndkey1,@out[3]
264 movups 0xd0-0x78($key),$rndkey1
266 aesenc $rndkey0,@out[0]
267 aesenc $rndkey0,@out[1]
268 aesenc $rndkey0,@out[2]
269 aesenc $rndkey0,@out[3]
270 movups 0xe0-0x78($key),$rndkey0
275 aesenc $rndkey1,@out[0]
276 aesenc $rndkey1,@out[1]
277 aesenc $rndkey1,@out[2]
278 aesenc $rndkey1,@out[3]
279 movdqu (@inptr[0],$offset),@inp[0]
280 movdqu 0x10-0x78($key),$rndkey1
282 aesenclast $rndkey0,@out[0]
283 movdqu (@inptr[1],$offset),@inp[1]
285 aesenclast $rndkey0,@out[1]
286 movdqu (@inptr[2],$offset),@inp[2]
288 aesenclast $rndkey0,@out[2]
289 movdqu (@inptr[3],$offset),@inp[3]
291 aesenclast $rndkey0,@out[3]
292 movdqu 0x20-0x78($key),$rndkey0
295 movups @out[0],-16(@outptr[0],$offset)
297 movups @out[1],-16(@outptr[1],$offset)
299 movups @out[2],-16(@outptr[2],$offset)
301 movups @out[3],-16(@outptr[3],$offset)
307 mov 16(%rsp),%rax # original %rsp
310 #pxor @inp[0],@out[0]
311 #pxor @inp[1],@out[1]
312 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
313 #pxor @inp[2],@out[2]
314 #movdqu @out[1],`40*1+24-40*2`($inp)
315 #pxor @inp[3],@out[3]
316 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
317 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
319 lea `40*4`($inp),$inp
321 jnz .Lenc4x_loop_grande
325 $code.=<<___ if ($win64);
326 movaps -0xa8(%rax),%xmm6
327 movaps -0x98(%rax),%xmm7
328 movaps -0x88(%rax),%xmm8
329 movaps -0x78(%rax),%xmm9
330 movaps -0x68(%rax),%xmm10
331 movaps -0x58(%rax),%xmm11
332 movaps -0x48(%rax),%xmm12
343 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
345 .globl aesni_multi_cbc_decrypt
346 .type aesni_multi_cbc_decrypt,\@function,3
348 aesni_multi_cbc_decrypt:
350 $code.=<<___ if ($avx);
353 mov OPENSSL_ia32cap_P+4(%rip),%ecx
354 test \$`1<<28`,%ecx # AVX bit
355 jnz _avx_cbc_dec_shortcut
369 $code.=<<___ if ($win64);
372 movaps %xmm7,0x10(%rsp)
373 movaps %xmm8,0x20(%rsp)
374 movaps %xmm9,0x30(%rsp)
375 movaps %xmm10,0x40(%rsp)
376 movaps %xmm11,0x50(%rsp)
377 movaps %xmm12,0x60(%rsp)
383 # +16 input sink [original %rsp and $num]
388 mov %rax,16(%rsp) # original %rsp
391 movdqu ($key),$zero # 0-round key
392 lea 0x78($key),$key # size optimization
396 mov $num,24(%rsp) # original $num
399 for($i=0;$i<4;$i++) {
401 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
402 mov `40*$i+0-40*2`($inp),@inptr[$i]
404 mov `40*$i+8-40*2`($inp),@outptr[$i]
405 cmovg $one,$num # find maximum
407 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
408 mov $one,`32+4*$i`(%rsp) # initialize counters
409 cmovle %rsp,@inptr[$i] # cancel input
416 movups 0x10-0x78($key),$rndkey1
417 movups 0x20-0x78($key),$rndkey0
418 mov 0xf0-0x78($key),$rounds
419 movdqu (@inptr[0]),@out[0] # load inputs
420 movdqu (@inptr[1]),@out[1]
422 movdqu (@inptr[2]),@out[2]
424 movdqu (@inptr[3]),@out[3]
427 movdqa 32(%rsp),$counters # load counters
434 lea 16(%rsp),$sink # sink pointer
435 mov \$1,$one # constant of 1
438 aesdec $rndkey1,@out[0]
439 prefetcht0 31(@inptr[0],$offset) # prefetch input
440 prefetcht0 31(@inptr[1],$offset)
441 aesdec $rndkey1,@out[1]
442 prefetcht0 31(@inptr[2],$offset)
443 prefetcht0 31(@inptr[3],$offset)
444 aesdec $rndkey1,@out[2]
445 aesdec $rndkey1,@out[3]
446 movups 0x30-0x78($key),$rndkey1
448 for($i=0;$i<4;$i++) {
449 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
451 cmp `32+4*$i`(%rsp),$one
452 aesdec $rndkey,@out[0]
453 aesdec $rndkey,@out[1]
454 cmovge $sink,@inptr[$i] # cancel input
455 aesdec $rndkey,@out[2]
456 cmovg $sink,@outptr[$i] # sink output
457 aesdec $rndkey,@out[3]
458 movups `0x40+16*$i-0x78`($key),$rndkey
462 movdqa $counters,$mask
463 aesdec $rndkey0,@out[0]
464 prefetcht0 15(@outptr[0],$offset) # prefetch output
465 prefetcht0 15(@outptr[1],$offset)
466 aesdec $rndkey0,@out[1]
467 prefetcht0 15(@outptr[2],$offset)
468 prefetcht0 15(@outptr[3],$offset)
469 aesdec $rndkey0,@out[2]
470 aesdec $rndkey0,@out[3]
471 movups 0x80-0x78($key),$rndkey0
474 aesdec $rndkey1,@out[0]
476 movdqu -0x78($key),$zero # reload 0-round key
477 aesdec $rndkey1,@out[1]
478 paddd $mask,$counters # decrement counters
479 movdqa $counters,32(%rsp) # update counters
480 aesdec $rndkey1,@out[2]
481 aesdec $rndkey1,@out[3]
482 movups 0x90-0x78($key),$rndkey1
486 aesdec $rndkey0,@out[0]
487 aesdec $rndkey0,@out[1]
488 aesdec $rndkey0,@out[2]
489 aesdec $rndkey0,@out[3]
490 movups 0xa0-0x78($key),$rndkey0
494 aesdec $rndkey1,@out[0]
495 aesdec $rndkey1,@out[1]
496 aesdec $rndkey1,@out[2]
497 aesdec $rndkey1,@out[3]
498 movups 0xb0-0x78($key),$rndkey1
500 aesdec $rndkey0,@out[0]
501 aesdec $rndkey0,@out[1]
502 aesdec $rndkey0,@out[2]
503 aesdec $rndkey0,@out[3]
504 movups 0xc0-0x78($key),$rndkey0
508 aesdec $rndkey1,@out[0]
509 aesdec $rndkey1,@out[1]
510 aesdec $rndkey1,@out[2]
511 aesdec $rndkey1,@out[3]
512 movups 0xd0-0x78($key),$rndkey1
514 aesdec $rndkey0,@out[0]
515 aesdec $rndkey0,@out[1]
516 aesdec $rndkey0,@out[2]
517 aesdec $rndkey0,@out[3]
518 movups 0xe0-0x78($key),$rndkey0
523 aesdec $rndkey1,@out[0]
524 aesdec $rndkey1,@out[1]
525 aesdec $rndkey1,@out[2]
526 pxor $rndkey0,@inp[0]
527 pxor $rndkey0,@inp[1]
528 aesdec $rndkey1,@out[3]
529 movdqu 0x10-0x78($key),$rndkey1
530 pxor $rndkey0,@inp[2]
531 pxor $rndkey0,@inp[3]
532 movdqu 0x20-0x78($key),$rndkey0
534 aesdeclast @inp[0],@out[0]
535 aesdeclast @inp[1],@out[1]
536 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
537 movdqu -16(@inptr[1],$offset),@inp[1]
538 aesdeclast @inp[2],@out[2]
539 aesdeclast @inp[3],@out[3]
540 movdqu -16(@inptr[2],$offset),@inp[2]
541 movdqu -16(@inptr[3],$offset),@inp[3]
543 movups @out[0],-16(@outptr[0],$offset)
544 movdqu (@inptr[0],$offset),@out[0]
545 movups @out[1],-16(@outptr[1],$offset)
546 movdqu (@inptr[1],$offset),@out[1]
548 movups @out[2],-16(@outptr[2],$offset)
549 movdqu (@inptr[2],$offset),@out[2]
551 movups @out[3],-16(@outptr[3],$offset)
552 movdqu (@inptr[3],$offset),@out[3]
559 mov 16(%rsp),%rax # original %rsp
562 lea `40*4`($inp),$inp
564 jnz .Ldec4x_loop_grande
568 $code.=<<___ if ($win64);
569 movaps -0xa8(%rax),%xmm6
570 movaps -0x98(%rax),%xmm7
571 movaps -0x88(%rax),%xmm8
572 movaps -0x78(%rax),%xmm9
573 movaps -0x68(%rax),%xmm10
574 movaps -0x58(%rax),%xmm11
575 movaps -0x48(%rax),%xmm12
586 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# Register allocation for the 8-lane AVX code path.
# One pointer register per lane; the code below uses it as the input pointer
# and switches it to the output pointer and back around each store.
590 my @ptr=map("%r$_",(8..15));
# One block register per lane.
593 my @out=map("%xmm$_",(2..9));
# Only four input registers: the round loop indexes them with $i%4, so the
# eight lanes share them.
594 my @inp=map("%xmm$_",(10..13));
# Packed block counters and the 0-round key.
595 my ($counters,$zero)=("%xmm14","%xmm15");
598 .type aesni_multi_cbc_encrypt_avx,\@function,3
600 aesni_multi_cbc_encrypt_avx:
601 _avx_cbc_enc_shortcut:
610 $code.=<<___ if ($win64);
613 movaps %xmm7,0x10(%rsp)
614 movaps %xmm8,0x20(%rsp)
615 movaps %xmm9,0x30(%rsp)
616 movaps %xmm10,0x40(%rsp)
617 movaps %xmm11,0x50(%rsp)
618 movaps %xmm12,-0x78(%rax)
619 movaps %xmm13,-0x68(%rax)
620 movaps %xmm14,-0x58(%rax)
621 movaps %xmm15,-0x48(%rax)
627 # +16 input sink [original %rsp and $num]
629 # +64 distances between inputs and outputs
630 # +128 off-load area for @inp[0..3]
634 mov %rax,16(%rsp) # original %rsp
638 vmovdqu ($key),$zero # 0-round key
639 lea 0x78($key),$key # size optimization
644 #mov $num,24(%rsp) # original $num
647 for($i=0;$i<8;$i++) {
648 my $temp = $i ? $offload : $offset;
650 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
651 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
653 mov `40*$i+8-40*4`($inp),$temp # output pointer
654 cmovg $one,$num # find maximum
656 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
657 mov $one,`32+4*$i`(%rsp) # initialize counters
658 cmovle %rsp,@ptr[$i] # cancel input
659 sub @ptr[$i],$temp # distance between input and output
660 mov $temp,`64+8*$i`(%rsp) # initialize distances
667 vmovups 0x10-0x78($key),$rndkey1
668 vmovups 0x20-0x78($key),$rndkey0
669 mov 0xf0-0x78($key),$rounds
671 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
672 lea 128(%rsp),$offload # offload area
673 vpxor (@ptr[1]),$zero,@inp[1]
674 vpxor (@ptr[2]),$zero,@inp[2]
675 vpxor (@ptr[3]),$zero,@inp[3]
676 vpxor @inp[0],@out[0],@out[0]
677 vpxor (@ptr[4]),$zero,@inp[0]
678 vpxor @inp[1],@out[1],@out[1]
679 vpxor (@ptr[5]),$zero,@inp[1]
680 vpxor @inp[2],@out[2],@out[2]
681 vpxor (@ptr[6]),$zero,@inp[2]
682 vpxor @inp[3],@out[3],@out[3]
683 vpxor (@ptr[7]),$zero,@inp[3]
684 vpxor @inp[0],@out[4],@out[4]
685 mov \$1,$one # constant of 1
686 vpxor @inp[1],@out[5],@out[5]
687 vpxor @inp[2],@out[6],@out[6]
688 vpxor @inp[3],@out[7],@out[7]
694 for($i=0;$i<8;$i++) {
695 my $rndkey=($i&1)?$rndkey0:$rndkey1;
697 vaesenc $rndkey,@out[0],@out[0]
698 cmp 32+4*$i(%rsp),$one
700 $code.=<<___ if ($i);
701 mov 64+8*$i(%rsp),$offset
704 vaesenc $rndkey,@out[1],@out[1]
705 prefetcht0 31(@ptr[$i]) # prefetch input
706 vaesenc $rndkey,@out[2],@out[2]
708 $code.=<<___ if ($i>1);
709 prefetcht0 15(@ptr[$i-2]) # prefetch output
712 vaesenc $rndkey,@out[3],@out[3]
713 lea (@ptr[$i],$offset),$offset
714 cmovge %rsp,@ptr[$i] # cancel input
715 vaesenc $rndkey,@out[4],@out[4]
716 cmovg %rsp,$offset # sink output
717 vaesenc $rndkey,@out[5],@out[5]
719 vaesenc $rndkey,@out[6],@out[6]
720 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
721 mov $offset,64+8*$i(%rsp)
722 vaesenc $rndkey,@out[7],@out[7]
723 vmovups `16*(3+$i)-0x78`($key),$rndkey
724 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
726 $code.=<<___ if ($i<4)
727 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
731 vmovdqu 32(%rsp),$counters
732 prefetcht0 15(@ptr[$i-2]) # prefetch output
733 prefetcht0 15(@ptr[$i-1])
737 vaesenc $rndkey1,@out[0],@out[0]
738 vaesenc $rndkey1,@out[1],@out[1]
739 vaesenc $rndkey1,@out[2],@out[2]
740 vaesenc $rndkey1,@out[3],@out[3]
741 vaesenc $rndkey1,@out[4],@out[4]
742 vaesenc $rndkey1,@out[5],@out[5]
743 vaesenc $rndkey1,@out[6],@out[6]
744 vaesenc $rndkey1,@out[7],@out[7]
745 vmovups 0xb0-0x78($key),$rndkey1
747 vaesenc $rndkey0,@out[0],@out[0]
748 vaesenc $rndkey0,@out[1],@out[1]
749 vaesenc $rndkey0,@out[2],@out[2]
750 vaesenc $rndkey0,@out[3],@out[3]
751 vaesenc $rndkey0,@out[4],@out[4]
752 vaesenc $rndkey0,@out[5],@out[5]
753 vaesenc $rndkey0,@out[6],@out[6]
754 vaesenc $rndkey0,@out[7],@out[7]
755 vmovups 0xc0-0x78($key),$rndkey0
758 vaesenc $rndkey1,@out[0],@out[0]
759 vaesenc $rndkey1,@out[1],@out[1]
760 vaesenc $rndkey1,@out[2],@out[2]
761 vaesenc $rndkey1,@out[3],@out[3]
762 vaesenc $rndkey1,@out[4],@out[4]
763 vaesenc $rndkey1,@out[5],@out[5]
764 vaesenc $rndkey1,@out[6],@out[6]
765 vaesenc $rndkey1,@out[7],@out[7]
766 vmovups 0xd0-0x78($key),$rndkey1
768 vaesenc $rndkey0,@out[0],@out[0]
769 vaesenc $rndkey0,@out[1],@out[1]
770 vaesenc $rndkey0,@out[2],@out[2]
771 vaesenc $rndkey0,@out[3],@out[3]
772 vaesenc $rndkey0,@out[4],@out[4]
773 vaesenc $rndkey0,@out[5],@out[5]
774 vaesenc $rndkey0,@out[6],@out[6]
775 vaesenc $rndkey0,@out[7],@out[7]
776 vmovups 0xe0-0x78($key),$rndkey0
779 vaesenc $rndkey1,@out[0],@out[0]
780 vpxor $zero,$zero,$zero
781 vaesenc $rndkey1,@out[1],@out[1]
782 vaesenc $rndkey1,@out[2],@out[2]
783 vpcmpgtd $zero,$counters,$zero
784 vaesenc $rndkey1,@out[3],@out[3]
785 vaesenc $rndkey1,@out[4],@out[4]
786 vpaddd $counters,$zero,$zero # decrement counters
787 vmovdqu 48(%rsp),$counters
788 vaesenc $rndkey1,@out[5],@out[5]
789 mov 64(%rsp),$offset # pre-load 1st offset
790 vaesenc $rndkey1,@out[6],@out[6]
791 vaesenc $rndkey1,@out[7],@out[7]
792 vmovups 0x10-0x78($key),$rndkey1
794 vaesenclast $rndkey0,@out[0],@out[0]
795 vmovdqa $zero,32(%rsp) # update counters
796 vpxor $zero,$zero,$zero
797 vaesenclast $rndkey0,@out[1],@out[1]
798 vaesenclast $rndkey0,@out[2],@out[2]
799 vpcmpgtd $zero,$counters,$zero
800 vaesenclast $rndkey0,@out[3],@out[3]
801 vaesenclast $rndkey0,@out[4],@out[4]
802 vpaddd $zero,$counters,$counters # decrement counters
803 vmovdqu -0x78($key),$zero # 0-round
804 vaesenclast $rndkey0,@out[5],@out[5]
805 vaesenclast $rndkey0,@out[6],@out[6]
806 vmovdqa $counters,48(%rsp) # update counters
807 vaesenclast $rndkey0,@out[7],@out[7]
808 vmovups 0x20-0x78($key),$rndkey0
810 vmovups @out[0],-16(@ptr[0]) # write output
811 sub $offset,@ptr[0] # switch to input
812 vpxor 0x00($offload),@out[0],@out[0]
813 vmovups @out[1],-16(@ptr[1])
814 sub `64+1*8`(%rsp),@ptr[1]
815 vpxor 0x10($offload),@out[1],@out[1]
816 vmovups @out[2],-16(@ptr[2])
817 sub `64+2*8`(%rsp),@ptr[2]
818 vpxor 0x20($offload),@out[2],@out[2]
819 vmovups @out[3],-16(@ptr[3])
820 sub `64+3*8`(%rsp),@ptr[3]
821 vpxor 0x30($offload),@out[3],@out[3]
822 vmovups @out[4],-16(@ptr[4])
823 sub `64+4*8`(%rsp),@ptr[4]
824 vpxor @inp[0],@out[4],@out[4]
825 vmovups @out[5],-16(@ptr[5])
826 sub `64+5*8`(%rsp),@ptr[5]
827 vpxor @inp[1],@out[5],@out[5]
828 vmovups @out[6],-16(@ptr[6])
829 sub `64+6*8`(%rsp),@ptr[6]
830 vpxor @inp[2],@out[6],@out[6]
831 vmovups @out[7],-16(@ptr[7])
832 sub `64+7*8`(%rsp),@ptr[7]
833 vpxor @inp[3],@out[7],@out[7]
838 mov 16(%rsp),%rax # original %rsp
840 #lea `40*8`($inp),$inp
842 #jnz .Lenc8x_loop_grande
847 $code.=<<___ if ($win64);
848 movaps -0xd8(%rax),%xmm6
849 movaps -0xc8(%rax),%xmm7
850 movaps -0xb8(%rax),%xmm8
851 movaps -0xa8(%rax),%xmm9
852 movaps -0x98(%rax),%xmm10
853 movaps -0x88(%rax),%xmm11
854 movaps -0x78(%rax),%xmm12
855 movaps -0x68(%rax),%xmm13
856 movaps -0x58(%rax),%xmm14
857 movaps -0x48(%rax),%xmm15
868 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
870 .type aesni_multi_cbc_decrypt_avx,\@function,3
872 aesni_multi_cbc_decrypt_avx:
873 _avx_cbc_dec_shortcut:
882 $code.=<<___ if ($win64);
885 movaps %xmm7,0x10(%rsp)
886 movaps %xmm8,0x20(%rsp)
887 movaps %xmm9,0x30(%rsp)
888 movaps %xmm10,0x40(%rsp)
889 movaps %xmm11,0x50(%rsp)
890 movaps %xmm12,-0x78(%rax)
891 movaps %xmm13,-0x68(%rax)
892 movaps %xmm14,-0x58(%rax)
893 movaps %xmm15,-0x48(%rax)
899 # +16 input sink [original %rsp and $num]
901 # +64 distances between inputs and outputs
902 # +128 off-load area for @inp[0..3]
903 # +192 IV/input offload
908 mov %rax,16(%rsp) # original %rsp
912 vmovdqu ($key),$zero # 0-round key
913 lea 0x78($key),$key # size optimization
918 #mov $num,24(%rsp) # original $num
921 for($i=0;$i<8;$i++) {
922 my $temp = $i ? $offload : $offset;
924 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
925 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
927 mov `40*$i+8-40*4`($inp),$temp # output pointer
928 cmovg $one,$num # find maximum
930 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
931 mov $one,`32+4*$i`(%rsp) # initialize counters
932 cmovle %rsp,@ptr[$i] # cancel input
933 sub @ptr[$i],$temp # distance between input and output
934 mov $temp,`64+8*$i`(%rsp) # initialize distances
935 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
942 vmovups 0x10-0x78($key),$rndkey1
943 vmovups 0x20-0x78($key),$rndkey0
944 mov 0xf0-0x78($key),$rounds
945 lea 192+128(%rsp),$offload # offload area
947 vmovdqu (@ptr[0]),@out[0] # load inputs
948 vmovdqu (@ptr[1]),@out[1]
949 vmovdqu (@ptr[2]),@out[2]
950 vmovdqu (@ptr[3]),@out[3]
951 vmovdqu (@ptr[4]),@out[4]
952 vmovdqu (@ptr[5]),@out[5]
953 vmovdqu (@ptr[6]),@out[6]
954 vmovdqu (@ptr[7]),@out[7]
955 vmovdqu @out[0],0x00($offload) # offload inputs
956 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
957 vmovdqu @out[1],0x10($offload)
958 vpxor $zero,@out[1],@out[1]
959 vmovdqu @out[2],0x20($offload)
960 vpxor $zero,@out[2],@out[2]
961 vmovdqu @out[3],0x30($offload)
962 vpxor $zero,@out[3],@out[3]
963 vmovdqu @out[4],0x40($offload)
964 vpxor $zero,@out[4],@out[4]
965 vmovdqu @out[5],0x50($offload)
966 vpxor $zero,@out[5],@out[5]
967 vmovdqu @out[6],0x60($offload)
968 vpxor $zero,@out[6],@out[6]
969 vmovdqu @out[7],0x70($offload)
970 vpxor $zero,@out[7],@out[7]
972 mov \$1,$one # constant of 1
978 for($i=0;$i<8;$i++) {
979 my $rndkey=($i&1)?$rndkey0:$rndkey1;
981 vaesdec $rndkey,@out[0],@out[0]
982 cmp 32+4*$i(%rsp),$one
984 $code.=<<___ if ($i);
985 mov 64+8*$i(%rsp),$offset
988 vaesdec $rndkey,@out[1],@out[1]
989 prefetcht0 31(@ptr[$i]) # prefetch input
990 vaesdec $rndkey,@out[2],@out[2]
992 $code.=<<___ if ($i>1);
993 prefetcht0 15(@ptr[$i-2]) # prefetch output
996 vaesdec $rndkey,@out[3],@out[3]
997 lea (@ptr[$i],$offset),$offset
998 cmovge %rsp,@ptr[$i] # cancel input
999 vaesdec $rndkey,@out[4],@out[4]
1000 cmovg %rsp,$offset # sink output
1001 vaesdec $rndkey,@out[5],@out[5]
1002 sub @ptr[$i],$offset
1003 vaesdec $rndkey,@out[6],@out[6]
1004 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1005 mov $offset,64+8*$i(%rsp)
1006 vaesdec $rndkey,@out[7],@out[7]
1007 vmovups `16*(3+$i)-0x78`($key),$rndkey
1008 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1010 $code.=<<___ if ($i<4);
1011 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1015 vmovdqu 32(%rsp),$counters
1016 prefetcht0 15(@ptr[$i-2]) # prefetch output
1017 prefetcht0 15(@ptr[$i-1])
1021 vaesdec $rndkey1,@out[0],@out[0]
1022 vaesdec $rndkey1,@out[1],@out[1]
1023 vaesdec $rndkey1,@out[2],@out[2]
1024 vaesdec $rndkey1,@out[3],@out[3]
1025 vaesdec $rndkey1,@out[4],@out[4]
1026 vaesdec $rndkey1,@out[5],@out[5]
1027 vaesdec $rndkey1,@out[6],@out[6]
1028 vaesdec $rndkey1,@out[7],@out[7]
1029 vmovups 0xb0-0x78($key),$rndkey1
1031 vaesdec $rndkey0,@out[0],@out[0]
1032 vaesdec $rndkey0,@out[1],@out[1]
1033 vaesdec $rndkey0,@out[2],@out[2]
1034 vaesdec $rndkey0,@out[3],@out[3]
1035 vaesdec $rndkey0,@out[4],@out[4]
1036 vaesdec $rndkey0,@out[5],@out[5]
1037 vaesdec $rndkey0,@out[6],@out[6]
1038 vaesdec $rndkey0,@out[7],@out[7]
1039 vmovups 0xc0-0x78($key),$rndkey0
1042 vaesdec $rndkey1,@out[0],@out[0]
1043 vaesdec $rndkey1,@out[1],@out[1]
1044 vaesdec $rndkey1,@out[2],@out[2]
1045 vaesdec $rndkey1,@out[3],@out[3]
1046 vaesdec $rndkey1,@out[4],@out[4]
1047 vaesdec $rndkey1,@out[5],@out[5]
1048 vaesdec $rndkey1,@out[6],@out[6]
1049 vaesdec $rndkey1,@out[7],@out[7]
1050 vmovups 0xd0-0x78($key),$rndkey1
1052 vaesdec $rndkey0,@out[0],@out[0]
1053 vaesdec $rndkey0,@out[1],@out[1]
1054 vaesdec $rndkey0,@out[2],@out[2]
1055 vaesdec $rndkey0,@out[3],@out[3]
1056 vaesdec $rndkey0,@out[4],@out[4]
1057 vaesdec $rndkey0,@out[5],@out[5]
1058 vaesdec $rndkey0,@out[6],@out[6]
1059 vaesdec $rndkey0,@out[7],@out[7]
1060 vmovups 0xe0-0x78($key),$rndkey0
1063 vaesdec $rndkey1,@out[0],@out[0]
1064 vpxor $zero,$zero,$zero
1065 vaesdec $rndkey1,@out[1],@out[1]
1066 vaesdec $rndkey1,@out[2],@out[2]
1067 vpcmpgtd $zero,$counters,$zero
1068 vaesdec $rndkey1,@out[3],@out[3]
1069 vaesdec $rndkey1,@out[4],@out[4]
1070 vpaddd $counters,$zero,$zero # decrement counters
1071 vmovdqu 48(%rsp),$counters
1072 vaesdec $rndkey1,@out[5],@out[5]
1073 mov 64(%rsp),$offset # pre-load 1st offset
1074 vaesdec $rndkey1,@out[6],@out[6]
1075 vaesdec $rndkey1,@out[7],@out[7]
1076 vmovups 0x10-0x78($key),$rndkey1
1078 vaesdeclast $rndkey0,@out[0],@out[0]
1079 vmovdqa $zero,32(%rsp) # update counters
1080 vpxor $zero,$zero,$zero
1081 vaesdeclast $rndkey0,@out[1],@out[1]
1082 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1083 vaesdeclast $rndkey0,@out[2],@out[2]
1084 vpxor 0x10($offload),@out[1],@out[1]
1085 vpcmpgtd $zero,$counters,$zero
1086 vaesdeclast $rndkey0,@out[3],@out[3]
1087 vpxor 0x20($offload),@out[2],@out[2]
1088 vaesdeclast $rndkey0,@out[4],@out[4]
1089 vpxor 0x30($offload),@out[3],@out[3]
1090 vpaddd $zero,$counters,$counters # decrement counters
1091 vmovdqu -0x78($key),$zero # 0-round
1092 vaesdeclast $rndkey0,@out[5],@out[5]
1093 vpxor 0x40($offload),@out[4],@out[4]
1094 vaesdeclast $rndkey0,@out[6],@out[6]
1095 vpxor 0x50($offload),@out[5],@out[5]
1096 vmovdqa $counters,48(%rsp) # update counters
1097 vaesdeclast $rndkey0,@out[7],@out[7]
1098 vpxor 0x60($offload),@out[6],@out[6]
1099 vmovups 0x20-0x78($key),$rndkey0
1101 vmovups @out[0],-16(@ptr[0]) # write output
1102 sub $offset,@ptr[0] # switch to input
1103 vmovdqu 128+0(%rsp),@out[0]
1104 vpxor 0x70($offload),@out[7],@out[7]
1105 vmovups @out[1],-16(@ptr[1])
1106 sub `64+1*8`(%rsp),@ptr[1]
1107 vmovdqu @out[0],0x00($offload)
1108 vpxor $zero,@out[0],@out[0]
1109 vmovdqu 128+16(%rsp),@out[1]
1110 vmovups @out[2],-16(@ptr[2])
1111 sub `64+2*8`(%rsp),@ptr[2]
1112 vmovdqu @out[1],0x10($offload)
1113 vpxor $zero,@out[1],@out[1]
1114 vmovdqu 128+32(%rsp),@out[2]
1115 vmovups @out[3],-16(@ptr[3])
1116 sub `64+3*8`(%rsp),@ptr[3]
1117 vmovdqu @out[2],0x20($offload)
1118 vpxor $zero,@out[2],@out[2]
1119 vmovdqu 128+48(%rsp),@out[3]
1120 vmovups @out[4],-16(@ptr[4])
1121 sub `64+4*8`(%rsp),@ptr[4]
1122 vmovdqu @out[3],0x30($offload)
1123 vpxor $zero,@out[3],@out[3]
1124 vmovdqu @inp[0],0x40($offload)
1125 vpxor @inp[0],$zero,@out[4]
1126 vmovups @out[5],-16(@ptr[5])
1127 sub `64+5*8`(%rsp),@ptr[5]
1128 vmovdqu @inp[1],0x50($offload)
1129 vpxor @inp[1],$zero,@out[5]
1130 vmovups @out[6],-16(@ptr[6])
1131 sub `64+6*8`(%rsp),@ptr[6]
1132 vmovdqu @inp[2],0x60($offload)
1133 vpxor @inp[2],$zero,@out[6]
1134 vmovups @out[7],-16(@ptr[7])
1135 sub `64+7*8`(%rsp),@ptr[7]
1136 vmovdqu @inp[3],0x70($offload)
1137 vpxor @inp[3],$zero,@out[7]
1143 mov 16(%rsp),%rax # original %rsp
1145 #lea `40*8`($inp),$inp
1147 #jnz .Ldec8x_loop_grande
1152 $code.=<<___ if ($win64);
1153 movaps -0xd8(%rax),%xmm6
1154 movaps -0xc8(%rax),%xmm7
1155 movaps -0xb8(%rax),%xmm8
1156 movaps -0xa8(%rax),%xmm9
1157 movaps -0x98(%rax),%xmm10
1158 movaps -0x88(%rax),%xmm11
1159 movaps -0x78(%rax),%xmm12
1160 movaps -0x68(%rax),%xmm13
1161 movaps -0x58(%rax),%xmm14
1162 movaps -0x48(%rax),%xmm15
1173 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
# NOTE(review): fragment of the REX-prefix helper. The sub header and the
# lines that set up $dst, $src and $rex are not visible here; the callers
# below invoke it as rex(\@opcode,<dst>,<src>).
# Alias the array reference passed as first argument to @opcode, so the
# push below appends to the caller's byte list.
1178 local *opcode=shift;
# Registers numbered 8 and above need an extension bit: 0x04 for the
# destination, 0x01 for the source.
1182 $rex|=0x04 if($dst>=8);
1183 $rex|=0x01 if($src>=8);
# Emit the prefix byte (0x40 plus the collected bits) only when a bit is set.
1184 push @opcode,$rex|0x40 if($rex);
# NOTE(review): fragment of the AES-NI encoder. The sub header, the initial
# contents of @opcode, the %opcodelet declarations and the assignments to
# $c and $off are on lines not visible here.
# Form 1: aeskeygenassist with an immediate and two xmm registers.
1191 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1192 rex(\@opcode,$4,$3);
1193 push @opcode,0x0f,0x3a,0xdf;
1194 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
# $c is presumably the captured immediate (TODO confirm); a leading 0 makes
# oct() interpret it as octal or hex notation.
1196 push @opcode,$c=~/^0/?oct($c):$c;
1197 return ".byte\t".join(',',@opcode);
# Form 2: any aes* mnemonic with two xmm register operands.
1199 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
# Mnemonic to final opcode byte.
1202 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1203 "aesdec" => 0xde, "aesdeclast" => 0xdf
# Mnemonics missing from the table are not encoded.
1205 return undef if (!defined($opcodelet{$1}));
1206 rex(\@opcode,$3,$2);
1207 push @opcode,0x0f,0x38,$opcodelet{$1};
1208 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1209 return ".byte\t".join(',',@opcode);
# Form 3: aes* mnemonic with an offset(%rsp) memory source and an xmm
# destination.
1211 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
1213 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1214 "aesdec" => 0xde, "aesdeclast" => 0xdf
1216 return undef if (!defined($opcodelet{$1}));
# Prefix byte 0x44 is emitted by hand when the destination is xmm8 or above.
1218 push @opcode,0x44 if ($3>=8);
1219 push @opcode,0x0f,0x38,$opcodelet{$1};
1220 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M
# $off is presumably the captured displacement (TODO confirm); only its low
# 8 bits are emitted.
1221 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
1222 return ".byte\t".join(',',@opcode);
# Replace every backtick-quoted expression in the generated text with the
# result of evaluating it as Perl.
1227 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Hand each line fragment that starts at an "aes" word and reaches the last
# xmm register on the line to aesni(), and substitute whatever it returns;
# anything after that register on the line is dropped.
1228 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;