3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # Multi-buffer AES-NI procedures process several independent buffers
11 # in parallel by interleaving independent instructions.
13 # Cycles per byte for interleave factor 4:
16 # ---------------------------
17 # Westmere 5.00/4=1.25 5.13/4=1.28
18 # Atom 15.0/4=3.75 ?15.7/4=3.93
19 # Sandy Bridge 5.06/4=1.27 5.18/4=1.29
20 # Ivy Bridge 5.06/4=1.27 5.14/4=1.29
21 # Haswell 4.44/4=1.11 4.44/4=1.11
22 # Bulldozer 5.75/4=1.44 5.76/4=1.44
24 # Cycles per byte for interleave factor 8 (not implemented for
25 # pre-AVX processors, where higher interleave factor incidentally
26 # doesn't result in improvement):
29 # ---------------------------
30 # Sandy Bridge 5.06/8=0.64 7.10/8=0.89(*)
31 # Ivy Bridge 5.06/8=0.64 7.14/8=0.89(*)
32 # Haswell 5.00/8=0.63 5.00/8=0.63
33 # Bulldozer 5.75/8=0.72 5.77/8=0.72
35 # (*) Sandy/Ivy Bridge are known to handle high interleave factors
# A first argument that contains a dot is taken to be the output file name,
# not a flavour.
40 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 conventions apply to nasm/masm/mingw64 flavours, or when the output
# file name ends in .asm.
42 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Look for x86_64-xlate.pl in this script's own directory first, then under
# ../../perlasm relative to it; abort if neither file exists.
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
45 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
46 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
47 die "can't locate x86_64-xlate.pl";
# Probe the assembler for AVX support.  $avx ends up as 0, 1 or 2 according to
# how many of the two version thresholds the detected assembler meets; the AVX
# code paths further down are emitted only when it is non-zero.
# NOTE(review): the captured version is compared as a decimal number, not
# component-wise -- confirm this is acceptable for the versions in use.
51 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
52 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
53 $avx = ($1>=2.19) + ($1>=2.22);
# Nothing detected yet and targeting Win64 with NASM: ask nasm for its version.
56 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
57 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
58 $avx = ($1>=2.09) + ($1>=2.10);
# Same fallback for the Microsoft assembler (ml64), keyed on its major version.
61 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
62 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
63 $avx = ($1>=10) + ($1>=11);
# Pipe everything written to OUT through the x86_64-xlate.pl translator, run
# with the same perl binary.  Abort if the pipe cannot be started, so that a
# missing or broken translator does not silently yield an empty output file.
66 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Register assignments for the 4-lane (non-AVX) code path.
69 # void aesni_multi_cbc_encrypt (
70 # struct { void *inp,*out; int blocks; double iv[2]; } inp[8];
72 # int num); /* 1 or 2 */
74 $inp="%rdi"; # 1st arg
75 $key="%rsi"; # 2nd arg
78 @inptr=map("%r$_",(8..11)); # per-lane input pointers
79 @outptr=map("%r$_",(12..15)); # per-lane output pointers
81 ($rndkey0,$rndkey1)=("%xmm0","%xmm1"); # two alternating round-key registers
82 @out=map("%xmm$_",(2..5)); # per-lane cipher state (initially the IV)
83 @inp=map("%xmm$_",(6..9)); # per-lane input blocks
84 ($counters,$mask,$zero)=map("%xmm$_",(10..12)); # block counters, scratch copy of them, 0-round key
86 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx"); # rounds value from the key, constant 1, sink pointer, index added to lane pointers
91 .extern OPENSSL_ia32cap_P
93 .globl aesni_multi_cbc_encrypt
94 .type aesni_multi_cbc_encrypt,\@function,3
96 aesni_multi_cbc_encrypt:
98 $code.=<<___ if ($avx);
101 mov OPENSSL_ia32cap_P+4(%rip),%ecx
102 test \$`1<<28`,%ecx # AVX bit
103 jnz _avx_cbc_enc_shortcut
117 $code.=<<___ if ($win64);
120 movaps %xmm7,0x10(%rsp)
121 movaps %xmm8,0x20(%rsp)
122 movaps %xmm9,0x30(%rsp)
123 movaps %xmm10,0x40(%rsp)
124 movaps %xmm11,0x50(%rsp)
125 movaps %xmm12,0x60(%rsp)
126 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
127 movaps %xmm14,-0x58(%rax)
128 movaps %xmm15,-0x48(%rax)
134 # +16 input sink [original %rsp and $num]
139 mov %rax,16(%rsp) # original %rsp
142 movdqu ($key),$zero # 0-round key
143 lea 0x78($key),$key # size optimization
147 mov $num,24(%rsp) # original $num
# Read the four lane descriptors, which are 40 bytes apart: block count,
# input pointer, output pointer and IV.
150 for($i=0;$i<4;$i++) {
152 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
153 mov `40*$i+0-40*2`($inp),@inptr[$i] # input pointer
155 mov `40*$i+8-40*2`($inp),@outptr[$i] # output pointer
156 cmovg $one,$num # find maximum
158 movdqu `40*$i+24-40*2`($inp),@out[$i] # load IV
159 mov $one,`32+4*$i`(%rsp) # initialize counters
160 cmovle %rsp,@inptr[$i] # cancel input
167 movups 0x10-0x78($key),$rndkey1
169 movups 0x20-0x78($key),$rndkey0
171 mov 0xf0-0x78($key),$rounds
173 movdqu (@inptr[0]),@inp[0] # load inputs
175 movdqu (@inptr[1]),@inp[1]
177 movdqu (@inptr[2]),@inp[2]
179 movdqu (@inptr[3]),@inp[3]
182 movdqa 32(%rsp),$counters # load counters
189 lea 16(%rsp),$sink # sink pointer
190 mov \$1,$one # constant of 1
193 aesenc $rndkey1,@out[0]
194 prefetcht0 31(@inptr[0],$offset) # prefetch input
195 prefetcht0 31(@inptr[1],$offset)
196 aesenc $rndkey1,@out[1]
197 prefetcht0 31(@inptr[2],$offset)
198 prefetcht0 31(@inptr[3],$offset) # lane 3; lane 2 used to be prefetched twice here
199 aesenc $rndkey1,@out[2]
200 aesenc $rndkey1,@out[3]
201 movups 0x30-0x78($key),$rndkey1 # fetch the round key at 0x30 for a later round
# Four further rounds.  Iteration i also compares the block counter of lane i
# with the constant 1 and conditionally redirects that lane's pointers to the
# sink.
203 for($i=0;$i<4;$i++) {
204 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0; # alternate between the two round-key registers
206 cmp `32+4*$i`(%rsp),$one
207 aesenc $rndkey,@out[0]
208 aesenc $rndkey,@out[1]
209 aesenc $rndkey,@out[2]
210 cmovge $sink,@inptr[$i] # cancel input
211 cmovg $sink,@outptr[$i] # sink output
212 aesenc $rndkey,@out[3]
213 movups `0x40+16*$i-0x78`($key),$rndkey
217 movdqa $counters,$mask
218 aesenc $rndkey0,@out[0]
219 prefetcht0 15(@outptr[0],$offset) # prefetch output
220 prefetcht0 15(@outptr[1],$offset)
221 aesenc $rndkey0,@out[1]
222 prefetcht0 15(@outptr[2],$offset)
223 prefetcht0 15(@outptr[3],$offset)
224 aesenc $rndkey0,@out[2]
225 aesenc $rndkey0,@out[3]
226 movups 0x80-0x78($key),$rndkey0
229 aesenc $rndkey1,@out[0]
231 movdqu -0x78($key),$zero # reload 0-round key
232 aesenc $rndkey1,@out[1]
233 paddd $mask,$counters # decrement counters
234 movdqa $counters,32(%rsp) # update counters
235 aesenc $rndkey1,@out[2]
236 aesenc $rndkey1,@out[3]
237 movups 0x90-0x78($key),$rndkey1
241 aesenc $rndkey0,@out[0]
242 aesenc $rndkey0,@out[1]
243 aesenc $rndkey0,@out[2]
244 aesenc $rndkey0,@out[3]
245 movups 0xa0-0x78($key),$rndkey0
249 aesenc $rndkey1,@out[0]
250 aesenc $rndkey1,@out[1]
251 aesenc $rndkey1,@out[2]
252 aesenc $rndkey1,@out[3]
253 movups 0xb0-0x78($key),$rndkey1
255 aesenc $rndkey0,@out[0]
256 aesenc $rndkey0,@out[1]
257 aesenc $rndkey0,@out[2]
258 aesenc $rndkey0,@out[3]
259 movups 0xc0-0x78($key),$rndkey0
263 aesenc $rndkey1,@out[0]
264 aesenc $rndkey1,@out[1]
265 aesenc $rndkey1,@out[2]
266 aesenc $rndkey1,@out[3]
267 movups 0xd0-0x78($key),$rndkey1
269 aesenc $rndkey0,@out[0]
270 aesenc $rndkey0,@out[1]
271 aesenc $rndkey0,@out[2]
272 aesenc $rndkey0,@out[3]
273 movups 0xe0-0x78($key),$rndkey0
278 aesenc $rndkey1,@out[0]
279 aesenc $rndkey1,@out[1]
280 aesenc $rndkey1,@out[2]
281 aesenc $rndkey1,@out[3]
282 movdqu (@inptr[0],$offset),@inp[0]
283 movdqu 0x10-0x78($key),$rndkey1
285 aesenclast $rndkey0,@out[0]
286 movdqu (@inptr[1],$offset),@inp[1]
288 aesenclast $rndkey0,@out[1]
289 movdqu (@inptr[2],$offset),@inp[2]
291 aesenclast $rndkey0,@out[2]
292 movdqu (@inptr[3],$offset),@inp[3]
294 aesenclast $rndkey0,@out[3]
295 movdqu 0x20-0x78($key),$rndkey0
298 movups @out[0],-16(@outptr[0],$offset)
300 movups @out[1],-16(@outptr[1],$offset)
302 movups @out[2],-16(@outptr[2],$offset)
304 movups @out[3],-16(@outptr[3],$offset)
310 mov 16(%rsp),%rax # original %rsp
313 #pxor @inp[0],@out[0]
314 #pxor @inp[1],@out[1]
315 #movdqu @out[0],`40*0+24-40*2`($inp) # output iv FIX ME!
316 #pxor @inp[2],@out[2]
317 #movdqu @out[1],`40*1+24-40*2`($inp)
318 #pxor @inp[3],@out[3]
319 #movdqu @out[2],`40*2+24-40*2`($inp) # won't fix, let caller
320 #movdqu @out[3],`40*3+24-40*2`($inp) # figure this out...
322 lea `40*4`($inp),$inp
324 jnz .Lenc4x_loop_grande
328 $code.=<<___ if ($win64);
329 movaps -0xd8(%rax),%xmm6
330 movaps -0xc8(%rax),%xmm7
331 movaps -0xb8(%rax),%xmm8
332 movaps -0xa8(%rax),%xmm9
333 movaps -0x98(%rax),%xmm10
334 movaps -0x88(%rax),%xmm11
335 movaps -0x78(%rax),%xmm12
336 #movaps -0x68(%rax),%xmm13
337 #movaps -0x58(%rax),%xmm14
338 #movaps -0x48(%rax),%xmm15
350 .size aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
352 .globl aesni_multi_cbc_decrypt
353 .type aesni_multi_cbc_decrypt,\@function,3
355 aesni_multi_cbc_decrypt:
357 $code.=<<___ if ($avx);
360 mov OPENSSL_ia32cap_P+4(%rip),%ecx
361 test \$`1<<28`,%ecx # AVX bit
362 jnz _avx_cbc_dec_shortcut
376 $code.=<<___ if ($win64);
379 movaps %xmm7,0x10(%rsp)
380 movaps %xmm8,0x20(%rsp)
381 movaps %xmm9,0x30(%rsp)
382 movaps %xmm10,0x40(%rsp)
383 movaps %xmm11,0x50(%rsp)
384 movaps %xmm12,0x60(%rsp)
385 movaps %xmm13,-0x68(%rax) # not used, saved to share se_handler
386 movaps %xmm14,-0x58(%rax)
387 movaps %xmm15,-0x48(%rax)
393 # +16 input sink [original %rsp and $num]
398 mov %rax,16(%rsp) # original %rsp
401 movdqu ($key),$zero # 0-round key
402 lea 0x78($key),$key # size optimization
406 mov $num,24(%rsp) # original $num
409 for($i=0;$i<4;$i++) {
411 mov `40*$i+16-40*2`($inp),$one # borrow $one for number of blocks
412 mov `40*$i+0-40*2`($inp),@inptr[$i]
414 mov `40*$i+8-40*2`($inp),@outptr[$i]
415 cmovg $one,$num # find maximum
417 movdqu `40*$i+24-40*2`($inp),@inp[$i] # load IV
418 mov $one,`32+4*$i`(%rsp) # initialize counters
419 cmovle %rsp,@inptr[$i] # cancel input
426 movups 0x10-0x78($key),$rndkey1
427 movups 0x20-0x78($key),$rndkey0
428 mov 0xf0-0x78($key),$rounds
429 movdqu (@inptr[0]),@out[0] # load inputs
430 movdqu (@inptr[1]),@out[1]
432 movdqu (@inptr[2]),@out[2]
434 movdqu (@inptr[3]),@out[3]
437 movdqa 32(%rsp),$counters # load counters
444 lea 16(%rsp),$sink # sink pointer
445 mov \$1,$one # constant of 1
448 aesdec $rndkey1,@out[0]
449 prefetcht0 31(@inptr[0],$offset) # prefetch input
450 prefetcht0 31(@inptr[1],$offset)
451 aesdec $rndkey1,@out[1]
452 prefetcht0 31(@inptr[2],$offset)
453 prefetcht0 31(@inptr[3],$offset)
454 aesdec $rndkey1,@out[2]
455 aesdec $rndkey1,@out[3]
456 movups 0x30-0x78($key),$rndkey1
458 for($i=0;$i<4;$i++) {
459 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;
461 cmp `32+4*$i`(%rsp),$one
462 aesdec $rndkey,@out[0]
463 aesdec $rndkey,@out[1]
464 aesdec $rndkey,@out[2]
465 cmovge $sink,@inptr[$i] # cancel input
466 cmovg $sink,@outptr[$i] # sink output
467 aesdec $rndkey,@out[3]
468 movups `0x40+16*$i-0x78`($key),$rndkey
472 movdqa $counters,$mask
473 aesdec $rndkey0,@out[0]
474 prefetcht0 15(@outptr[0],$offset) # prefetch output
475 prefetcht0 15(@outptr[1],$offset)
476 aesdec $rndkey0,@out[1]
477 prefetcht0 15(@outptr[2],$offset)
478 prefetcht0 15(@outptr[3],$offset)
479 aesdec $rndkey0,@out[2]
480 aesdec $rndkey0,@out[3]
481 movups 0x80-0x78($key),$rndkey0
484 aesdec $rndkey1,@out[0]
486 movdqu -0x78($key),$zero # reload 0-round key
487 aesdec $rndkey1,@out[1]
488 paddd $mask,$counters # decrement counters
489 movdqa $counters,32(%rsp) # update counters
490 aesdec $rndkey1,@out[2]
491 aesdec $rndkey1,@out[3]
492 movups 0x90-0x78($key),$rndkey1
496 aesdec $rndkey0,@out[0]
497 aesdec $rndkey0,@out[1]
498 aesdec $rndkey0,@out[2]
499 aesdec $rndkey0,@out[3]
500 movups 0xa0-0x78($key),$rndkey0
504 aesdec $rndkey1,@out[0]
505 aesdec $rndkey1,@out[1]
506 aesdec $rndkey1,@out[2]
507 aesdec $rndkey1,@out[3]
508 movups 0xb0-0x78($key),$rndkey1
510 aesdec $rndkey0,@out[0]
511 aesdec $rndkey0,@out[1]
512 aesdec $rndkey0,@out[2]
513 aesdec $rndkey0,@out[3]
514 movups 0xc0-0x78($key),$rndkey0
518 aesdec $rndkey1,@out[0]
519 aesdec $rndkey1,@out[1]
520 aesdec $rndkey1,@out[2]
521 aesdec $rndkey1,@out[3]
522 movups 0xd0-0x78($key),$rndkey1
524 aesdec $rndkey0,@out[0]
525 aesdec $rndkey0,@out[1]
526 aesdec $rndkey0,@out[2]
527 aesdec $rndkey0,@out[3]
528 movups 0xe0-0x78($key),$rndkey0
533 aesdec $rndkey1,@out[0]
534 aesdec $rndkey1,@out[1]
535 aesdec $rndkey1,@out[2]
536 pxor $rndkey0,@inp[0]
537 pxor $rndkey0,@inp[1]
538 aesdec $rndkey1,@out[3]
539 movdqu 0x10-0x78($key),$rndkey1
540 pxor $rndkey0,@inp[2]
541 pxor $rndkey0,@inp[3]
542 movdqu 0x20-0x78($key),$rndkey0
544 aesdeclast @inp[0],@out[0]
545 aesdeclast @inp[1],@out[1]
546 movdqu -16(@inptr[0],$offset),@inp[0] # load next IV
547 movdqu -16(@inptr[1],$offset),@inp[1]
548 aesdeclast @inp[2],@out[2]
549 aesdeclast @inp[3],@out[3]
550 movdqu -16(@inptr[2],$offset),@inp[2]
551 movdqu -16(@inptr[3],$offset),@inp[3]
553 movups @out[0],-16(@outptr[0],$offset)
554 movdqu (@inptr[0],$offset),@out[0]
555 movups @out[1],-16(@outptr[1],$offset)
556 movdqu (@inptr[1],$offset),@out[1]
558 movups @out[2],-16(@outptr[2],$offset)
559 movdqu (@inptr[2],$offset),@out[2]
561 movups @out[3],-16(@outptr[3],$offset)
562 movdqu (@inptr[3],$offset),@out[3]
569 mov 16(%rsp),%rax # original %rsp
572 lea `40*4`($inp),$inp
574 jnz .Ldec4x_loop_grande
578 $code.=<<___ if ($win64);
579 movaps -0xd8(%rax),%xmm6
580 movaps -0xc8(%rax),%xmm7
581 movaps -0xb8(%rax),%xmm8
582 movaps -0xa8(%rax),%xmm9
583 movaps -0x98(%rax),%xmm10
584 movaps -0x88(%rax),%xmm11
585 movaps -0x78(%rax),%xmm12
586 #movaps -0x68(%rax),%xmm13
587 #movaps -0x58(%rax),%xmm14
588 #movaps -0x48(%rax),%xmm15
600 .size aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
# Register assignments for the 8-lane AVX code path.
604 my @ptr=map("%r$_",(8..15)); # one pointer per lane; switched between input and output use
607 my @out=map("%xmm$_",(2..9)); # per-lane cipher state
608 my @inp=map("%xmm$_",(10..13)); # four input registers, reused as lane index modulo 4
609 my ($counters,$zero)=("%xmm14","%xmm15"); # block counters and 0-round key
612 .type aesni_multi_cbc_encrypt_avx,\@function,3
614 aesni_multi_cbc_encrypt_avx:
615 _avx_cbc_enc_shortcut:
624 $code.=<<___ if ($win64);
627 movaps %xmm7,0x10(%rsp)
628 movaps %xmm8,0x20(%rsp)
629 movaps %xmm9,0x30(%rsp)
630 movaps %xmm10,0x40(%rsp)
631 movaps %xmm11,0x50(%rsp)
632 movaps %xmm12,-0x78(%rax)
633 movaps %xmm13,-0x68(%rax)
634 movaps %xmm14,-0x58(%rax)
635 movaps %xmm15,-0x48(%rax)
641 # +16 input sink [original %rsp and $num]
643 # +64 distances between inputs and outputs
644 # +128 off-load area for @inp[0..3]
648 mov %rax,16(%rsp) # original %rsp
652 vmovdqu ($key),$zero # 0-round key
653 lea 0x78($key),$key # size optimization
658 #mov $num,24(%rsp) # original $num
661 for($i=0;$i<8;$i++) {
662 my $temp = $i ? $offload : $offset;
664 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
665 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
667 mov `40*$i+8-40*4`($inp),$temp # output pointer
668 cmovg $one,$num # find maximum
670 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
671 mov $one,`32+4*$i`(%rsp) # initialize counters
672 cmovle %rsp,@ptr[$i] # cancel input
673 sub @ptr[$i],$temp # distance between input and output
674 mov $temp,`64+8*$i`(%rsp) # initialize distances
681 vmovups 0x10-0x78($key),$rndkey1
682 vmovups 0x20-0x78($key),$rndkey0
683 mov 0xf0-0x78($key),$rounds
685 vpxor (@ptr[0]),$zero,@inp[0] # load inputs and xor with 0-round
686 lea 128(%rsp),$offload # offload area
687 vpxor (@ptr[1]),$zero,@inp[1]
688 vpxor (@ptr[2]),$zero,@inp[2]
689 vpxor (@ptr[3]),$zero,@inp[3]
690 vpxor @inp[0],@out[0],@out[0]
691 vpxor (@ptr[4]),$zero,@inp[0]
692 vpxor @inp[1],@out[1],@out[1]
693 vpxor (@ptr[5]),$zero,@inp[1]
694 vpxor @inp[2],@out[2],@out[2]
695 vpxor (@ptr[6]),$zero,@inp[2]
696 vpxor @inp[3],@out[3],@out[3]
697 vpxor (@ptr[7]),$zero,@inp[3]
698 vpxor @inp[0],@out[4],@out[4]
699 mov \$1,$one # constant of 1
700 vpxor @inp[1],@out[5],@out[5]
701 vpxor @inp[2],@out[6],@out[6]
702 vpxor @inp[3],@out[7],@out[7]
708 for($i=0;$i<8;$i++) {
709 my $rndkey=($i&1)?$rndkey0:$rndkey1;
711 vaesenc $rndkey,@out[0],@out[0]
712 cmp 32+4*$i(%rsp),$one
714 $code.=<<___ if ($i);
715 mov 64+8*$i(%rsp),$offset
718 vaesenc $rndkey,@out[1],@out[1]
719 prefetcht0 31(@ptr[$i]) # prefetch input
720 vaesenc $rndkey,@out[2],@out[2]
722 $code.=<<___ if ($i>1);
723 prefetcht0 15(@ptr[$i-2]) # prefetch output
726 vaesenc $rndkey,@out[3],@out[3]
727 lea (@ptr[$i],$offset),$offset
728 cmovge %rsp,@ptr[$i] # cancel input
729 vaesenc $rndkey,@out[4],@out[4]
730 cmovg %rsp,$offset # sink output
731 vaesenc $rndkey,@out[5],@out[5]
733 vaesenc $rndkey,@out[6],@out[6]
734 vpxor 16(@ptr[$i]),$zero,@inp[$i%4] # load input and xor with 0-round
735 mov $offset,64+8*$i(%rsp)
736 vaesenc $rndkey,@out[7],@out[7]
737 vmovups `16*(3+$i)-0x78`($key),$rndkey
738 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
740 $code.=<<___ if ($i<4); # only lanes 0-3 spill their next input; lanes 4-7 reuse the same registers afterwards
741 vmovdqu @inp[$i%4],`16*$i`($offload) # off-load
745 vmovdqu 32(%rsp),$counters
746 prefetcht0 15(@ptr[$i-2]) # prefetch output
747 prefetcht0 15(@ptr[$i-1])
751 vaesenc $rndkey1,@out[0],@out[0]
752 vaesenc $rndkey1,@out[1],@out[1]
753 vaesenc $rndkey1,@out[2],@out[2]
754 vaesenc $rndkey1,@out[3],@out[3]
755 vaesenc $rndkey1,@out[4],@out[4]
756 vaesenc $rndkey1,@out[5],@out[5]
757 vaesenc $rndkey1,@out[6],@out[6]
758 vaesenc $rndkey1,@out[7],@out[7]
759 vmovups 0xb0-0x78($key),$rndkey1
761 vaesenc $rndkey0,@out[0],@out[0]
762 vaesenc $rndkey0,@out[1],@out[1]
763 vaesenc $rndkey0,@out[2],@out[2]
764 vaesenc $rndkey0,@out[3],@out[3]
765 vaesenc $rndkey0,@out[4],@out[4]
766 vaesenc $rndkey0,@out[5],@out[5]
767 vaesenc $rndkey0,@out[6],@out[6]
768 vaesenc $rndkey0,@out[7],@out[7]
769 vmovups 0xc0-0x78($key),$rndkey0
772 vaesenc $rndkey1,@out[0],@out[0]
773 vaesenc $rndkey1,@out[1],@out[1]
774 vaesenc $rndkey1,@out[2],@out[2]
775 vaesenc $rndkey1,@out[3],@out[3]
776 vaesenc $rndkey1,@out[4],@out[4]
777 vaesenc $rndkey1,@out[5],@out[5]
778 vaesenc $rndkey1,@out[6],@out[6]
779 vaesenc $rndkey1,@out[7],@out[7]
780 vmovups 0xd0-0x78($key),$rndkey1
782 vaesenc $rndkey0,@out[0],@out[0]
783 vaesenc $rndkey0,@out[1],@out[1]
784 vaesenc $rndkey0,@out[2],@out[2]
785 vaesenc $rndkey0,@out[3],@out[3]
786 vaesenc $rndkey0,@out[4],@out[4]
787 vaesenc $rndkey0,@out[5],@out[5]
788 vaesenc $rndkey0,@out[6],@out[6]
789 vaesenc $rndkey0,@out[7],@out[7]
790 vmovups 0xe0-0x78($key),$rndkey0
793 vaesenc $rndkey1,@out[0],@out[0]
794 vpxor $zero,$zero,$zero
795 vaesenc $rndkey1,@out[1],@out[1]
796 vaesenc $rndkey1,@out[2],@out[2]
797 vpcmpgtd $zero,$counters,$zero
798 vaesenc $rndkey1,@out[3],@out[3]
799 vaesenc $rndkey1,@out[4],@out[4]
800 vpaddd $counters,$zero,$zero # decrement counters
801 vmovdqu 48(%rsp),$counters
802 vaesenc $rndkey1,@out[5],@out[5]
803 mov 64(%rsp),$offset # pre-load 1st offset
804 vaesenc $rndkey1,@out[6],@out[6]
805 vaesenc $rndkey1,@out[7],@out[7]
806 vmovups 0x10-0x78($key),$rndkey1
808 vaesenclast $rndkey0,@out[0],@out[0]
809 vmovdqa $zero,32(%rsp) # update counters
810 vpxor $zero,$zero,$zero
811 vaesenclast $rndkey0,@out[1],@out[1]
812 vaesenclast $rndkey0,@out[2],@out[2]
813 vpcmpgtd $zero,$counters,$zero
814 vaesenclast $rndkey0,@out[3],@out[3]
815 vaesenclast $rndkey0,@out[4],@out[4]
816 vpaddd $zero,$counters,$counters # decrement counters
817 vmovdqu -0x78($key),$zero # 0-round
818 vaesenclast $rndkey0,@out[5],@out[5]
819 vaesenclast $rndkey0,@out[6],@out[6]
820 vmovdqa $counters,48(%rsp) # update counters
821 vaesenclast $rndkey0,@out[7],@out[7]
822 vmovups 0x20-0x78($key),$rndkey0
824 vmovups @out[0],-16(@ptr[0]) # write output
825 sub $offset,@ptr[0] # switch to input
826 vpxor 0x00($offload),@out[0],@out[0]
827 vmovups @out[1],-16(@ptr[1])
828 sub `64+1*8`(%rsp),@ptr[1]
829 vpxor 0x10($offload),@out[1],@out[1]
830 vmovups @out[2],-16(@ptr[2])
831 sub `64+2*8`(%rsp),@ptr[2]
832 vpxor 0x20($offload),@out[2],@out[2]
833 vmovups @out[3],-16(@ptr[3])
834 sub `64+3*8`(%rsp),@ptr[3]
835 vpxor 0x30($offload),@out[3],@out[3]
836 vmovups @out[4],-16(@ptr[4])
837 sub `64+4*8`(%rsp),@ptr[4]
838 vpxor @inp[0],@out[4],@out[4]
839 vmovups @out[5],-16(@ptr[5])
840 sub `64+5*8`(%rsp),@ptr[5]
841 vpxor @inp[1],@out[5],@out[5]
842 vmovups @out[6],-16(@ptr[6])
843 sub `64+6*8`(%rsp),@ptr[6]
844 vpxor @inp[2],@out[6],@out[6]
845 vmovups @out[7],-16(@ptr[7])
846 sub `64+7*8`(%rsp),@ptr[7]
847 vpxor @inp[3],@out[7],@out[7]
852 mov 16(%rsp),%rax # original %rsp
854 #lea `40*8`($inp),$inp
856 #jnz .Lenc8x_loop_grande
861 $code.=<<___ if ($win64);
862 movaps -0xd8(%rax),%xmm6
863 movaps -0xc8(%rax),%xmm7
864 movaps -0xb8(%rax),%xmm8
865 movaps -0xa8(%rax),%xmm9
866 movaps -0x98(%rax),%xmm10
867 movaps -0x88(%rax),%xmm11
868 movaps -0x78(%rax),%xmm12
869 movaps -0x68(%rax),%xmm13
870 movaps -0x58(%rax),%xmm14
871 movaps -0x48(%rax),%xmm15
883 .size aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
885 .type aesni_multi_cbc_decrypt_avx,\@function,3
887 aesni_multi_cbc_decrypt_avx:
888 _avx_cbc_dec_shortcut:
897 $code.=<<___ if ($win64);
900 movaps %xmm7,0x10(%rsp)
901 movaps %xmm8,0x20(%rsp)
902 movaps %xmm9,0x30(%rsp)
903 movaps %xmm10,0x40(%rsp)
904 movaps %xmm11,0x50(%rsp)
905 movaps %xmm12,-0x78(%rax)
906 movaps %xmm13,-0x68(%rax)
907 movaps %xmm14,-0x58(%rax)
908 movaps %xmm15,-0x48(%rax)
914 # +16 input sink [original %rsp and $num]
916 # +64 distances between inputs and outputs
917 # +128 off-load area for @inp[0..3]
918 # +192 IV/input offload
923 mov %rax,16(%rsp) # original %rsp
927 vmovdqu ($key),$zero # 0-round key
928 lea 0x78($key),$key # size optimization
933 #mov $num,24(%rsp) # original $num
936 for($i=0;$i<8;$i++) {
937 my $temp = $i ? $offload : $offset;
939 mov `40*$i+16-40*4`($inp),$one # borrow $one for number of blocks
940 mov `40*$i+0-40*4`($inp),@ptr[$i] # input pointer
942 mov `40*$i+8-40*4`($inp),$temp # output pointer
943 cmovg $one,$num # find maximum
945 vmovdqu `40*$i+24-40*4`($inp),@out[$i] # load IV
946 mov $one,`32+4*$i`(%rsp) # initialize counters
947 cmovle %rsp,@ptr[$i] # cancel input
948 sub @ptr[$i],$temp # distance between input and output
949 mov $temp,`64+8*$i`(%rsp) # initialize distances
950 vmovdqu @out[$i],`192+16*$i`(%rsp) # offload IV
957 vmovups 0x10-0x78($key),$rndkey1
958 vmovups 0x20-0x78($key),$rndkey0
959 mov 0xf0-0x78($key),$rounds
960 lea 192+128(%rsp),$offload # offload area
962 vmovdqu (@ptr[0]),@out[0] # load inputs
963 vmovdqu (@ptr[1]),@out[1]
964 vmovdqu (@ptr[2]),@out[2]
965 vmovdqu (@ptr[3]),@out[3]
966 vmovdqu (@ptr[4]),@out[4]
967 vmovdqu (@ptr[5]),@out[5]
968 vmovdqu (@ptr[6]),@out[6]
969 vmovdqu (@ptr[7]),@out[7]
970 vmovdqu @out[0],0x00($offload) # offload inputs
971 vpxor $zero,@out[0],@out[0] # xor inputs with 0-round
972 vmovdqu @out[1],0x10($offload)
973 vpxor $zero,@out[1],@out[1]
974 vmovdqu @out[2],0x20($offload)
975 vpxor $zero,@out[2],@out[2]
976 vmovdqu @out[3],0x30($offload)
977 vpxor $zero,@out[3],@out[3]
978 vmovdqu @out[4],0x40($offload)
979 vpxor $zero,@out[4],@out[4]
980 vmovdqu @out[5],0x50($offload)
981 vpxor $zero,@out[5],@out[5]
982 vmovdqu @out[6],0x60($offload)
983 vpxor $zero,@out[6],@out[6]
984 vmovdqu @out[7],0x70($offload)
985 vpxor $zero,@out[7],@out[7]
987 mov \$1,$one # constant of 1
993 for($i=0;$i<8;$i++) {
994 my $rndkey=($i&1)?$rndkey0:$rndkey1;
996 vaesdec $rndkey,@out[0],@out[0]
997 cmp 32+4*$i(%rsp),$one
999 $code.=<<___ if ($i);
1000 mov 64+8*$i(%rsp),$offset
1003 vaesdec $rndkey,@out[1],@out[1]
1004 prefetcht0 31(@ptr[$i]) # prefetch input
1005 vaesdec $rndkey,@out[2],@out[2]
1007 $code.=<<___ if ($i>1);
1008 prefetcht0 15(@ptr[$i-2]) # prefetch output
1011 vaesdec $rndkey,@out[3],@out[3]
1012 lea (@ptr[$i],$offset),$offset
1013 cmovge %rsp,@ptr[$i] # cancel input
1014 vaesdec $rndkey,@out[4],@out[4]
1015 cmovg %rsp,$offset # sink output
1016 vaesdec $rndkey,@out[5],@out[5]
1017 sub @ptr[$i],$offset
1018 vaesdec $rndkey,@out[6],@out[6]
1019 vmovdqu 16(@ptr[$i]),@inp[$i%4] # load input
1020 mov $offset,64+8*$i(%rsp)
1021 vaesdec $rndkey,@out[7],@out[7]
1022 vmovups `16*(3+$i)-0x78`($key),$rndkey
1023 lea 16(@ptr[$i],$offset),@ptr[$i] # switch to output
1025 $code.=<<___ if ($i<4);
1026 vmovdqu @inp[$i%4],`128+16*$i`(%rsp) # off-load
1030 vmovdqu 32(%rsp),$counters
1031 prefetcht0 15(@ptr[$i-2]) # prefetch output
1032 prefetcht0 15(@ptr[$i-1])
1036 vaesdec $rndkey1,@out[0],@out[0]
1037 vaesdec $rndkey1,@out[1],@out[1]
1038 vaesdec $rndkey1,@out[2],@out[2]
1039 vaesdec $rndkey1,@out[3],@out[3]
1040 vaesdec $rndkey1,@out[4],@out[4]
1041 vaesdec $rndkey1,@out[5],@out[5]
1042 vaesdec $rndkey1,@out[6],@out[6]
1043 vaesdec $rndkey1,@out[7],@out[7]
1044 vmovups 0xb0-0x78($key),$rndkey1
1046 vaesdec $rndkey0,@out[0],@out[0]
1047 vaesdec $rndkey0,@out[1],@out[1]
1048 vaesdec $rndkey0,@out[2],@out[2]
1049 vaesdec $rndkey0,@out[3],@out[3]
1050 vaesdec $rndkey0,@out[4],@out[4]
1051 vaesdec $rndkey0,@out[5],@out[5]
1052 vaesdec $rndkey0,@out[6],@out[6]
1053 vaesdec $rndkey0,@out[7],@out[7]
1054 vmovups 0xc0-0x78($key),$rndkey0
1057 vaesdec $rndkey1,@out[0],@out[0]
1058 vaesdec $rndkey1,@out[1],@out[1]
1059 vaesdec $rndkey1,@out[2],@out[2]
1060 vaesdec $rndkey1,@out[3],@out[3]
1061 vaesdec $rndkey1,@out[4],@out[4]
1062 vaesdec $rndkey1,@out[5],@out[5]
1063 vaesdec $rndkey1,@out[6],@out[6]
1064 vaesdec $rndkey1,@out[7],@out[7]
1065 vmovups 0xd0-0x78($key),$rndkey1
1067 vaesdec $rndkey0,@out[0],@out[0]
1068 vaesdec $rndkey0,@out[1],@out[1]
1069 vaesdec $rndkey0,@out[2],@out[2]
1070 vaesdec $rndkey0,@out[3],@out[3]
1071 vaesdec $rndkey0,@out[4],@out[4]
1072 vaesdec $rndkey0,@out[5],@out[5]
1073 vaesdec $rndkey0,@out[6],@out[6]
1074 vaesdec $rndkey0,@out[7],@out[7]
1075 vmovups 0xe0-0x78($key),$rndkey0
1078 vaesdec $rndkey1,@out[0],@out[0]
1079 vpxor $zero,$zero,$zero
1080 vaesdec $rndkey1,@out[1],@out[1]
1081 vaesdec $rndkey1,@out[2],@out[2]
1082 vpcmpgtd $zero,$counters,$zero
1083 vaesdec $rndkey1,@out[3],@out[3]
1084 vaesdec $rndkey1,@out[4],@out[4]
1085 vpaddd $counters,$zero,$zero # decrement counters
1086 vmovdqu 48(%rsp),$counters
1087 vaesdec $rndkey1,@out[5],@out[5]
1088 mov 64(%rsp),$offset # pre-load 1st offset
1089 vaesdec $rndkey1,@out[6],@out[6]
1090 vaesdec $rndkey1,@out[7],@out[7]
1091 vmovups 0x10-0x78($key),$rndkey1
1093 vaesdeclast $rndkey0,@out[0],@out[0]
1094 vmovdqa $zero,32(%rsp) # update counters
1095 vpxor $zero,$zero,$zero
1096 vaesdeclast $rndkey0,@out[1],@out[1]
1097 vpxor 0x00($offload),@out[0],@out[0] # xor with IV
1098 vaesdeclast $rndkey0,@out[2],@out[2]
1099 vpxor 0x10($offload),@out[1],@out[1]
1100 vpcmpgtd $zero,$counters,$zero
1101 vaesdeclast $rndkey0,@out[3],@out[3]
1102 vpxor 0x20($offload),@out[2],@out[2]
1103 vaesdeclast $rndkey0,@out[4],@out[4]
1104 vpxor 0x30($offload),@out[3],@out[3]
1105 vpaddd $zero,$counters,$counters # decrement counters
1106 vmovdqu -0x78($key),$zero # 0-round
1107 vaesdeclast $rndkey0,@out[5],@out[5]
1108 vpxor 0x40($offload),@out[4],@out[4]
1109 vaesdeclast $rndkey0,@out[6],@out[6]
1110 vpxor 0x50($offload),@out[5],@out[5]
1111 vmovdqa $counters,48(%rsp) # update counters
1112 vaesdeclast $rndkey0,@out[7],@out[7]
1113 vpxor 0x60($offload),@out[6],@out[6]
1114 vmovups 0x20-0x78($key),$rndkey0
1116 vmovups @out[0],-16(@ptr[0]) # write output
1117 sub $offset,@ptr[0] # switch to input
1118 vmovdqu 128+0(%rsp),@out[0]
1119 vpxor 0x70($offload),@out[7],@out[7]
1120 vmovups @out[1],-16(@ptr[1])
1121 sub `64+1*8`(%rsp),@ptr[1]
1122 vmovdqu @out[0],0x00($offload)
1123 vpxor $zero,@out[0],@out[0]
1124 vmovdqu 128+16(%rsp),@out[1]
1125 vmovups @out[2],-16(@ptr[2])
1126 sub `64+2*8`(%rsp),@ptr[2]
1127 vmovdqu @out[1],0x10($offload)
1128 vpxor $zero,@out[1],@out[1]
1129 vmovdqu 128+32(%rsp),@out[2]
1130 vmovups @out[3],-16(@ptr[3])
1131 sub `64+3*8`(%rsp),@ptr[3]
1132 vmovdqu @out[2],0x20($offload)
1133 vpxor $zero,@out[2],@out[2]
1134 vmovdqu 128+48(%rsp),@out[3]
1135 vmovups @out[4],-16(@ptr[4])
1136 sub `64+4*8`(%rsp),@ptr[4]
1137 vmovdqu @out[3],0x30($offload)
1138 vpxor $zero,@out[3],@out[3]
1139 vmovdqu @inp[0],0x40($offload)
1140 vpxor @inp[0],$zero,@out[4]
1141 vmovups @out[5],-16(@ptr[5])
1142 sub `64+5*8`(%rsp),@ptr[5]
1143 vmovdqu @inp[1],0x50($offload)
1144 vpxor @inp[1],$zero,@out[5]
1145 vmovups @out[6],-16(@ptr[6])
1146 sub `64+6*8`(%rsp),@ptr[6]
1147 vmovdqu @inp[2],0x60($offload)
1148 vpxor @inp[2],$zero,@out[6]
1149 vmovups @out[7],-16(@ptr[7])
1150 sub `64+7*8`(%rsp),@ptr[7]
1151 vmovdqu @inp[3],0x70($offload)
1152 vpxor @inp[3],$zero,@out[7]
1158 mov 16(%rsp),%rax # original %rsp
1160 #lea `40*8`($inp),$inp
1162 #jnz .Ldec8x_loop_grande
1167 $code.=<<___ if ($win64);
1168 movaps -0xd8(%rax),%xmm6
1169 movaps -0xc8(%rax),%xmm7
1170 movaps -0xb8(%rax),%xmm8
1171 movaps -0xa8(%rax),%xmm9
1172 movaps -0x98(%rax),%xmm10
1173 movaps -0x88(%rax),%xmm11
1174 movaps -0x78(%rax),%xmm12
1175 movaps -0x68(%rax),%xmm13
1176 movaps -0x58(%rax),%xmm14
1177 movaps -0x48(%rax),%xmm15
1189 .size aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
# Win64 structured-exception handler.  HandlerData carries two image-relative
# labels (end of prologue, start of epilogue).  When the faulting Rip lies
# between them, the saved stack pointer is pulled from the frame and the
# non-volatile registers are written back into the CONTEXT before
# RtlVirtualUnwind is called.
1194 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1195 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1202 .extern __imp_RtlVirtualUnwind
1203 .type se_handler,\@abi-omnipotent
1217 mov 120($context),%rax # pull context->Rax
1218 mov 248($context),%rbx # pull context->Rip
1220 mov 8($disp),%rsi # disp->ImageBase
1221 mov 56($disp),%r11 # disp->HandlerData
1223 mov 0(%r11),%r10d # HandlerData[0]
1224 lea (%rsi,%r10),%r10 # prologue label
1225 cmp %r10,%rbx # context->Rip<.Lprologue
1228 mov 152($context),%rax # pull context->Rsp
1230 mov 4(%r11),%r10d # HandlerData[1]
1231 lea (%rsi,%r10),%r10 # epilogue label
1232 cmp %r10,%rbx # context->Rip>=.Lepilogue
1235 mov 16(%rax),%rax # pull saved stack pointer
1243 mov %rbx,144($context) # restore context->Rbx
1244 mov %rbp,160($context) # restore context->Rbp
1245 mov %r12,216($context) # restore context->R12
1246 mov %r13,224($context) # restore context->R13
1247 mov %r14,232($context) # restore context->R14
1248 mov %r15,240($context) # restore context->R15
1250 lea -56-10*16(%rax),%rsi # start of the ten 16-byte XMM save slots (same -0xd8 offset the epilogues use)
1251 lea 512($context),%rdi # &context.Xmm6
1253 .long 0xa548f3fc # cld; rep movsq
1258 mov %rax,152($context) # restore context->Rsp
1259 mov %rsi,168($context) # restore context->Rsi
1260 mov %rdi,176($context) # restore context->Rdi
1262 mov 40($disp),%rdi # disp->ContextRecord
1263 mov $context,%rsi # context
1264 mov \$154,%ecx # sizeof(CONTEXT) in quadwords
1265 .long 0xa548f3fc # cld; rep movsq
1268 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1269 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1270 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1271 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1272 mov 40(%rsi),%r10 # disp->ContextRecord
1273 lea 56(%rsi),%r11 # &disp->HandlerData
1274 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1275 mov %r10,32(%rsp) # arg5
1276 mov %r11,40(%rsp) # arg6
1277 mov %r12,48(%rsp) # arg7
1278 mov %rcx,56(%rsp) # arg8, (NULL)
1279 call *__imp_RtlVirtualUnwind(%rip)
1281 mov \$1,%eax # ExceptionContinueSearch
1293 .size se_handler,.-se_handler
1297 .rva .LSEH_begin_aesni_multi_cbc_encrypt
1298 .rva .LSEH_end_aesni_multi_cbc_encrypt
1299 .rva .LSEH_info_aesni_multi_cbc_encrypt
1300 .rva .LSEH_begin_aesni_multi_cbc_decrypt
1301 .rva .LSEH_end_aesni_multi_cbc_decrypt
1302 .rva .LSEH_info_aesni_multi_cbc_decrypt
1304 $code.=<<___ if ($avx);
1305 .rva .LSEH_begin_aesni_multi_cbc_encrypt_avx
1306 .rva .LSEH_end_aesni_multi_cbc_encrypt_avx
1307 .rva .LSEH_info_aesni_multi_cbc_encrypt_avx
1308 .rva .LSEH_begin_aesni_multi_cbc_decrypt_avx
1309 .rva .LSEH_end_aesni_multi_cbc_decrypt_avx
1310 .rva .LSEH_info_aesni_multi_cbc_decrypt_avx
1315 .LSEH_info_aesni_multi_cbc_encrypt:
1318 .rva .Lenc4x_body,.Lenc4x_epilogue # HandlerData[]
1319 .LSEH_info_aesni_multi_cbc_decrypt:
1322 .rva .Ldec4x_body,.Ldec4x_epilogue # HandlerData[]
1324 $code.=<<___ if ($avx);
1325 .LSEH_info_aesni_multi_cbc_encrypt_avx:
1328 .rva .Lenc8x_body,.Lenc8x_epilogue # HandlerData[]
1329 .LSEH_info_aesni_multi_cbc_decrypt_avx:
1332 .rva .Ldec8x_body,.Ldec8x_epilogue # HandlerData[]
1335 ####################################################################
1338 local *opcode=shift; # alias @opcode to the array whose reference was passed in
1342 $rex|=0x04 if($dst>=8); # REX.R: destination register is xmm8 or higher
1343 $rex|=0x01 if($src>=8); # REX.B: source register is xmm8 or higher
1344 push @opcode,$rex|0x40 if($rex); # emit a REX prefix only when an extension bit is set
# Translate one AES-NI instruction, given as text in $line, into an explicit
# .byte sequence.  Three operand forms are recognised; for an unknown
# mnemonic in the second or third form undef is returned.
# Form 1: aeskeygenassist with an immediate and two XMM registers.
1351 if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1352 rex(\@opcode,$4,$3);
1353 push @opcode,0x0f,0x3a,0xdf;
1354 push @opcode,0xc0|($3&7)|(($4&7)<<3); # ModR/M
# presumably $c holds the captured immediate; a leading 0 is converted by oct() -- TODO confirm
1356 push @opcode,$c=~/^0/?oct($c):$c;
1357 return ".byte\t".join(',',@opcode);
# Form 2: register-to-register aesenc/aesenclast/aesdec/aesdeclast.
1359 elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
1362 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1363 "aesdec" => 0xde, "aesdeclast" => 0xdf
1365 return undef if (!defined($opcodelet{$1})); # mnemonic not in the table
1366 rex(\@opcode,$3,$2);
1367 push @opcode,0x0f,0x38,$opcodelet{$1};
1368 push @opcode,0xc0|($2&7)|(($3&7)<<3); # ModR/M
1369 return ".byte\t".join(',',@opcode);
# Form 3: the same four mnemonics with an offset(%rsp) memory source.
1371 elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
1373 "aesenc" => 0xdc, "aesenclast" => 0xdd,
1374 "aesdec" => 0xde, "aesdeclast" => 0xdf
1376 return undef if (!defined($opcodelet{$1})); # mnemonic not in the table
1378 push @opcode,0x44 if ($3>=8); # REX prefix with the R bit for xmm8 and above
1379 push @opcode,0x0f,0x38,$opcodelet{$1};
1380 push @opcode,0x44|(($3&7)<<3),0x24; # ModR/M with 8-bit displacement, then SIB selecting %rsp
# presumably $off holds the captured displacement; only its low byte is emitted -- TODO confirm
1381 push @opcode,($off=~/^0/?oct($off):$off)&0xff;
1382 return ".byte\t".join(',',@opcode);
# Replace every backtick-quoted expression in the generated text with the
# result of evaluating it as Perl.
1387 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# For each line, take the text from an aes-prefixed mnemonic through the last
# XMM register operand, pass it to aesni(), and replace that text together
# with the remainder of the line by the returned value.
1388 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;