Update copyright year
[openssl.git] / crypto / aes / asm / aesni-mb-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2020 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16
17 # Multi-buffer AES-NI procedures process several independent buffers
18 # in parallel by interleaving independent instructions.
19 #
20 # Cycles per byte for interleave factor 4:
21 #
22 #                       asymptotic      measured
23 #                       ---------------------------
24 # Westmere              5.00/4=1.25     5.13/4=1.28
25 # Atom                  15.0/4=3.75     ?15.7/4=3.93
26 # Sandy Bridge          5.06/4=1.27     5.18/4=1.29
27 # Ivy Bridge            5.06/4=1.27     5.14/4=1.29
28 # Haswell               4.44/4=1.11     4.44/4=1.11
29 # Bulldozer             5.75/4=1.44     5.76/4=1.44
30 #
31 # Cycles per byte for interleave factor 8 (not implemented for
32 # pre-AVX processors, where higher interleave factor incidentally
33 # doesn't result in improvement):
34 #
35 #                       asymptotic      measured
36 #                       ---------------------------
37 # Sandy Bridge          5.06/8=0.64     7.10/8=0.89(*)
38 # Ivy Bridge            5.06/8=0.64     7.14/8=0.89(*)
39 # Haswell               5.00/8=0.63     5.00/8=0.63
40 # Bulldozer             5.75/8=0.72     5.77/8=0.72
41 #
42 # (*)   Sandy/Ivy Bridge are known to handle high interleave factors
43 #       suboptimally.
44
45 # Command-line convention: [flavour] [output].  The last argument is taken as
46 # $output when it ends in an extension; the first as $flavour when it has no dot.
47 $output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;
48 $flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift(@ARGV) : undef;
49
50 $win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;
51
52 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;          # directory prefix of this script's path
53 $xlate="${dir}x86_64-xlate.pl";                 # first candidate: same directory
54 $xlate="${dir}../../perlasm/x86_64-xlate.pl" unless (-f $xlate);
55 die "can't locate x86_64-xlate.pl" unless (-f $xlate);
56 # Make the same two directories searchable by require.
57 push @INC, "${dir}", "${dir}../../perlasm";
58 require "x86_64-support.pl";
59 # NOTE(review): pointer_size() is not defined in this file; presumably provided by x86_64-support.pl.
60 $ptr_size=pointer_size($flavour);
61
62 $avx=0;         # toolchain capability level (0, 1 or 2); any non-zero value enables the AVX code guarded by if ($avx)
63 # Probe 1: GNU assembler reached through $ENV{CC}; one point each for version >= 2.19 and >= 2.22.
64 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
65                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
66         $avx = ($1>=2.19) + ($1>=2.22);
67 }
68 # Probe 2: NASM, tried only for Win64 targets and only while $avx is still 0.
69 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
70            `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
71         $avx = ($1>=2.09) + ($1>=2.10);
72 }
73 # Probe 3: MASM (ml64), tried only for Win64 targets and only while $avx is still 0; compares the major version.
74 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
75            `ml64 2>&1` =~ /Version ([0-9]+)\./) {
76         $avx = ($1>=10) + ($1>=11);
77 }
78 # Probe 4: clang/LLVM-based compiler, recognised from the version banner printed by "$ENV{CC} -v".
79 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
80         $avx = ($2>=3.0) + ($2>3.0);
81 }
82
83 open(OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"")
84     || die "can't call $xlate: $!";
85 *STDOUT=*OUT;   # STDOUT now feeds the translator started above
86
87 # Generated entry point, as seen from C:
88 #     void aesni_multi_cbc_encrypt(
89 #         struct { void *inp,*out; int blocks; double iv[2]; } inp[8],
90 #         const AES_KEY *key, int num);     /* num: 1 or 2 */
91 # Registers carrying the three arguments, in declaration order.
92 ($inp,$key,$num)=("%rdi","%rsi","%edx");
93
94 # Size of one inp[] element: two pointers, 8 bytes for the block count, 16 bytes of IV.
95 $inp_elm_size=2*$ptr_size+8+16;
96
97 # General-purpose registers for the four input and the four output pointers.
98 @inptr=map { "%r$_" } 8..11;
99 @outptr=map { "%r$_" } 12..15;
100
101 $rndkey0="%xmm0"; $rndkey1="%xmm1";
102 @out=map { "%xmm$_" } 2..5;
103 @inp=map { "%xmm$_" } 6..9;
104 ($counters,$mask,$zero)=map { "%xmm$_" } 10..12;
105
106 ($rounds,$one,$sink,$offset)=("%eax","%ecx","%rbp","%rbx");
107
108 $code.=<<___;   # text section, reference to OPENSSL_ia32cap_P, and the public aesni_multi_cbc_encrypt label
109 .text
110
111 .extern OPENSSL_ia32cap_P
112
113 .globl  aesni_multi_cbc_encrypt
114 .type   aesni_multi_cbc_encrypt,\@function,3
115 .align  32
116 aesni_multi_cbc_encrypt:
117 .cfi_startproc
118 ___
119 $code.=<<___ if ($avx);   # run-time dispatch, emitted only when $avx is set: num < 2 or a clear capability bit 28 stays on the non-AVX path
120         cmp     \$2,$num
121         jb      .Lenc_non_avx
122         mov     OPENSSL_ia32cap_P+4(%rip),%ecx
123         test    \$`1<<28`,%ecx                  # AVX bit
124         jnz     _avx_cbc_enc_shortcut
125         jmp     .Lenc_non_avx
126 .align  16
127 .Lenc_non_avx:
128 ___
129 $code.=<<___;   # keep the caller's %rsp in %rax, then push %rbx, %rbp and %r12-%r15
130         mov     %rsp,%rax
131 .cfi_def_cfa_register   %rax
132         push    %rbx
133 .cfi_push       %rbx
134         push    %rbp
135 .cfi_push       %rbp
136         push    %r12
137 .cfi_push       %r12
138         push    %r13
139 .cfi_push       %r13
140         push    %r14
141 .cfi_push       %r14
142         push    %r15
143 .cfi_push       %r15
144 ___
145 $code.=<<___ if ($win64);   # Win64 only: reserve 0xa8 bytes below the pushes and store %xmm6-%xmm15 there
146         lea     -0xa8(%rsp),%rsp
147         movaps  %xmm6,(%rsp)
148         movaps  %xmm7,0x10(%rsp)
149         movaps  %xmm8,0x20(%rsp)
150         movaps  %xmm9,0x30(%rsp)
151         movaps  %xmm10,0x40(%rsp)
152         movaps  %xmm11,0x50(%rsp)
153         movaps  %xmm12,0x60(%rsp)
154         movaps  %xmm13,-0x68(%rax)      # not used, saved to share se_handler
155         movaps  %xmm14,-0x58(%rax)
156         movaps  %xmm15,-0x48(%rax)
157 ___
158 $code.=<<___;   # 64-byte-aligned scratch frame; original %rsp kept at 16(%rsp); $key biased by 0x78 and $inp by two elements
159         # stack layout
160         #
161         # +0    output sink
162         # +16   input sink [original %rsp and $num]
163         # +32   counters
164
165         sub     \$48,%rsp
166         and     \$-64,%rsp
167         mov     %rax,16(%rsp)                   # original %rsp
168 .cfi_cfa_expression     %rsp+16,deref,+8
169
170 .Lenc4x_body:
171         movdqu  ($key),$zero                    # 0-round key
172         lea     0x78($key),$key                 # size optimization
173         lea     $inp_elm_size*2($inp),$inp
174
175 .Lenc4x_loop_grande:
176         mov     $num,24(%rsp)                   # original $num
177         xor     $num,$num
178 ___
179 for($i=0;$i<4;$i++) {   # one pass per lane: emit the loads for descriptor $i (offsets are relative to the biased $inp)
180     $inptr_reg=&pointer_register($flavour,@inptr[$i]);   # NOTE(review): pointer_register() is not defined in this file; presumably yields the pointer-width register name for this flavour - confirm
181     $outptr_reg=&pointer_register($flavour,@outptr[$i]);
182     $code.=<<___;
183         # borrow $one for number of blocks
184         mov     `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
185         mov     `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
186         cmp     $num,$one
187         mov     `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
188         cmovg   $one,$num                       # find maximum
189         test    $one,$one
190         # load IV
191         movdqu  `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@out[$i]
192         mov     $one,`32+4*$i`(%rsp)            # initialize counters
193         cmovle  %rsp,@inptr[$i]                 # cancel input
194 ___
195 }
196 $code.=<<___;   # skip when no lane has work; otherwise load keys and first blocks, then open .Loop_enc4x: first aesenc round interleaved with one input prefetch per lane (0..3)
197         test    $num,$num
198         jz      .Lenc4x_done
199
200         movups  0x10-0x78($key),$rndkey1
201          pxor   $zero,@out[0]
202         movups  0x20-0x78($key),$rndkey0
203          pxor   $zero,@out[1]
204         mov     0xf0-0x78($key),$rounds
205          pxor   $zero,@out[2]
206         movdqu  (@inptr[0]),@inp[0]             # load inputs
207          pxor   $zero,@out[3]
208         movdqu  (@inptr[1]),@inp[1]
209          pxor   @inp[0],@out[0]
210         movdqu  (@inptr[2]),@inp[2]
211          pxor   @inp[1],@out[1]
212         movdqu  (@inptr[3]),@inp[3]
213          pxor   @inp[2],@out[2]
214          pxor   @inp[3],@out[3]
215         movdqa  32(%rsp),$counters              # load counters
216         xor     $offset,$offset
217         jmp     .Loop_enc4x
218
219 .align  32
220 .Loop_enc4x:
221         add     \$16,$offset
222         lea     16(%rsp),$sink                  # sink pointer
223         mov     \$1,$one                        # constant of 1
224         sub     $offset,$sink
225
226         aesenc          $rndkey1,@out[0]
227         prefetcht0      31(@inptr[0],$offset)   # prefetch input
228         prefetcht0      31(@inptr[1],$offset)
229         aesenc          $rndkey1,@out[1]
230         prefetcht0      31(@inptr[2],$offset)
231         prefetcht0      31(@inptr[3],$offset)
232         aesenc          $rndkey1,@out[2]
233         aesenc          $rndkey1,@out[3]
234         movups          0x30-0x78($key),$rndkey1
235 ___
236 for($i=0;$i<4;$i++) {   # four more aesenc rounds; pass $i also compares lane $i's counter with 1 and redirects its pointers to the sink
237 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;   # odd $i works with $rndkey1, even $i with $rndkey0
238 $code.=<<___;
239          cmp            `32+4*$i`(%rsp),$one
240         aesenc          $rndkey,@out[0]
241         aesenc          $rndkey,@out[1]
242         aesenc          $rndkey,@out[2]
243          cmovge         $sink,@inptr[$i]        # cancel input
244          cmovg          $sink,@outptr[$i]       # sink output
245         aesenc          $rndkey,@out[3]
246         movups          `0x40+16*$i-0x78`($key),$rndkey
247 ___
248 }
249 $code.=<<___;   # remaining rounds (tail reached early or late depending on how $rounds compares with 11), counter update, last round, result stores and both loop back-edges
250          movdqa         $counters,$mask
251         aesenc          $rndkey0,@out[0]
252         prefetcht0      15(@outptr[0],$offset)  # prefetch output
253         prefetcht0      15(@outptr[1],$offset)
254         aesenc          $rndkey0,@out[1]
255         prefetcht0      15(@outptr[2],$offset)
256         prefetcht0      15(@outptr[3],$offset)
257         aesenc          $rndkey0,@out[2]
258         aesenc          $rndkey0,@out[3]
259         movups          0x80-0x78($key),$rndkey0
260          pxor           $zero,$zero
261
262         aesenc          $rndkey1,@out[0]
263          pcmpgtd        $zero,$mask
264          movdqu         -0x78($key),$zero       # reload 0-round key
265         aesenc          $rndkey1,@out[1]
266          paddd          $mask,$counters         # decrement counters
267          movdqa         $counters,32(%rsp)      # update counters
268         aesenc          $rndkey1,@out[2]
269         aesenc          $rndkey1,@out[3]
270         movups          0x90-0x78($key),$rndkey1
271
272         cmp     \$11,$rounds
273
274         aesenc          $rndkey0,@out[0]
275         aesenc          $rndkey0,@out[1]
276         aesenc          $rndkey0,@out[2]
277         aesenc          $rndkey0,@out[3]
278         movups          0xa0-0x78($key),$rndkey0
279
280         jb      .Lenc4x_tail
281
282         aesenc          $rndkey1,@out[0]
283         aesenc          $rndkey1,@out[1]
284         aesenc          $rndkey1,@out[2]
285         aesenc          $rndkey1,@out[3]
286         movups          0xb0-0x78($key),$rndkey1
287
288         aesenc          $rndkey0,@out[0]
289         aesenc          $rndkey0,@out[1]
290         aesenc          $rndkey0,@out[2]
291         aesenc          $rndkey0,@out[3]
292         movups          0xc0-0x78($key),$rndkey0
293
294         je      .Lenc4x_tail
295
296         aesenc          $rndkey1,@out[0]
297         aesenc          $rndkey1,@out[1]
298         aesenc          $rndkey1,@out[2]
299         aesenc          $rndkey1,@out[3]
300         movups          0xd0-0x78($key),$rndkey1
301
302         aesenc          $rndkey0,@out[0]
303         aesenc          $rndkey0,@out[1]
304         aesenc          $rndkey0,@out[2]
305         aesenc          $rndkey0,@out[3]
306         movups          0xe0-0x78($key),$rndkey0
307         jmp     .Lenc4x_tail
308
309 .align  32
310 .Lenc4x_tail:
311         aesenc          $rndkey1,@out[0]
312         aesenc          $rndkey1,@out[1]
313         aesenc          $rndkey1,@out[2]
314         aesenc          $rndkey1,@out[3]
315          movdqu         (@inptr[0],$offset),@inp[0]
316         movdqu          0x10-0x78($key),$rndkey1
317
318         aesenclast      $rndkey0,@out[0]
319          movdqu         (@inptr[1],$offset),@inp[1]
320          pxor           $zero,@inp[0]
321         aesenclast      $rndkey0,@out[1]
322          movdqu         (@inptr[2],$offset),@inp[2]
323          pxor           $zero,@inp[1]
324         aesenclast      $rndkey0,@out[2]
325          movdqu         (@inptr[3],$offset),@inp[3]
326          pxor           $zero,@inp[2]
327         aesenclast      $rndkey0,@out[3]
328         movdqu          0x20-0x78($key),$rndkey0
329          pxor           $zero,@inp[3]
330
331         movups          @out[0],-16(@outptr[0],$offset)
332          pxor           @inp[0],@out[0]
333         movups          @out[1],-16(@outptr[1],$offset)
334          pxor           @inp[1],@out[1]
335         movups          @out[2],-16(@outptr[2],$offset)
336          pxor           @inp[2],@out[2]
337         movups          @out[3],-16(@outptr[3],$offset)
338          pxor           @inp[3],@out[3]
339
340         dec     $num
341         jnz     .Loop_enc4x
342
343         mov     16(%rsp),%rax                   # original %rsp
344 .cfi_def_cfa    %rax,8
345         mov     24(%rsp),$num
346
347         #pxor   @inp[0],@out[0]
348         #pxor   @inp[1],@out[1]
349         # output iv FIX ME!
350         #movdqu @out[0],`$inp_elm_size*0+2*$ptr_size+8-$inp_elm_size*2`($inp)
351         #pxor   @inp[2],@out[2]
352         #movdqu @out[1],`$inp_elm_size*1+2*$ptr_size+8-$inp_elm_size*2`($inp)
353         #pxor   @inp[3],@out[3]
354         #movdqu @out[2],`$inp_elm_size*2+2*$ptr_size+8-$inp_elm_size*2`($inp)   # won't fix, let caller
355         #movdqu @out[3],`$inp_elm_size*3+2*$ptr_size+8-$inp_elm_size*2`($inp)   # figure this out...
356
357         lea     `$inp_elm_size*4`($inp),$inp
358         dec     $num
359         jnz     .Lenc4x_loop_grande
360
361 .Lenc4x_done:
362 ___
363 $code.=<<___ if ($win64);   # Win64 only: reload %xmm6-%xmm12; the %xmm13-%xmm15 reloads are commented out in the emitted text
364         movaps  -0xd8(%rax),%xmm6
365         movaps  -0xc8(%rax),%xmm7
366         movaps  -0xb8(%rax),%xmm8
367         movaps  -0xa8(%rax),%xmm9
368         movaps  -0x98(%rax),%xmm10
369         movaps  -0x88(%rax),%xmm11
370         movaps  -0x78(%rax),%xmm12
371         #movaps -0x68(%rax),%xmm13
372         #movaps -0x58(%rax),%xmm14
373         #movaps -0x48(%rax),%xmm15
374 ___
375 $code.=<<___;   # reload the pushed registers relative to %rax, restore %rsp and return; the same template then opens aesni_multi_cbc_decrypt
376         mov     -48(%rax),%r15
377 .cfi_restore    %r15
378         mov     -40(%rax),%r14
379 .cfi_restore    %r14
380         mov     -32(%rax),%r13
381 .cfi_restore    %r13
382         mov     -24(%rax),%r12
383 .cfi_restore    %r12
384         mov     -16(%rax),%rbp
385 .cfi_restore    %rbp
386         mov     -8(%rax),%rbx
387 .cfi_restore    %rbx
388         lea     (%rax),%rsp
389 .cfi_def_cfa_register   %rsp
390 .Lenc4x_epilogue:
391         ret
392 .cfi_endproc
393 .size   aesni_multi_cbc_encrypt,.-aesni_multi_cbc_encrypt
394
395 .globl  aesni_multi_cbc_decrypt
396 .type   aesni_multi_cbc_decrypt,\@function,3
397 .align  32
398 aesni_multi_cbc_decrypt:
399 .cfi_startproc
400 ___
401 $code.=<<___ if ($avx);   # run-time dispatch, emitted only when $avx is set: num < 2 or a clear capability bit 28 stays on the non-AVX path
402         cmp     \$2,$num
403         jb      .Ldec_non_avx
404         mov     OPENSSL_ia32cap_P+4(%rip),%ecx
405         test    \$`1<<28`,%ecx                  # AVX bit
406         jnz     _avx_cbc_dec_shortcut
407         jmp     .Ldec_non_avx
408 .align  16
409 .Ldec_non_avx:
410 ___
411 $code.=<<___;   # keep the caller's %rsp in %rax, then push %rbx, %rbp and %r12-%r15
412         mov     %rsp,%rax
413 .cfi_def_cfa_register   %rax
414         push    %rbx
415 .cfi_push       %rbx
416         push    %rbp
417 .cfi_push       %rbp
418         push    %r12
419 .cfi_push       %r12
420         push    %r13
421 .cfi_push       %r13
422         push    %r14
423 .cfi_push       %r14
424         push    %r15
425 .cfi_push       %r15
426 ___
427 $code.=<<___ if ($win64);   # Win64 only: reserve 0xa8 bytes below the pushes and store %xmm6-%xmm15 there
428         lea     -0xa8(%rsp),%rsp
429         movaps  %xmm6,(%rsp)
430         movaps  %xmm7,0x10(%rsp)
431         movaps  %xmm8,0x20(%rsp)
432         movaps  %xmm9,0x30(%rsp)
433         movaps  %xmm10,0x40(%rsp)
434         movaps  %xmm11,0x50(%rsp)
435         movaps  %xmm12,0x60(%rsp)
436         movaps  %xmm13,-0x68(%rax)      # not used, saved to share se_handler
437         movaps  %xmm14,-0x58(%rax)
438         movaps  %xmm15,-0x48(%rax)
439 ___
440 $code.=<<___;   # 64-byte-aligned scratch frame; original %rsp kept at 16(%rsp); $key biased by 0x78 and $inp by two elements
441         # stack layout
442         #
443         # +0    output sink
444         # +16   input sink [original %rsp and $num]
445         # +32   counters
446
447         sub     \$48,%rsp
448         and     \$-64,%rsp
449         mov     %rax,16(%rsp)                   # original %rsp
450 .cfi_cfa_expression     %rsp+16,deref,+8
451
452 .Ldec4x_body:
453         movdqu  ($key),$zero                    # 0-round key
454         lea     0x78($key),$key                 # size optimization
455         lea     $inp_elm_size*2($inp),$inp
456
457 .Ldec4x_loop_grande:
458         mov     $num,24(%rsp)                   # original $num
459         xor     $num,$num
460 ___
461 for($i=0;$i<4;$i++) {   # one pass per lane: emit the loads for descriptor $i; unlike the encrypt loop, the IV goes into @inp[$i]
462     $inptr_reg=&pointer_register($flavour,@inptr[$i]);   # NOTE(review): pointer_register() is not defined in this file; presumably yields the pointer-width register name for this flavour - confirm
463     $outptr_reg=&pointer_register($flavour,@outptr[$i]);
464     $code.=<<___;
465         # borrow $one for number of blocks
466         mov     `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*2`($inp),$one
467         mov     `$inp_elm_size*$i+0-$inp_elm_size*2`($inp),$inptr_reg
468         cmp     $num,$one
469         mov     `$inp_elm_size*$i+$ptr_size-$inp_elm_size*2`($inp),$outptr_reg
470         cmovg   $one,$num                       # find maximum
471         test    $one,$one
472         # load IV
473         movdqu  `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*2`($inp),@inp[$i]
474         mov     $one,`32+4*$i`(%rsp)            # initialize counters
475         cmovle  %rsp,@inptr[$i]                 # cancel input
476 ___
477 }
478 $code.=<<___;   # skip when no lane has work; otherwise load keys and first blocks (xored with the 0-round key), then open .Loop_dec4x: first aesdec round interleaved with one input prefetch per lane
479         test    $num,$num
480         jz      .Ldec4x_done
481
482         movups  0x10-0x78($key),$rndkey1
483         movups  0x20-0x78($key),$rndkey0
484         mov     0xf0-0x78($key),$rounds
485         movdqu  (@inptr[0]),@out[0]             # load inputs
486         movdqu  (@inptr[1]),@out[1]
487          pxor   $zero,@out[0]
488         movdqu  (@inptr[2]),@out[2]
489          pxor   $zero,@out[1]
490         movdqu  (@inptr[3]),@out[3]
491          pxor   $zero,@out[2]
492          pxor   $zero,@out[3]
493         movdqa  32(%rsp),$counters              # load counters
494         xor     $offset,$offset
495         jmp     .Loop_dec4x
496
497 .align  32
498 .Loop_dec4x:
499         add     \$16,$offset
500         lea     16(%rsp),$sink                  # sink pointer
501         mov     \$1,$one                        # constant of 1
502         sub     $offset,$sink
503
504         aesdec          $rndkey1,@out[0]
505         prefetcht0      31(@inptr[0],$offset)   # prefetch input
506         prefetcht0      31(@inptr[1],$offset)
507         aesdec          $rndkey1,@out[1]
508         prefetcht0      31(@inptr[2],$offset)
509         prefetcht0      31(@inptr[3],$offset)
510         aesdec          $rndkey1,@out[2]
511         aesdec          $rndkey1,@out[3]
512         movups          0x30-0x78($key),$rndkey1
513 ___
514 for($i=0;$i<4;$i++) {   # four more aesdec rounds; pass $i also compares lane $i's counter with 1 and redirects its pointers to the sink
515 my $rndkey = ($i&1) ? $rndkey1 : $rndkey0;   # odd $i works with $rndkey1, even $i with $rndkey0
516 $code.=<<___;
517          cmp            `32+4*$i`(%rsp),$one
518         aesdec          $rndkey,@out[0]
519         aesdec          $rndkey,@out[1]
520         aesdec          $rndkey,@out[2]
521          cmovge         $sink,@inptr[$i]        # cancel input
522          cmovg          $sink,@outptr[$i]       # sink output
523         aesdec          $rndkey,@out[3]
524         movups          `0x40+16*$i-0x78`($key),$rndkey
525 ___
526 }
527 $code.=<<___;   # remaining rounds (tail reached early or late depending on how $rounds compares with 11), counter update, last round against the xored IVs, result stores, next-block loads and both loop back-edges
528          movdqa         $counters,$mask
529         aesdec          $rndkey0,@out[0]
530         prefetcht0      15(@outptr[0],$offset)  # prefetch output
531         prefetcht0      15(@outptr[1],$offset)
532         aesdec          $rndkey0,@out[1]
533         prefetcht0      15(@outptr[2],$offset)
534         prefetcht0      15(@outptr[3],$offset)
535         aesdec          $rndkey0,@out[2]
536         aesdec          $rndkey0,@out[3]
537         movups          0x80-0x78($key),$rndkey0
538          pxor           $zero,$zero
539
540         aesdec          $rndkey1,@out[0]
541          pcmpgtd        $zero,$mask
542          movdqu         -0x78($key),$zero       # reload 0-round key
543         aesdec          $rndkey1,@out[1]
544          paddd          $mask,$counters         # decrement counters
545          movdqa         $counters,32(%rsp)      # update counters
546         aesdec          $rndkey1,@out[2]
547         aesdec          $rndkey1,@out[3]
548         movups          0x90-0x78($key),$rndkey1
549
550         cmp     \$11,$rounds
551
552         aesdec          $rndkey0,@out[0]
553         aesdec          $rndkey0,@out[1]
554         aesdec          $rndkey0,@out[2]
555         aesdec          $rndkey0,@out[3]
556         movups          0xa0-0x78($key),$rndkey0
557
558         jb      .Ldec4x_tail
559
560         aesdec          $rndkey1,@out[0]
561         aesdec          $rndkey1,@out[1]
562         aesdec          $rndkey1,@out[2]
563         aesdec          $rndkey1,@out[3]
564         movups          0xb0-0x78($key),$rndkey1
565
566         aesdec          $rndkey0,@out[0]
567         aesdec          $rndkey0,@out[1]
568         aesdec          $rndkey0,@out[2]
569         aesdec          $rndkey0,@out[3]
570         movups          0xc0-0x78($key),$rndkey0
571
572         je      .Ldec4x_tail
573
574         aesdec          $rndkey1,@out[0]
575         aesdec          $rndkey1,@out[1]
576         aesdec          $rndkey1,@out[2]
577         aesdec          $rndkey1,@out[3]
578         movups          0xd0-0x78($key),$rndkey1
579
580         aesdec          $rndkey0,@out[0]
581         aesdec          $rndkey0,@out[1]
582         aesdec          $rndkey0,@out[2]
583         aesdec          $rndkey0,@out[3]
584         movups          0xe0-0x78($key),$rndkey0
585         jmp     .Ldec4x_tail
586
587 .align  32
588 .Ldec4x_tail:
589         aesdec          $rndkey1,@out[0]
590         aesdec          $rndkey1,@out[1]
591         aesdec          $rndkey1,@out[2]
592          pxor           $rndkey0,@inp[0]
593          pxor           $rndkey0,@inp[1]
594         aesdec          $rndkey1,@out[3]
595         movdqu          0x10-0x78($key),$rndkey1
596          pxor           $rndkey0,@inp[2]
597          pxor           $rndkey0,@inp[3]
598         movdqu          0x20-0x78($key),$rndkey0
599
600         aesdeclast      @inp[0],@out[0]
601         aesdeclast      @inp[1],@out[1]
602          movdqu         -16(@inptr[0],$offset),@inp[0]  # load next IV
603          movdqu         -16(@inptr[1],$offset),@inp[1]
604         aesdeclast      @inp[2],@out[2]
605         aesdeclast      @inp[3],@out[3]
606          movdqu         -16(@inptr[2],$offset),@inp[2]
607          movdqu         -16(@inptr[3],$offset),@inp[3]
608
609         movups          @out[0],-16(@outptr[0],$offset)
610          movdqu         (@inptr[0],$offset),@out[0]
611         movups          @out[1],-16(@outptr[1],$offset)
612          movdqu         (@inptr[1],$offset),@out[1]
613          pxor           $zero,@out[0]
614         movups          @out[2],-16(@outptr[2],$offset)
615          movdqu         (@inptr[2],$offset),@out[2]
616          pxor           $zero,@out[1]
617         movups          @out[3],-16(@outptr[3],$offset)
618          movdqu         (@inptr[3],$offset),@out[3]
619          pxor           $zero,@out[2]
620          pxor           $zero,@out[3]
621
622         dec     $num
623         jnz     .Loop_dec4x
624
625         mov     16(%rsp),%rax                   # original %rsp
626 .cfi_def_cfa    %rax,8
627         mov     24(%rsp),$num
628
629         lea     `$inp_elm_size*4`($inp),$inp
630         dec     $num
631         jnz     .Ldec4x_loop_grande
632
633 .Ldec4x_done:
634 ___
635 $code.=<<___ if ($win64);   # Win64 only: reload %xmm6-%xmm12; the %xmm13-%xmm15 reloads are commented out in the emitted text
636         movaps  -0xd8(%rax),%xmm6
637         movaps  -0xc8(%rax),%xmm7
638         movaps  -0xb8(%rax),%xmm8
639         movaps  -0xa8(%rax),%xmm9
640         movaps  -0x98(%rax),%xmm10
641         movaps  -0x88(%rax),%xmm11
642         movaps  -0x78(%rax),%xmm12
643         #movaps -0x68(%rax),%xmm13
644         #movaps -0x58(%rax),%xmm14
645         #movaps -0x48(%rax),%xmm15
646 ___
647 $code.=<<___;   # reload the pushed registers relative to %rax, restore %rsp, return, and close aesni_multi_cbc_decrypt
648         mov     -48(%rax),%r15
649 .cfi_restore    %r15
650         mov     -40(%rax),%r14
651 .cfi_restore    %r14
652         mov     -32(%rax),%r13
653 .cfi_restore    %r13
654         mov     -24(%rax),%r12
655 .cfi_restore    %r12
656         mov     -16(%rax),%rbp
657 .cfi_restore    %rbp
658         mov     -8(%rax),%rbx
659 .cfi_restore    %rbx
660         lea     (%rax),%rsp
661 .cfi_def_cfa_register   %rsp
662 .Ldec4x_epilogue:
663         ret
664 .cfi_endproc
665 .size   aesni_multi_cbc_decrypt,.-aesni_multi_cbc_decrypt
666 ___
667
668                                                 if ($avx) {{{
669 my @ptr=map("%r$_",(8..15));
670 my $offload=$sink;
671
672 my @out=map("%xmm$_",(2..9));
673 my @inp=map("%xmm$_",(10..13));
674 my ($counters,$zero)=("%xmm14","%xmm15");
675
676 $code.=<<___;
677 .type   aesni_multi_cbc_encrypt_avx,\@function,3
678 .align  32
679 aesni_multi_cbc_encrypt_avx:
680 .cfi_startproc
681 _avx_cbc_enc_shortcut:
682         mov     %rsp,%rax
683 .cfi_def_cfa_register   %rax
684         push    %rbx
685 .cfi_push       %rbx
686         push    %rbp
687 .cfi_push       %rbp
688         push    %r12
689 .cfi_push       %r12
690         push    %r13
691 .cfi_push       %r13
692         push    %r14
693 .cfi_push       %r14
694         push    %r15
695 .cfi_push       %r15
696 ___
697 $code.=<<___ if ($win64);
698         lea     -0xa8(%rsp),%rsp
699         movaps  %xmm6,(%rsp)
700         movaps  %xmm7,0x10(%rsp)
701         movaps  %xmm8,0x20(%rsp)
702         movaps  %xmm9,0x30(%rsp)
703         movaps  %xmm10,0x40(%rsp)
704         movaps  %xmm11,0x50(%rsp)
705         movaps  %xmm12,-0x78(%rax)
706         movaps  %xmm13,-0x68(%rax)
707         movaps  %xmm14,-0x58(%rax)
708         movaps  %xmm15,-0x48(%rax)
709 ___
710 $code.=<<___;
711         # stack layout
712         #
713         # +0    output sink
714         # +16   input sink [original %rsp and $num]
715         # +32   counters
716         # +64   distances between inputs and outputs
717         # +128  off-load area for @inp[0..3]
718
719         sub     \$192,%rsp
720         and     \$-128,%rsp
721         mov     %rax,16(%rsp)                   # original %rsp
722 .cfi_cfa_expression     %rsp+16,deref,+8
723
724 .Lenc8x_body:
725         vzeroupper
726         vmovdqu ($key),$zero                    # 0-round key
727         lea     0x78($key),$key                 # size optimization
728         lea     `$inp_elm_size*4`($inp),$inp
729         shr     \$1,$num
730
731 .Lenc8x_loop_grande:
732         #mov    $num,24(%rsp)                   # original $num
733         xor     $num,$num
734 ___
735 for($i=0;$i<8;$i++) {
736   my $temp = $i ? $offload : $offset;
737     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
738     $temp_reg=&pointer_register($flavour,$temp);
739     $code.=<<___;
740         # borrow $one for number of blocks
741         mov     `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
742         # input pointer
743         mov     `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
744         cmp     $num,$one
745         # output pointer
746         mov     `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
747         cmovg   $one,$num                       # find maximum
748         test    $one,$one
749         # load IV
750         vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
751         mov     $one,`32+4*$i`(%rsp)            # initialize counters
752         cmovle  %rsp,@ptr[$i]                   # cancel input
753         sub     @ptr[$i],$temp                  # distance between input and output
754         mov     $temp,`64+8*$i`(%rsp)           # initialize distances
755 ___
756 }
757 $code.=<<___;
758         test    $num,$num
759         jz      .Lenc8x_done
760
761         vmovups 0x10-0x78($key),$rndkey1
762         vmovups 0x20-0x78($key),$rndkey0
763         mov     0xf0-0x78($key),$rounds
764
765         vpxor   (@ptr[0]),$zero,@inp[0]         # load inputs and xor with 0-round
766          lea    128(%rsp),$offload              # offload area
767         vpxor   (@ptr[1]),$zero,@inp[1]
768         vpxor   (@ptr[2]),$zero,@inp[2]
769         vpxor   (@ptr[3]),$zero,@inp[3]
770          vpxor  @inp[0],@out[0],@out[0]
771         vpxor   (@ptr[4]),$zero,@inp[0]
772          vpxor  @inp[1],@out[1],@out[1]
773         vpxor   (@ptr[5]),$zero,@inp[1]
774          vpxor  @inp[2],@out[2],@out[2]
775         vpxor   (@ptr[6]),$zero,@inp[2]
776          vpxor  @inp[3],@out[3],@out[3]
777         vpxor   (@ptr[7]),$zero,@inp[3]
778          vpxor  @inp[0],@out[4],@out[4]
779         mov     \$1,$one                        # constant of 1
780          vpxor  @inp[1],@out[5],@out[5]
781          vpxor  @inp[2],@out[6],@out[6]
782          vpxor  @inp[3],@out[7],@out[7]
783         jmp     .Loop_enc8x
784
785 .align  32
786 .Loop_enc8x:
787 ___
788 for($i=0;$i<8;$i++) {
789 my $rndkey=($i&1)?$rndkey0:$rndkey1;
790 $code.=<<___;
791         vaesenc         $rndkey,@out[0],@out[0]
792          cmp            32+4*$i(%rsp),$one
793 ___
794 $code.=<<___ if ($i);
795          mov            64+8*$i(%rsp),$offset
796 ___
797 $code.=<<___;
798         vaesenc         $rndkey,@out[1],@out[1]
799         prefetcht0      31(@ptr[$i])                    # prefetch input
800         vaesenc         $rndkey,@out[2],@out[2]
801 ___
802 $code.=<<___ if ($i>1);
803         prefetcht0      15(@ptr[$i-2])                  # prefetch output
804 ___
805 $code.=<<___;
806         vaesenc         $rndkey,@out[3],@out[3]
807          lea            (@ptr[$i],$offset),$offset
808          cmovge         %rsp,@ptr[$i]                   # cancel input
809         vaesenc         $rndkey,@out[4],@out[4]
810          cmovg          %rsp,$offset                    # sink output
811         vaesenc         $rndkey,@out[5],@out[5]
812          sub            @ptr[$i],$offset
813         vaesenc         $rndkey,@out[6],@out[6]
814          vpxor          16(@ptr[$i]),$zero,@inp[$i%4]   # load input and xor with 0-round
815          mov            $offset,64+8*$i(%rsp)
816         vaesenc         $rndkey,@out[7],@out[7]
817         vmovups         `16*(3+$i)-0x78`($key),$rndkey
818          lea            16(@ptr[$i],$offset),@ptr[$i]   # switch to output
819 ___
820 $code.=<<___ if ($i<4)
821          vmovdqu        @inp[$i%4],`16*$i`($offload)    # off-load
822 ___
823 }
824 $code.=<<___;
825          vmovdqu        32(%rsp),$counters
826         prefetcht0      15(@ptr[$i-2])                  # prefetch output
827         prefetcht0      15(@ptr[$i-1])
828         cmp     \$11,$rounds
829         jb      .Lenc8x_tail
830
831         vaesenc         $rndkey1,@out[0],@out[0]
832         vaesenc         $rndkey1,@out[1],@out[1]
833         vaesenc         $rndkey1,@out[2],@out[2]
834         vaesenc         $rndkey1,@out[3],@out[3]
835         vaesenc         $rndkey1,@out[4],@out[4]
836         vaesenc         $rndkey1,@out[5],@out[5]
837         vaesenc         $rndkey1,@out[6],@out[6]
838         vaesenc         $rndkey1,@out[7],@out[7]
839         vmovups         0xb0-0x78($key),$rndkey1
840
841         vaesenc         $rndkey0,@out[0],@out[0]
842         vaesenc         $rndkey0,@out[1],@out[1]
843         vaesenc         $rndkey0,@out[2],@out[2]
844         vaesenc         $rndkey0,@out[3],@out[3]
845         vaesenc         $rndkey0,@out[4],@out[4]
846         vaesenc         $rndkey0,@out[5],@out[5]
847         vaesenc         $rndkey0,@out[6],@out[6]
848         vaesenc         $rndkey0,@out[7],@out[7]
849         vmovups         0xc0-0x78($key),$rndkey0
850         je      .Lenc8x_tail
851
852         vaesenc         $rndkey1,@out[0],@out[0]
853         vaesenc         $rndkey1,@out[1],@out[1]
854         vaesenc         $rndkey1,@out[2],@out[2]
855         vaesenc         $rndkey1,@out[3],@out[3]
856         vaesenc         $rndkey1,@out[4],@out[4]
857         vaesenc         $rndkey1,@out[5],@out[5]
858         vaesenc         $rndkey1,@out[6],@out[6]
859         vaesenc         $rndkey1,@out[7],@out[7]
860         vmovups         0xd0-0x78($key),$rndkey1
861
862         vaesenc         $rndkey0,@out[0],@out[0]
863         vaesenc         $rndkey0,@out[1],@out[1]
864         vaesenc         $rndkey0,@out[2],@out[2]
865         vaesenc         $rndkey0,@out[3],@out[3]
866         vaesenc         $rndkey0,@out[4],@out[4]
867         vaesenc         $rndkey0,@out[5],@out[5]
868         vaesenc         $rndkey0,@out[6],@out[6]
869         vaesenc         $rndkey0,@out[7],@out[7]
870         vmovups         0xe0-0x78($key),$rndkey0
871
872 .Lenc8x_tail:
873         vaesenc         $rndkey1,@out[0],@out[0]
874          vpxor          $zero,$zero,$zero
875         vaesenc         $rndkey1,@out[1],@out[1]
876         vaesenc         $rndkey1,@out[2],@out[2]
877          vpcmpgtd       $zero,$counters,$zero
878         vaesenc         $rndkey1,@out[3],@out[3]
879         vaesenc         $rndkey1,@out[4],@out[4]
880          vpaddd         $counters,$zero,$zero           # decrement counters
881          vmovdqu        48(%rsp),$counters
882         vaesenc         $rndkey1,@out[5],@out[5]
883          mov            64(%rsp),$offset                # pre-load 1st offset
884         vaesenc         $rndkey1,@out[6],@out[6]
885         vaesenc         $rndkey1,@out[7],@out[7]
886         vmovups         0x10-0x78($key),$rndkey1
887
888         vaesenclast     $rndkey0,@out[0],@out[0]
889          vmovdqa        $zero,32(%rsp)                  # update counters
890          vpxor          $zero,$zero,$zero
891         vaesenclast     $rndkey0,@out[1],@out[1]
892         vaesenclast     $rndkey0,@out[2],@out[2]
893          vpcmpgtd       $zero,$counters,$zero
894         vaesenclast     $rndkey0,@out[3],@out[3]
895         vaesenclast     $rndkey0,@out[4],@out[4]
896          vpaddd         $zero,$counters,$counters       # decrement counters
897          vmovdqu        -0x78($key),$zero               # 0-round
898         vaesenclast     $rndkey0,@out[5],@out[5]
899         vaesenclast     $rndkey0,@out[6],@out[6]
900          vmovdqa        $counters,48(%rsp)              # update counters
901         vaesenclast     $rndkey0,@out[7],@out[7]
902         vmovups         0x20-0x78($key),$rndkey0
903
904         vmovups         @out[0],-16(@ptr[0])            # write output
905          sub            $offset,@ptr[0]                 # switch to input
906          vpxor          0x00($offload),@out[0],@out[0]
907         vmovups         @out[1],-16(@ptr[1])
908          sub            `64+1*8`(%rsp),@ptr[1]
909          vpxor          0x10($offload),@out[1],@out[1]
910         vmovups         @out[2],-16(@ptr[2])
911          sub            `64+2*8`(%rsp),@ptr[2]
912          vpxor          0x20($offload),@out[2],@out[2]
913         vmovups         @out[3],-16(@ptr[3])
914          sub            `64+3*8`(%rsp),@ptr[3]
915          vpxor          0x30($offload),@out[3],@out[3]
916         vmovups         @out[4],-16(@ptr[4])
917          sub            `64+4*8`(%rsp),@ptr[4]
918          vpxor          @inp[0],@out[4],@out[4]
919         vmovups         @out[5],-16(@ptr[5])
920          sub            `64+5*8`(%rsp),@ptr[5]
921          vpxor          @inp[1],@out[5],@out[5]
922         vmovups         @out[6],-16(@ptr[6])
923          sub            `64+6*8`(%rsp),@ptr[6]
924          vpxor          @inp[2],@out[6],@out[6]
925         vmovups         @out[7],-16(@ptr[7])
926          sub            `64+7*8`(%rsp),@ptr[7]
927          vpxor          @inp[3],@out[7],@out[7]
928
929         dec     $num
930         jnz     .Loop_enc8x
931
932         mov     16(%rsp),%rax                   # original %rsp
933 .cfi_def_cfa    %rax,8
934         #mov    24(%rsp),$num
935         #lea    `$inp_elm_size*8`($inp),$inp
936         #dec    $num
937         #jnz    .Lenc8x_loop_grande
938
939 .Lenc8x_done:
940         vzeroupper
941 ___
942 $code.=<<___ if ($win64);
943         movaps  -0xd8(%rax),%xmm6
944         movaps  -0xc8(%rax),%xmm7
945         movaps  -0xb8(%rax),%xmm8
946         movaps  -0xa8(%rax),%xmm9
947         movaps  -0x98(%rax),%xmm10
948         movaps  -0x88(%rax),%xmm11
949         movaps  -0x78(%rax),%xmm12
950         movaps  -0x68(%rax),%xmm13
951         movaps  -0x58(%rax),%xmm14
952         movaps  -0x48(%rax),%xmm15
953 ___
954 $code.=<<___;
955         mov     -48(%rax),%r15
956 .cfi_restore    %r15
957         mov     -40(%rax),%r14
958 .cfi_restore    %r14
959         mov     -32(%rax),%r13
960 .cfi_restore    %r13
961         mov     -24(%rax),%r12
962 .cfi_restore    %r12
963         mov     -16(%rax),%rbp
964 .cfi_restore    %rbp
965         mov     -8(%rax),%rbx
966 .cfi_restore    %rbx
967         lea     (%rax),%rsp
968 .cfi_def_cfa_register   %rsp
969 .Lenc8x_epilogue:
970         ret
971 .cfi_endproc
972 .size   aesni_multi_cbc_encrypt_avx,.-aesni_multi_cbc_encrypt_avx
973
974 .type   aesni_multi_cbc_decrypt_avx,\@function,3
975 .align  32
976 aesni_multi_cbc_decrypt_avx:
977 .cfi_startproc
978 _avx_cbc_dec_shortcut:
979         mov     %rsp,%rax
980 .cfi_def_cfa_register   %rax
981         push    %rbx
982 .cfi_push       %rbx
983         push    %rbp
984 .cfi_push       %rbp
985         push    %r12
986 .cfi_push       %r12
987         push    %r13
988 .cfi_push       %r13
989         push    %r14
990 .cfi_push       %r14
991         push    %r15
992 .cfi_push       %r15
993 ___
# The heredoc above and the ones that follow generate
# aesni_multi_cbc_decrypt_avx, the 8-lane AVX flavour of multi-buffer
# CBC decryption.
#
# Win64 only: after the general-purpose registers have been pushed, the
# prologue also stores %xmm6-%xmm15 (%rax still holds the %rsp value seen
# on entry).
994 $code.=<<___ if ($win64);
995         lea     -0xa8(%rsp),%rsp
996         movaps  %xmm6,(%rsp)
997         movaps  %xmm7,0x10(%rsp)
998         movaps  %xmm8,0x20(%rsp)
999         movaps  %xmm9,0x30(%rsp)
1000         movaps  %xmm10,0x40(%rsp)
1001         movaps  %xmm11,0x50(%rsp)
1002         movaps  %xmm12,-0x78(%rax)
1003         movaps  %xmm13,-0x68(%rax)
1004         movaps  %xmm14,-0x58(%rax)
1005         movaps  %xmm15,-0x48(%rax)
1006 ___
1007 $code.=<<___;
1008         # stack layout
1009         #
1010         # +0    output sink
1011         # +16   input sink [original %rsp and $num]
1012         # +32   counters
1013         # +64   distances between inputs and outputs
1014         # +128  off-load area for @inp[0..3]
1015         # +192  IV/input offload
1016
1017         sub     \$256,%rsp
1018         and     \$-256,%rsp
1019         sub     \$192,%rsp
1020         mov     %rax,16(%rsp)                   # original %rsp
1021 .cfi_cfa_expression     %rsp+16,deref,+8
1022
1023 .Ldec8x_body:
1024         vzeroupper
1025         vmovdqu ($key),$zero                    # 0-round key
1026         lea     0x78($key),$key                 # size optimization
1027         lea     `$inp_elm_size*4`($inp),$inp
1028         shr     \$1,$num
1029
1030 .Ldec8x_loop_grande:
1031         #mov    $num,24(%rsp)                   # original $num
1032         xor     $num,$num
1033 ___
# Emit the per-lane set-up once for each of the 8 lanes.  The code reads the
# lane's block count, input pointer, output pointer and IV from the $i-th
# $inp_elm_size-byte element at $inp (offsets are biased by -$inp_elm_size*4
# because $inp was advanced by that amount above), keeps the largest block
# count in $num, and records the counter, the input-to-output distance and
# the IV in their stack slots.
1034 for($i=0;$i<8;$i++) {
  # Lane 0 uses $offset as the scratch register for the output pointer,
  # every other lane uses $offload.
1035   my $temp = $i ? $offload : $offset;
    # NOTE(review): pointer_register() is defined outside this chunk;
    # presumably it returns the register name that matches the pointer
    # width selected by $flavour -- confirm against its definition.
1036     $ptr_reg=&pointer_register($flavour,@ptr[$i]);
1037     $temp_reg=&pointer_register($flavour,$temp);
1038     $code.=<<___;
1039         # borrow $one for number of blocks
1040         mov     `$inp_elm_size*$i+2*$ptr_size-$inp_elm_size*4`($inp),$one
1041         # input pointer
1042         mov     `$inp_elm_size*$i+0-$inp_elm_size*4`($inp),$ptr_reg
1043         cmp     $num,$one
1044         # output pointer
1045         mov     `$inp_elm_size*$i+$ptr_size-$inp_elm_size*4`($inp),$temp_reg
1046         cmovg   $one,$num                       # find maximum
1047         test    $one,$one
1048         # load IV
1049         vmovdqu `$inp_elm_size*$i+2*$ptr_size+8-$inp_elm_size*4`($inp),@out[$i]
1050         mov     $one,`32+4*$i`(%rsp)            # initialize counters
1051         cmovle  %rsp,@ptr[$i]                   # cancel input
1052         sub     @ptr[$i],$temp                  # distance between input and output
1053         mov     $temp,`64+8*$i`(%rsp)           # initialize distances
1054         vmovdqu @out[$i],`192+16*$i`(%rsp)      # offload IV
1055 ___
1056 }
1057 $code.=<<___;
1058         test    $num,$num
1059         jz      .Ldec8x_done
1060
1061         vmovups 0x10-0x78($key),$rndkey1
1062         vmovups 0x20-0x78($key),$rndkey0
1063         mov     0xf0-0x78($key),$rounds
1064          lea    192+128(%rsp),$offload          # offload area
1065
1066         vmovdqu (@ptr[0]),@out[0]               # load inputs
1067         vmovdqu (@ptr[1]),@out[1]
1068         vmovdqu (@ptr[2]),@out[2]
1069         vmovdqu (@ptr[3]),@out[3]
1070         vmovdqu (@ptr[4]),@out[4]
1071         vmovdqu (@ptr[5]),@out[5]
1072         vmovdqu (@ptr[6]),@out[6]
1073         vmovdqu (@ptr[7]),@out[7]
1074         vmovdqu @out[0],0x00($offload)          # offload inputs
1075         vpxor   $zero,@out[0],@out[0]           # xor inputs with 0-round
1076         vmovdqu @out[1],0x10($offload)
1077         vpxor   $zero,@out[1],@out[1]
1078         vmovdqu @out[2],0x20($offload)
1079         vpxor   $zero,@out[2],@out[2]
1080         vmovdqu @out[3],0x30($offload)
1081         vpxor   $zero,@out[3],@out[3]
1082         vmovdqu @out[4],0x40($offload)
1083         vpxor   $zero,@out[4],@out[4]
1084         vmovdqu @out[5],0x50($offload)
1085         vpxor   $zero,@out[5],@out[5]
1086         vmovdqu @out[6],0x60($offload)
1087         vpxor   $zero,@out[6],@out[6]
1088         vmovdqu @out[7],0x70($offload)
1089         vpxor   $zero,@out[7],@out[7]
1090         xor     \$0x80,$offload
1091         mov     \$1,$one                        # constant of 1
1092         jmp     .Loop_dec8x
1093
1094 .align  32
1095 .Loop_dec8x:
1096 ___
# Body of .Loop_dec8x: each of the 8 iterations emits one vaesdec round over
# all eight @out registers, interleaved with the bookkeeping for lane $i
# (counter check, cancelling the input / sinking the output, loading the next
# input block, updating the stored distance, switching the pointer to output).
1097 for($i=0;$i<8;$i++) {
# Even iterations use $rndkey1 and odd ones $rndkey0; at the end of the
# iteration the register just used is reloaded with the key at
# 16*(3+$i)-0x78($key).
1098 my $rndkey=($i&1)?$rndkey0:$rndkey1;
1099 $code.=<<___;
1100         vaesdec         $rndkey,@out[0],@out[0]
1101          cmp            32+4*$i(%rsp),$one
1102 ___
# Lane 0's distance is not loaded here; the tail pre-loads it from 64(%rsp)
# ("pre-load 1st offset").
1103 $code.=<<___ if ($i);
1104          mov            64+8*$i(%rsp),$offset
1105 ___
1106 $code.=<<___;
1107         vaesdec         $rndkey,@out[1],@out[1]
1108         prefetcht0      31(@ptr[$i])                    # prefetch input
1109         vaesdec         $rndkey,@out[2],@out[2]
1110 ___
# The output prefetch trails by two lanes, so it starts at iteration 2.
1111 $code.=<<___ if ($i>1);
1112         prefetcht0      15(@ptr[$i-2])                  # prefetch output
1113 ___
1114 $code.=<<___;
1115         vaesdec         $rndkey,@out[3],@out[3]
1116          lea            (@ptr[$i],$offset),$offset
1117          cmovge         %rsp,@ptr[$i]                   # cancel input
1118         vaesdec         $rndkey,@out[4],@out[4]
1119          cmovg          %rsp,$offset                    # sink output
1120         vaesdec         $rndkey,@out[5],@out[5]
1121          sub            @ptr[$i],$offset
1122         vaesdec         $rndkey,@out[6],@out[6]
1123          vmovdqu        16(@ptr[$i]),@inp[$i%4]         # load input
1124          mov            $offset,64+8*$i(%rsp)
1125         vaesdec         $rndkey,@out[7],@out[7]
1126         vmovups         `16*(3+$i)-0x78`($key),$rndkey
1127          lea            16(@ptr[$i],$offset),@ptr[$i]   # switch to output
1128 ___
# @inp[] is indexed modulo 4, so the blocks loaded for lanes 0-3 are spilled
# to 128+16*$i(%rsp) before lanes 4-7 reuse the same @inp entries.
1129 $code.=<<___ if ($i<4);
1130          vmovdqu        @inp[$i%4],`128+16*$i`(%rsp)    # off-load
1131 ___
1132 }
# $i is 8 once the loop above has finished, so @ptr[$i-2] and @ptr[$i-1] in
# the heredoc below name lanes 6 and 7, whose output prefetch is still due.
1133 $code.=<<___;
1134          vmovdqu        32(%rsp),$counters
1135         prefetcht0      15(@ptr[$i-2])                  # prefetch output
1136         prefetcht0      15(@ptr[$i-1])
1137         cmp     \$11,$rounds
1138         jb      .Ldec8x_tail
1139
1140         vaesdec         $rndkey1,@out[0],@out[0]
1141         vaesdec         $rndkey1,@out[1],@out[1]
1142         vaesdec         $rndkey1,@out[2],@out[2]
1143         vaesdec         $rndkey1,@out[3],@out[3]
1144         vaesdec         $rndkey1,@out[4],@out[4]
1145         vaesdec         $rndkey1,@out[5],@out[5]
1146         vaesdec         $rndkey1,@out[6],@out[6]
1147         vaesdec         $rndkey1,@out[7],@out[7]
1148         vmovups         0xb0-0x78($key),$rndkey1
1149
1150         vaesdec         $rndkey0,@out[0],@out[0]
1151         vaesdec         $rndkey0,@out[1],@out[1]
1152         vaesdec         $rndkey0,@out[2],@out[2]
1153         vaesdec         $rndkey0,@out[3],@out[3]
1154         vaesdec         $rndkey0,@out[4],@out[4]
1155         vaesdec         $rndkey0,@out[5],@out[5]
1156         vaesdec         $rndkey0,@out[6],@out[6]
1157         vaesdec         $rndkey0,@out[7],@out[7]
1158         vmovups         0xc0-0x78($key),$rndkey0
1159         je      .Ldec8x_tail
1160
1161         vaesdec         $rndkey1,@out[0],@out[0]
1162         vaesdec         $rndkey1,@out[1],@out[1]
1163         vaesdec         $rndkey1,@out[2],@out[2]
1164         vaesdec         $rndkey1,@out[3],@out[3]
1165         vaesdec         $rndkey1,@out[4],@out[4]
1166         vaesdec         $rndkey1,@out[5],@out[5]
1167         vaesdec         $rndkey1,@out[6],@out[6]
1168         vaesdec         $rndkey1,@out[7],@out[7]
1169         vmovups         0xd0-0x78($key),$rndkey1
1170
1171         vaesdec         $rndkey0,@out[0],@out[0]
1172         vaesdec         $rndkey0,@out[1],@out[1]
1173         vaesdec         $rndkey0,@out[2],@out[2]
1174         vaesdec         $rndkey0,@out[3],@out[3]
1175         vaesdec         $rndkey0,@out[4],@out[4]
1176         vaesdec         $rndkey0,@out[5],@out[5]
1177         vaesdec         $rndkey0,@out[6],@out[6]
1178         vaesdec         $rndkey0,@out[7],@out[7]
1179         vmovups         0xe0-0x78($key),$rndkey0
1180
1181 .Ldec8x_tail:
1182         vaesdec         $rndkey1,@out[0],@out[0]
1183          vpxor          $zero,$zero,$zero
1184         vaesdec         $rndkey1,@out[1],@out[1]
1185         vaesdec         $rndkey1,@out[2],@out[2]
1186          vpcmpgtd       $zero,$counters,$zero
1187         vaesdec         $rndkey1,@out[3],@out[3]
1188         vaesdec         $rndkey1,@out[4],@out[4]
1189          vpaddd         $counters,$zero,$zero           # decrement counters
1190          vmovdqu        48(%rsp),$counters
1191         vaesdec         $rndkey1,@out[5],@out[5]
1192          mov            64(%rsp),$offset                # pre-load 1st offset
1193         vaesdec         $rndkey1,@out[6],@out[6]
1194         vaesdec         $rndkey1,@out[7],@out[7]
1195         vmovups         0x10-0x78($key),$rndkey1
1196
1197         vaesdeclast     $rndkey0,@out[0],@out[0]
1198          vmovdqa        $zero,32(%rsp)                  # update counters
1199          vpxor          $zero,$zero,$zero
1200         vaesdeclast     $rndkey0,@out[1],@out[1]
1201         vpxor           0x00($offload),@out[0],@out[0]  # xor with IV
1202         vaesdeclast     $rndkey0,@out[2],@out[2]
1203         vpxor           0x10($offload),@out[1],@out[1]
1204          vpcmpgtd       $zero,$counters,$zero
1205         vaesdeclast     $rndkey0,@out[3],@out[3]
1206         vpxor           0x20($offload),@out[2],@out[2]
1207         vaesdeclast     $rndkey0,@out[4],@out[4]
1208         vpxor           0x30($offload),@out[3],@out[3]
1209          vpaddd         $zero,$counters,$counters       # decrement counters
1210          vmovdqu        -0x78($key),$zero               # 0-round
1211         vaesdeclast     $rndkey0,@out[5],@out[5]
1212         vpxor           0x40($offload),@out[4],@out[4]
1213         vaesdeclast     $rndkey0,@out[6],@out[6]
1214         vpxor           0x50($offload),@out[5],@out[5]
1215          vmovdqa        $counters,48(%rsp)              # update counters
1216         vaesdeclast     $rndkey0,@out[7],@out[7]
1217         vpxor           0x60($offload),@out[6],@out[6]
1218         vmovups         0x20-0x78($key),$rndkey0
1219
1220         vmovups         @out[0],-16(@ptr[0])            # write output
1221          sub            $offset,@ptr[0]                 # switch to input
1222          vmovdqu        128+0(%rsp),@out[0]
1223         vpxor           0x70($offload),@out[7],@out[7]
1224         vmovups         @out[1],-16(@ptr[1])
1225          sub            `64+1*8`(%rsp),@ptr[1]
1226          vmovdqu        @out[0],0x00($offload)
1227          vpxor          $zero,@out[0],@out[0]
1228          vmovdqu        128+16(%rsp),@out[1]
1229         vmovups         @out[2],-16(@ptr[2])
1230          sub            `64+2*8`(%rsp),@ptr[2]
1231          vmovdqu        @out[1],0x10($offload)
1232          vpxor          $zero,@out[1],@out[1]
1233          vmovdqu        128+32(%rsp),@out[2]
1234         vmovups         @out[3],-16(@ptr[3])
1235          sub            `64+3*8`(%rsp),@ptr[3]
1236          vmovdqu        @out[2],0x20($offload)
1237          vpxor          $zero,@out[2],@out[2]
1238          vmovdqu        128+48(%rsp),@out[3]
1239         vmovups         @out[4],-16(@ptr[4])
1240          sub            `64+4*8`(%rsp),@ptr[4]
1241          vmovdqu        @out[3],0x30($offload)
1242          vpxor          $zero,@out[3],@out[3]
1243          vmovdqu        @inp[0],0x40($offload)
1244          vpxor          @inp[0],$zero,@out[4]
1245         vmovups         @out[5],-16(@ptr[5])
1246          sub            `64+5*8`(%rsp),@ptr[5]
1247          vmovdqu        @inp[1],0x50($offload)
1248          vpxor          @inp[1],$zero,@out[5]
1249         vmovups         @out[6],-16(@ptr[6])
1250          sub            `64+6*8`(%rsp),@ptr[6]
1251          vmovdqu        @inp[2],0x60($offload)
1252          vpxor          @inp[2],$zero,@out[6]
1253         vmovups         @out[7],-16(@ptr[7])
1254          sub            `64+7*8`(%rsp),@ptr[7]
1255          vmovdqu        @inp[3],0x70($offload)
1256          vpxor          @inp[3],$zero,@out[7]
1257
1258         xor     \$128,$offload
1259         dec     $num
1260         jnz     .Loop_dec8x
1261
1262         mov     16(%rsp),%rax                   # original %rsp
1263 .cfi_def_cfa    %rax,8
1264         #mov    24(%rsp),$num
1265         #lea    `$inp_elm_size*8`($inp),$inp
1266         #dec    $num
1267         #jnz    .Ldec8x_loop_grande
1268
1269 .Ldec8x_done:
1270         vzeroupper
1271 ___
# Win64 only: reload %xmm6-%xmm15 from their slots below the original stack
# pointer, which is in %rax at this point.
1272 $code.=<<___ if ($win64);
1273         movaps  -0xd8(%rax),%xmm6
1274         movaps  -0xc8(%rax),%xmm7
1275         movaps  -0xb8(%rax),%xmm8
1276         movaps  -0xa8(%rax),%xmm9
1277         movaps  -0x98(%rax),%xmm10
1278         movaps  -0x88(%rax),%xmm11
1279         movaps  -0x78(%rax),%xmm12
1280         movaps  -0x68(%rax),%xmm13
1281         movaps  -0x58(%rax),%xmm14
1282         movaps  -0x48(%rax),%xmm15
1283 ___
# Common epilogue: restore the pushed general-purpose registers relative to
# %rax, then put the original stack pointer back into %rsp and return.
1284 $code.=<<___;
1285         mov     -48(%rax),%r15
1286 .cfi_restore    %r15
1287         mov     -40(%rax),%r14
1288 .cfi_restore    %r14
1289         mov     -32(%rax),%r13
1290 .cfi_restore    %r13
1291         mov     -24(%rax),%r12
1292 .cfi_restore    %r12
1293         mov     -16(%rax),%rbp
1294 .cfi_restore    %rbp
1295         mov     -8(%rax),%rbx
1296 .cfi_restore    %rbx
1297         lea     (%rax),%rsp
1298 .cfi_def_cfa_register   %rsp
1299 .Ldec8x_epilogue:
1300         ret
1301 .cfi_endproc
1302 .size   aesni_multi_cbc_decrypt_avx,.-aesni_multi_cbc_decrypt_avx
1303 ___
1304                                                 }}}
1305
# Win64 only: emit the structured-exception handler se_handler together with
# the .pdata/.xdata tables that attach it to the multi-buffer routines.
1306 if ($win64) {
1307 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1308 #               CONTEXT *context,DISPATCHER_CONTEXT *disp)
1309 $rec="%rcx";
1310 $frame="%rdx";
1311 $context="%r8";
1312 $disp="%r9";
1313
# se_handler compares context->Rip with the two labels passed in HandlerData[]
# (body and epilogue).  When Rip lies between them it fetches the original
# stack pointer saved at 16(%rsp), copies the saved rbx, rbp and r12-r15 back
# into the CONTEXT and copies 20 quadwords into the area starting at
# context.Xmm6.  In every case it then restores Rsp/Rsi/Rdi in the CONTEXT,
# copies the CONTEXT to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.
1314 $code.=<<___;
1315 .extern __imp_RtlVirtualUnwind
1316 .type   se_handler,\@abi-omnipotent
1317 .align  16
1318 se_handler:
1319         push    %rsi
1320         push    %rdi
1321         push    %rbx
1322         push    %rbp
1323         push    %r12
1324         push    %r13
1325         push    %r14
1326         push    %r15
1327         pushfq
1328         sub     \$64,%rsp
1329
1330         mov     120($context),%rax      # pull context->Rax
1331         mov     248($context),%rbx      # pull context->Rip
1332
1333         mov     8($disp),%rsi           # disp->ImageBase
1334         mov     56($disp),%r11          # disp->HandlerData
1335
1336         mov     0(%r11),%r10d           # HandlerData[0]
1337         lea     (%rsi,%r10),%r10        # prologue label
1338         cmp     %r10,%rbx               # context->Rip<.Lprologue
1339         jb      .Lin_prologue
1340
1341         mov     152($context),%rax      # pull context->Rsp
1342
1343         mov     4(%r11),%r10d           # HandlerData[1]
1344         lea     (%rsi,%r10),%r10        # epilogue label
1345         cmp     %r10,%rbx               # context->Rip>=.Lepilogue
1346         jae     .Lin_prologue
1347
1348         mov     16(%rax),%rax           # pull saved stack pointer
1349
1350         mov     -8(%rax),%rbx
1351         mov     -16(%rax),%rbp
1352         mov     -24(%rax),%r12
1353         mov     -32(%rax),%r13
1354         mov     -40(%rax),%r14
1355         mov     -48(%rax),%r15
1356         mov     %rbx,144($context)      # restore context->Rbx
1357         mov     %rbp,160($context)      # restore context->Rbp
1358         mov     %r12,216($context)      # restore context->R12
1359         mov     %r13,224($context)      # restore context->R13
1360         mov     %r14,232($context)      # restore context->R14
1361         mov     %r15,240($context)      # restore context->R15
1362
1363         lea     -56-10*16(%rax),%rsi
1364         lea     512($context),%rdi      # &context.Xmm6
1365         mov     \$20,%ecx
1366         .long   0xa548f3fc              # cld; rep movsq
1367
1368 .Lin_prologue:
1369         mov     8(%rax),%rdi
1370         mov     16(%rax),%rsi
1371         mov     %rax,152($context)      # restore context->Rsp
1372         mov     %rsi,168($context)      # restore context->Rsi
1373         mov     %rdi,176($context)      # restore context->Rdi
1374
1375         mov     40($disp),%rdi          # disp->ContextRecord
1376         mov     $context,%rsi           # context
1377         mov     \$154,%ecx              # sizeof(CONTEXT)
1378         .long   0xa548f3fc              # cld; rep movsq
1379
1380         mov     $disp,%rsi
1381         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1382         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1383         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1384         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1385         mov     40(%rsi),%r10           # disp->ContextRecord
1386         lea     56(%rsi),%r11           # &disp->HandlerData
1387         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1388         mov     %r10,32(%rsp)           # arg5
1389         mov     %r11,40(%rsp)           # arg6
1390         mov     %r12,48(%rsp)           # arg7
1391         mov     %rcx,56(%rsp)           # arg8, (NULL)
1392         call    *__imp_RtlVirtualUnwind(%rip)
1393
1394         mov     \$1,%eax                # ExceptionContinueSearch
1395         add     \$64,%rsp
1396         popfq
1397         pop     %r15
1398         pop     %r14
1399         pop     %r13
1400         pop     %r12
1401         pop     %rbp
1402         pop     %rbx
1403         pop     %rdi
1404         pop     %rsi
1405         ret
1406 .size   se_handler,.-se_handler
1407
1408 .section        .pdata
1409 .align  4
1410         .rva    .LSEH_begin_aesni_multi_cbc_encrypt
1411         .rva    .LSEH_end_aesni_multi_cbc_encrypt
1412         .rva    .LSEH_info_aesni_multi_cbc_encrypt
1413         .rva    .LSEH_begin_aesni_multi_cbc_decrypt
1414         .rva    .LSEH_end_aesni_multi_cbc_decrypt
1415         .rva    .LSEH_info_aesni_multi_cbc_decrypt
1416 ___
# The .pdata entries for the AVX routines are emitted only when $avx is set.
1417 $code.=<<___ if ($avx);
1418         .rva    .LSEH_begin_aesni_multi_cbc_encrypt_avx
1419         .rva    .LSEH_end_aesni_multi_cbc_encrypt_avx
1420         .rva    .LSEH_info_aesni_multi_cbc_encrypt_avx
1421         .rva    .LSEH_begin_aesni_multi_cbc_decrypt_avx
1422         .rva    .LSEH_end_aesni_multi_cbc_decrypt_avx
1423         .rva    .LSEH_info_aesni_multi_cbc_decrypt_avx
1424 ___
# .xdata: every info record names se_handler and supplies the routine's body
# and epilogue labels as HandlerData[].
1425 $code.=<<___;
1426 .section        .xdata
1427 .align  8
1428 .LSEH_info_aesni_multi_cbc_encrypt:
1429         .byte   9,0,0,0
1430         .rva    se_handler
1431         .rva    .Lenc4x_body,.Lenc4x_epilogue           # HandlerData[]
1432 .LSEH_info_aesni_multi_cbc_decrypt:
1433         .byte   9,0,0,0
1434         .rva    se_handler
1435         .rva    .Ldec4x_body,.Ldec4x_epilogue           # HandlerData[]
1436 ___
# Matching info records for the AVX routines, again only when $avx is set.
1437 $code.=<<___ if ($avx);
1438 .LSEH_info_aesni_multi_cbc_encrypt_avx:
1439         .byte   9,0,0,0
1440         .rva    se_handler
1441         .rva    .Lenc8x_body,.Lenc8x_epilogue           # HandlerData[]
1442 .LSEH_info_aesni_multi_cbc_decrypt_avx:
1443         .byte   9,0,0,0
1444         .rva    se_handler
1445         .rva    .Ldec8x_body,.Ldec8x_epilogue           # HandlerData[]
1446 ___
1447 }
1448 ####################################################################
1449
# rex(\@bytes,$reg,$rm) -- append a REX prefix to @bytes if one is required.
#
# $reg and $rm are the register numbers the caller places in the reg and
# r/m fields of the ModR/M byte.  A number of 8 or above sets REX.R (0x04)
# or REX.B (0x01) respectively; when neither bit is needed nothing is
# appended.
sub rex {
  my ($bytes,$reg,$rm)=@_;
  my $prefix=0;

    $prefix|=0x04               if($reg>=8);
    $prefix|=0x01               if($rm>=8);
    push @{$bytes},0x40|$prefix if($prefix);
}
1459
# aesni($line) -- hand-assemble one AES-NI instruction.
#
# $line holds a single instruction in AT&T syntax.  For the forms
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# the return value is a ".byte" directive listing the instruction's
# machine code.  Every other input is returned unchanged, including a
# mnemonic that is missing from the opcode table and a %rsp-relative
# operand whose displacement is not a plain literal in the range 0..0x7f.
# Such lines are left for the assembler to encode or reject, rather than
# being dropped or emitted with a wrong address.
sub aesni {
  my $line=shift;
  my @opcode=(0x66);            # every form starts with the 0x66 prefix

    if ($line=~/(aeskeygenassist)\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($imm,$src,$dst)=($2,$3,$4);
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x3a,0xdf;
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        # a leading 0 marks an octal or 0x-hex immediate
        push @opcode,$imm=~/^0/?oct($imm):$imm;
        return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
        my ($mnemonic,$src,$dst)=($1,$2,$3);
        my %opcodelet = (
                "aesimc" => 0xdb,
                "aesenc" => 0xdc,       "aesenclast" => 0xdd,
                "aesdec" => 0xde,       "aesdeclast" => 0xdf
        );
        # unknown mnemonic: keep the text so that it is not lost
        return $line if (!defined($opcodelet{$mnemonic}));
        rex(\@opcode,$dst,$src);
        push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
        push @opcode,0xc0|($src&7)|(($dst&7)<<3);       # ModR/M
        return ".byte\t".join(',',@opcode);
    }
    elsif ($line=~/(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
        my ($mnemonic,$off,$dst)=($1,$2,$3);
        my %opcodelet = (
                "aesenc" => 0xdc,       "aesenclast" => 0xdd,
                "aesdec" => 0xde,       "aesdeclast" => 0xdf
        );
        return $line if (!defined($opcodelet{$mnemonic}));
        # The encoding below carries an 8-bit displacement, which the CPU
        # sign-extends.  Accept only an empty, octal, decimal or 0x-hex
        # literal that fits in 0..0x7f.
        return $line if ($off!~/^(?:0x[0-9a-fA-F]+|0[0-7]*|[1-9][0-9]*)?$/);
        my $disp=($off eq "")?0:($off=~/^0/?oct($off):$off);
        return $line if ($disp>0x7f);
        push @opcode,0x44 if ($dst>=8);                 # REX.R
        push @opcode,0x0f,0x38,$opcodelet{$mnemonic};
        push @opcode,0x44|(($dst&7)<<3),0x24;           # ModR/M, SIB
        push @opcode,$disp;
        return ".byte\t".join(',',@opcode);
    }
    return $line;
}
1499
# Replace every `...` span in the generated text with the result of
# evaluating its contents as Perl (e.g. `64+1*8` becomes 72).
1500 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
# Pass each instruction that begins with "aes" at a word boundary and names
# an %xmm register to aesni().  The match extends to the end of the line, so
# whatever follows the last %xmm operand is replaced as well.  Mnemonics such
# as vaesenc are not matched, since there is no word boundary before "aes".
1501 $code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
1502
1503 print $code;
# Die if STDOUT cannot be closed cleanly.
1504 close STDOUT or die "error closing STDOUT: $!";