modes/asm/*-x86_64.pl: add CFI annotations.
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such a way that its
23 # performance is rather close to the sum of its streamed components,
24 # in this context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
39 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
40
# Command line is "[flavour] output".  A first argument that contains a
# dot is taken to be the output file name, i.e. the flavour was omitted.
41 $flavour = shift;
42 $output  = shift;
43 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
44
# Win64 is selected by a nasm/masm/mingw64 flavour or by an .asm output name.
45 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
46
# Locate the perlasm translator: beside this script first, then in
# ../../perlasm relative to it.
47 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
48 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
49 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
50 die "can't locate x86_64-xlate.pl";
51
# Assembler capability probes.  Each one derives $avx (0, 1 or 2) from the
# version string the tool prints; the stitched code further down is only
# generated when $avx>1.  Every probe after the first runs only while $avx
# is still false.
52 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
53                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.20) + ($1>=2.22);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
58             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
59         $avx = ($1>=2.09) + ($1>=2.10);
60 }
61
62 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
63             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
64         $avx = ($1>=10) + ($1>=11);
65 }
66
# The major version is matched as [0-9]+ rather than a single [3-9] digit so
# that two-digit releases (clang 10 and later) are recognised; with the
# single-digit pattern they never matched and $avx stayed 0.
67 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
68         $avx = ($2>=3.0) + ($2>3.0);
69 }
70
# Everything printed to STDOUT from here on is piped through the translator.
# Stop with a diagnostic if the pipe cannot be set up instead of silently
# producing no output.
71 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
72 *STDOUT=*OUT;
73
74 if ($avx>1) {{{
75
# Symbolic register names interpolated into the assembly templates below.
#
# Function arguments, in the order of the aesni_gcm_[en|de]crypt prototype
# (inp, out, len, key, iv, Xip); these are the six System V integer
# argument registers.
76 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
77
# %xmm0-%xmm8: hashing temporaries ($Ii, $T1, $T2, $Hkey, $Z0-$Z3) and the
# hash accumulator $Xi.  Several of them are "borrowed" for other purposes
# at points marked in the code.
78 ($Ii,$T1,$T2,$Hkey,
79  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
80
# %xmm9-%xmm14: the six blocks being encrypted in parallel; %xmm15: round key.
81 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
82
# $counter: 32-bit word loaded from offset 12 of the counter block.
# $rounds:  32-bit word loaded from offset 0xf0 of the key structure.
# $ret:     byte count that is eventually moved to %rax as the return value.
# $const:   address of .Lbswap_mask (base of the constant table).
# $in0/$end0: read pointer and limit for the data being hashed; both are
#           also borrowed as scratch during stack setup.
83 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
84
85 $code=<<___;
86 .text
87
# _aesni_ctr32_ghash_6x -- inner loop of the AES-CTR + GHASH stitch.
#
# Each pass through .Loop6x pushes six counter blocks through the AES
# rounds (the vaesenc stream, indented by two extra columns) and, interleaved
# with that, multiplies six previously byte-swapped blocks I[5]..I[0] by
# powers of the hash key with vpclmulqdq and reduces the result.
#
# Internal helper, called from aesni_gcm_decrypt and aesni_gcm_encrypt with
# the following state already set up by the caller:
#   $inp, $out   source and destination; both advance by 0x60 per pass
#   $len         number of 16-byte blocks (callers shift the byte length
#                right by 4); decremented by 6 on entry and on every pass
#   $key         key schedule pointer biased by +0x80 (round key N is read
#                from N*0x10-0x80 relative to it)
#   $ivp         counter block in memory; the next counter value is stored
#                there on every pass and reloaded in .Lenc_tail
#   $Xip         hash table pointer biased by the caller; key powers are
#                read at offsets relative to -0x20
#   $T1          current counter block
#   $counter     bytes 12..15 of the counter block as a 32-bit integer
#   $rounds      compared against 11 to choose the 128/192/256-bit tail
#   $const       address of .Lbswap_mask; .Lpoly, .Lone_msb, .Ltwo_lsb and
#                .Lone_lsb are read at +0x10, +0x20, +0x30 and +0x40
#   $in0, $end0  pointer to the data being hashed and the limit that decides
#                whether a further 96 bytes may be read ahead
#   $Xi          hash accumulator (byte-swapped form)
#   $Z3          I[5]
#   stack        I[4]..I[0] at 0x30+8 .. 0x70+8 relative to the stack
#                pointer inside this routine; the caller writes them at
#                0x30 .. 0x70, and the extra 8 is the return address pushed
#                by the call instruction
#
# On return $inout0-$inout5 hold the last six output blocks, which have not
# been stored yet (both callers store them at -0x60 .. -0x10 relative to the
# advanced output pointer), $Xi holds the updated hash, and $ret has grown by
# 0x60 for every pass.  %r12 and %r13 are used as scratch.
88 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
89 .align  32
90 _aesni_ctr32_ghash_6x:
            # Derive counter blocks +1..+5 with byte-wise adds of .Lone_msb.
            # .Loop6x recomputes them on the slow path when the add to
            # $counter carries out.
91         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
92         sub             \$6,$len
93         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
94         vmovdqu         0x00-0x80($key),$rndkey
95         vpaddb          $T2,$T1,$inout1
96         vpaddb          $T2,$inout1,$inout2
97         vpaddb          $T2,$inout2,$inout3
98         vpaddb          $T2,$inout3,$inout4
99         vpaddb          $T2,$inout4,$inout5
100         vpxor           $rndkey,$T1,$inout0
101         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
102         jmp             .Loop6x
103
104 .align  32
105 .Loop6x:
            # $counter was loaded from offset 12 of the counter block, so its
            # top byte is byte 15 of that block.  A carry out of this add
            # means the byte-wise increments above are not usable and the
            # counters are rebuilt in .Lhandle_ctr32.
106         add             \$`6<<24`,$counter
107         jc              .Lhandle_ctr32          # discard $inout[1-5]?
108         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
109           vpaddb        $T2,$inout5,$T1         # next counter value
110           vpxor         $rndkey,$inout1,$inout1
111           vpxor         $rndkey,$inout2,$inout2
112
113 .Lresume_ctr32:
114         vmovdqu         $T1,($ivp)              # save next counter value
115         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
116           vpxor         $rndkey,$inout3,$inout3
117           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
118         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
119
120         # At this point, the current block of 96 (0x60) bytes has already been
121         # loaded into registers. Concurrently with processing it, we want to
122         # load the next 96 bytes of input for the next round. Obviously, we can
123         # only do this if there are at least 96 more bytes of input beyond the
124         # input we're currently processing, or else we'd read past the end of
125         # the input buffer. Here, we set |%r12| to 96 if there are at least 96
126         # bytes of input beyond the 96 bytes we're already processing, and we
127         # set |%r12| to 0 otherwise. In the case where we set |%r12| to 96,
128         # we'll read in the next block so that it is in registers for the next
129         # loop iteration. In the case where we set |%r12| to 0, we'll re-read
130         # the current block and then ignore what we re-read.
131         #
132         # At this point, |$in0| points to the current (already read into
133         # registers) block, and |$end0| points to 2*96 bytes before the end of
134         # the input. Thus, |$in0| > |$end0| means that we do not have the next
135         # 96-byte block to read in, and |$in0| <= |$end0| means we do.
136         xor             %r12,%r12
137         cmp             $in0,$end0
138
139           vaesenc       $T2,$inout0,$inout0
140         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
141           vpxor         $rndkey,$inout4,$inout4
142         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
143           vaesenc       $T2,$inout1,$inout1
144           vpxor         $rndkey,$inout5,$inout5
            # The flags from the cmp above are still live here: only vector
            # instructions, which leave the flags alone, were executed since.
145         setnc           %r12b
146         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
147           vaesenc       $T2,$inout2,$inout2
148         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
149         neg             %r12
150           vaesenc       $T2,$inout3,$inout3
151          vpxor          $Z1,$Z2,$Z2
152         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
153          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
154           vaesenc       $T2,$inout4,$inout4
155          vpxor          $Z1,$T1,$Z0
156         and             \$0x60,%r12
157           vmovups       0x20-0x80($key),$rndkey
158         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
159           vaesenc       $T2,$inout5,$inout5
160
161         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
162         lea             ($in0,%r12),$in0
163           vaesenc       $rndkey,$inout0,$inout0
164          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
165         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
166          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
167           vaesenc       $rndkey,$inout1,$inout1
            # From here to the end of the pass, the 96 bytes at the updated
            # read pointer are fetched with movbe, eight bytes at a time, and
            # written to the stack slots from which the next pass reads its
            # I[] values.
168         movbe           0x58($in0),%r13
169           vaesenc       $rndkey,$inout2,$inout2
170         movbe           0x50($in0),%r12
171           vaesenc       $rndkey,$inout3,$inout3
172         mov             %r13,0x20+8(%rsp)
173           vaesenc       $rndkey,$inout4,$inout4
174         mov             %r12,0x28+8(%rsp)
175         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
176           vaesenc       $rndkey,$inout5,$inout5
177
178           vmovups       0x30-0x80($key),$rndkey
179          vpxor          $T1,$Z2,$Z2
180         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
181           vaesenc       $rndkey,$inout0,$inout0
182          vpxor          $T2,$Z2,$Z2
183         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
184           vaesenc       $rndkey,$inout1,$inout1
185          vpxor          $Hkey,$Z3,$Z3
186         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
187           vaesenc       $rndkey,$inout2,$inout2
188         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
189          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
190           vaesenc       $rndkey,$inout3,$inout3
191           vaesenc       $rndkey,$inout4,$inout4
192          vpxor          $T1,$Z0,$Z0
193         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
194           vaesenc       $rndkey,$inout5,$inout5
195
196           vmovups       0x40-0x80($key),$rndkey
197          vpxor          $T2,$Z2,$Z2
198         vpclmulqdq      \$0x00,$T1,$Ii,$T2
199           vaesenc       $rndkey,$inout0,$inout0
200          vpxor          $Hkey,$Z2,$Z2
201         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
202           vaesenc       $rndkey,$inout1,$inout1
203         movbe           0x48($in0),%r13
204          vpxor          $Z1,$Z3,$Z3
205         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
206           vaesenc       $rndkey,$inout2,$inout2
207         movbe           0x40($in0),%r12
208         vpclmulqdq      \$0x11,$T1,$Ii,$T1
209          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
210           vaesenc       $rndkey,$inout3,$inout3
211         mov             %r13,0x30+8(%rsp)
212           vaesenc       $rndkey,$inout4,$inout4
213         mov             %r12,0x38+8(%rsp)
214          vpxor          $T2,$Z0,$Z0
215         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
216           vaesenc       $rndkey,$inout5,$inout5
217
218           vmovups       0x50-0x80($key),$rndkey
219          vpxor          $Hkey,$Z2,$Z2
220         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
221           vaesenc       $rndkey,$inout0,$inout0
222          vpxor          $Z1,$Z2,$Z2
223         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
224           vaesenc       $rndkey,$inout1,$inout1
225         movbe           0x38($in0),%r13
226          vpxor          $T1,$Z3,$Z3
227         vpclmulqdq      \$0x01,$T2,$Ii,$T1
228          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
229           vaesenc       $rndkey,$inout2,$inout2
230         movbe           0x30($in0),%r12
231         vpclmulqdq      \$0x11,$T2,$Ii,$T2
232           vaesenc       $rndkey,$inout3,$inout3
233         mov             %r13,0x40+8(%rsp)
234           vaesenc       $rndkey,$inout4,$inout4
235         mov             %r12,0x48+8(%rsp)
236          vpxor          $Hkey,$Z0,$Z0
237          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
238           vaesenc       $rndkey,$inout5,$inout5
239
240           vmovups       0x60-0x80($key),$rndkey
241          vpxor          $Z1,$Z2,$Z2
242         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
243           vaesenc       $rndkey,$inout0,$inout0
244          vpxor          $T1,$Z2,$Z2
245         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
246           vaesenc       $rndkey,$inout1,$inout1
247         movbe           0x28($in0),%r13
248          vpxor          $T2,$Z3,$Z3
249         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
250           vaesenc       $rndkey,$inout2,$inout2
251         movbe           0x20($in0),%r12
252         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
253           vaesenc       $rndkey,$inout3,$inout3
254         mov             %r13,0x50+8(%rsp)
255           vaesenc       $rndkey,$inout4,$inout4
256         mov             %r12,0x58+8(%rsp)
257         vpxor           $Z1,$Z2,$Z2
258           vaesenc       $rndkey,$inout5,$inout5
259         vpxor           $T1,$Z2,$Z2
260
261           vmovups       0x70-0x80($key),$rndkey
262         vpslldq         \$8,$Z2,$Z1
263         vpxor           $T2,$Z0,$Z0
264         vmovdqu         0x10($const),$Hkey      # .Lpoly
265
266           vaesenc       $rndkey,$inout0,$inout0
267         vpxor           $Xi,$Z3,$Z3
268           vaesenc       $rndkey,$inout1,$inout1
269         vpxor           $Z1,$Z0,$Z0
270         movbe           0x18($in0),%r13
271           vaesenc       $rndkey,$inout2,$inout2
272         movbe           0x10($in0),%r12
273         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
274         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
275         mov             %r13,0x60+8(%rsp)
276           vaesenc       $rndkey,$inout3,$inout3
277         mov             %r12,0x68+8(%rsp)
278           vaesenc       $rndkey,$inout4,$inout4
279           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
280           vaesenc       $rndkey,$inout5,$inout5
281
282           vaesenc       $T1,$inout0,$inout0
283           vmovups       0x90-0x80($key),$rndkey
284           vaesenc       $T1,$inout1,$inout1
285         vpsrldq         \$8,$Z2,$Z2
286           vaesenc       $T1,$inout2,$inout2
287         vpxor           $Z2,$Z3,$Z3
288           vaesenc       $T1,$inout3,$inout3
289         vpxor           $Ii,$Z0,$Z0
290         movbe           0x08($in0),%r13
291           vaesenc       $T1,$inout4,$inout4
292         movbe           0x00($in0),%r12
293           vaesenc       $T1,$inout5,$inout5
294           vmovups       0xa0-0x80($key),$T1
            # Extra rounds for longer keys.  The je further down still tests
            # the flags of this cmp; the vector instructions in between do
            # not modify them.
295           cmp           \$11,$rounds
296           jb            .Lenc_tail              # 128-bit key
297
298           vaesenc       $rndkey,$inout0,$inout0
299           vaesenc       $rndkey,$inout1,$inout1
300           vaesenc       $rndkey,$inout2,$inout2
301           vaesenc       $rndkey,$inout3,$inout3
302           vaesenc       $rndkey,$inout4,$inout4
303           vaesenc       $rndkey,$inout5,$inout5
304
305           vaesenc       $T1,$inout0,$inout0
306           vaesenc       $T1,$inout1,$inout1
307           vaesenc       $T1,$inout2,$inout2
308           vaesenc       $T1,$inout3,$inout3
309           vaesenc       $T1,$inout4,$inout4
310           vmovups       0xb0-0x80($key),$rndkey
311           vaesenc       $T1,$inout5,$inout5
312           vmovups       0xc0-0x80($key),$T1
313           je            .Lenc_tail              # 192-bit key
314
315           vaesenc       $rndkey,$inout0,$inout0
316           vaesenc       $rndkey,$inout1,$inout1
317           vaesenc       $rndkey,$inout2,$inout2
318           vaesenc       $rndkey,$inout3,$inout3
319           vaesenc       $rndkey,$inout4,$inout4
320           vaesenc       $rndkey,$inout5,$inout5
321
322           vaesenc       $T1,$inout0,$inout0
323           vaesenc       $T1,$inout1,$inout1
324           vaesenc       $T1,$inout2,$inout2
325           vaesenc       $T1,$inout3,$inout3
326           vaesenc       $T1,$inout4,$inout4
327           vmovups       0xd0-0x80($key),$rndkey
328           vaesenc       $T1,$inout5,$inout5
329           vmovups       0xe0-0x80($key),$T1
330           jmp           .Lenc_tail              # 256-bit key
331
332 .align  32
333 .Lhandle_ctr32:
            # Slow path for the counter: byte-swap the counter block, build
            # the following values with 32-bit adds of .Lone_lsb/.Ltwo_lsb,
            # swap each one back, then rejoin the main loop.
334         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
335           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
336           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
337           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
338           vpaddd        $Z1,$Z2,$inout2
339         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
340           vpaddd        $Z1,$inout1,$inout3
341           vpshufb       $Ii,$inout1,$inout1
342           vpaddd        $Z1,$inout2,$inout4
343           vpshufb       $Ii,$inout2,$inout2
344           vpxor         $rndkey,$inout1,$inout1
345           vpaddd        $Z1,$inout3,$inout5
346           vpshufb       $Ii,$inout3,$inout3
347           vpxor         $rndkey,$inout2,$inout2
348           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
349           vpshufb       $Ii,$inout4,$inout4
350           vpshufb       $Ii,$inout5,$inout5
351           vpshufb       $Ii,$T1,$T1             # next counter value
352         jmp             .Lresume_ctr32
353
354 .align  32
355 .Lenc_tail:
            # On every path into this label $T1 holds the last round key
            # loaded so far.  It is xored with the six input blocks first, so
            # that each vaesenclast below performs the final AES round and
            # the xor with the input in a single instruction.
356           vaesenc       $rndkey,$inout0,$inout0
357         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
358         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
359           vaesenc       $rndkey,$inout1,$inout1
360         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
361           vpxor         0x00($inp),$T1,$T2
362           vaesenc       $rndkey,$inout2,$inout2
363           vpxor         0x10($inp),$T1,$Ii
364           vaesenc       $rndkey,$inout3,$inout3
365           vpxor         0x20($inp),$T1,$Z1
366           vaesenc       $rndkey,$inout4,$inout4
367           vpxor         0x30($inp),$T1,$Z2
368           vaesenc       $rndkey,$inout5,$inout5
369           vpxor         0x40($inp),$T1,$Z3
370           vpxor         0x50($inp),$T1,$Hkey
371           vmovdqu       ($ivp),$T1              # load next counter value
372
373           vaesenclast   $T2,$inout0,$inout0
374           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
375           vaesenclast   $Ii,$inout1,$inout1
376          vpaddb         $T2,$T1,$Ii
377         mov             %r13,0x70+8(%rsp)
378         lea             0x60($inp),$inp
379           vaesenclast   $Z1,$inout2,$inout2
380          vpaddb         $T2,$Ii,$Z1
381         mov             %r12,0x78+8(%rsp)
382         lea             0x60($out),$out
383           vmovdqu       0x00-0x80($key),$rndkey
384           vaesenclast   $Z2,$inout3,$inout3
385          vpaddb         $T2,$Z1,$Z2
386           vaesenclast   $Z3, $inout4,$inout4
387          vpaddb         $T2,$Z2,$Z3
388           vaesenclast   $Hkey,$inout5,$inout5
389          vpaddb         $T2,$Z3,$Hkey
390
            # Account for the 0x60 bytes just produced.  A borrow from the
            # block counter means there is no further complete group of six
            # blocks; the outputs then stay in registers for the caller.
391         add             \$0x60,$ret
392         sub             \$0x6,$len
393         jc              .L6x_done
394
            # Another pass follows: store the outputs and move the counter
            # blocks prepared above into place for it.
395           vmovups       $inout0,-0x60($out)     # save output
396          vpxor          $rndkey,$T1,$inout0
397           vmovups       $inout1,-0x50($out)
398          vmovdqa        $Ii,$inout1             # 0 latency
399           vmovups       $inout2,-0x40($out)
400          vmovdqa        $Z1,$inout2             # 0 latency
401           vmovups       $inout3,-0x30($out)
402          vmovdqa        $Z2,$inout3             # 0 latency
403           vmovups       $inout4,-0x20($out)
404          vmovdqa        $Z3,$inout4             # 0 latency
405           vmovups       $inout5,-0x10($out)
406          vmovdqa        $Hkey,$inout5           # 0 latency
407         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
408         jmp             .Loop6x
409
410 .L6x_done:
            # Apply the two xors that the loop would otherwise have performed
            # at the start of its next pass.
411         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
412         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
413
414         ret
415 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
416 ___
417 ######################################################################
418 #
419 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
420 #               const AES_KEY *key, unsigned char iv[16],
421 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
422 $code.=<<___;
# aesni_gcm_decrypt -- bulk AES-GCM decryption entry point (prototype in
# the comment block above).
#
# Returns in %rax the number of bytes processed.  Inputs shorter than 96
# (0x60) bytes are rejected up front with a return value of 0 and nothing
# touched; otherwise the count grows by 0x60 for every pass of
# _aesni_ctr32_ghash_6x, so it is always a multiple of 96.
#
# Frame: the entry stack pointer is kept in %rax for the whole function and
# serves as the CFA register; %rbx, %rbp and %r12-%r15 are pushed below it.
# The working area addressed through %rsp is carved out underneath and is
# 128-byte aligned.
423 .globl  aesni_gcm_decrypt
424 .type   aesni_gcm_decrypt,\@function,6
425 .align  32
426 aesni_gcm_decrypt:
427 .cfi_startproc
428         xor     $ret,$ret
429
430         # We call |_aesni_ctr32_ghash_6x|, which requires at least 96 (0x60)
431         # bytes of input.
432         cmp     \$0x60,$len                     # minimal accepted length
433         jb      .Lgcm_dec_abort
434
435         lea     (%rsp),%rax                     # save stack pointer
436 .cfi_def_cfa_register   %rax
437         push    %rbx
438 .cfi_push       %rbx
439         push    %rbp
440 .cfi_push       %rbp
441         push    %r12
442 .cfi_push       %r12
443         push    %r13
444 .cfi_push       %r13
445         push    %r14
446 .cfi_push       %r14
447         push    %r15
448 .cfi_push       %r15
449 ___
450 $code.=<<___ if ($win64);
        # Win64 only: %xmm6-%xmm15 are callee-saved under the Microsoft x64
        # convention.  They are stored in the 0xa8 bytes reserved directly
        # below the six pushed registers, addressed through %rax.
451         lea     -0xa8(%rsp),%rsp
452         movaps  %xmm6,-0xd8(%rax)
453         movaps  %xmm7,-0xc8(%rax)
454         movaps  %xmm8,-0xb8(%rax)
455         movaps  %xmm9,-0xa8(%rax)
456         movaps  %xmm10,-0x98(%rax)
457         movaps  %xmm11,-0x88(%rax)
458         movaps  %xmm12,-0x78(%rax)
459         movaps  %xmm13,-0x68(%rax)
460         movaps  %xmm14,-0x58(%rax)
461         movaps  %xmm15,-0x48(%rax)
462 .Lgcm_dec_body:
463 ___
464 $code.=<<___;
465         vzeroupper
466
        # Load the counter block and hash value, reserve and align the
        # working area, and bias the key and table pointers so that the
        # helper can use short displacements.
467         vmovdqu         ($ivp),$T1              # input counter value
468         add             \$-128,%rsp
469         mov             12($ivp),$counter
470         lea             .Lbswap_mask(%rip),$const
471         lea             -0x80($key),$in0        # borrow $in0
472         mov             \$0xf80,$end0           # borrow $end0
473         vmovdqu         ($Xip),$Xi              # load Xi
474         and             \$-128,%rsp             # ensure stack alignment
475         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
476         lea             0x80($key),$key         # size optimization
477         lea             0x20+0x20($Xip),$Xip    # size optimization
478         mov             0xf0-0x80($key),$rounds
479         vpshufb         $Ii,$Xi,$Xi
480
        # Compare bits 7..11 (mask 0xf80) of the aligned stack pointer with
        # the same bits of key-0x80.  If the stack value is at or above the
        # key value by less than 768, lower the stack pointer by exactly that
        # difference.  NOTE(review): presumably this avoids 4K-aliasing
        # between stack and key schedule accesses -- confirm; the code only
        # says "avoid aliasing with key".
481         and             $end0,$in0
482         and             %rsp,$end0
483         sub             $in0,$end0
484         jc              .Ldec_no_key_aliasing
485         cmp             \$768,$end0
486         jnc             .Ldec_no_key_aliasing
487         sub             $end0,%rsp              # avoid aliasing with key
488 .Ldec_no_key_aliasing:
489
        # Prime the helper with the first six input blocks: each one is
        # byte-swapped, I[5] stays in a register and I[4]..I[0] go to the
        # stack slots 0x30..0x70.  The data to be hashed is the input itself,
        # hence the read pointer starts at the input pointer.
490         vmovdqu         0x50($inp),$Z3          # I[5]
491         lea             ($inp),$in0
492         vmovdqu         0x40($inp),$Z0
493
494         # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
495         # bytes before the end of the input. Note, in particular, that this is
496         # correct even if |$len| is not an even multiple of 96 or 16. XXX: This
497         # seems to require that |$inp| + |$len| >= 2*96 (0xc0); i.e. |$inp| must
498         # not be near the very beginning of the address space when |$len| < 2*96
499         # (0xc0).
500         lea             -0xc0($inp,$len),$end0
501
502         vmovdqu         0x30($inp),$Z1
503         shr             \$4,$len
504         xor             $ret,$ret
505         vmovdqu         0x20($inp),$Z2
506          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
507         vmovdqu         0x10($inp),$T2
508          vpshufb        $Ii,$Z0,$Z0
509         vmovdqu         ($inp),$Hkey
510          vpshufb        $Ii,$Z1,$Z1
511         vmovdqu         $Z0,0x30(%rsp)
512          vpshufb        $Ii,$Z2,$Z2
513         vmovdqu         $Z1,0x40(%rsp)
514          vpshufb        $Ii,$T2,$T2
515         vmovdqu         $Z2,0x50(%rsp)
516          vpshufb        $Ii,$Hkey,$Hkey
517         vmovdqu         $T2,0x60(%rsp)
518         vmovdqu         $Hkey,0x70(%rsp)
519
520         call            _aesni_ctr32_ghash_6x
521
        # The helper returns with its last six output blocks still in
        # registers and the output pointer already advanced past them.
522         vmovups         $inout0,-0x60($out)     # save output
523         vmovups         $inout1,-0x50($out)
524         vmovups         $inout2,-0x40($out)
525         vmovups         $inout3,-0x30($out)
526         vmovups         $inout4,-0x20($out)
527         vmovups         $inout5,-0x10($out)
528
        # Swap the hash back and store it where it was loaded from; the -0x40
        # undoes the bias added to the table pointer above.
529         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
530         vmovdqu         $Xi,-0x40($Xip)         # output Xi
531
532         vzeroupper
533 ___
534 $code.=<<___ if ($win64);
535         movaps  -0xd8(%rax),%xmm6
536         movaps  -0xc8(%rax),%xmm7
537         movaps  -0xb8(%rax),%xmm8
538         movaps  -0xa8(%rax),%xmm9
539         movaps  -0x98(%rax),%xmm10
540         movaps  -0x88(%rax),%xmm11
541         movaps  -0x78(%rax),%xmm12
542         movaps  -0x68(%rax),%xmm13
543         movaps  -0x58(%rax),%xmm14
544         movaps  -0x48(%rax),%xmm15
545 ___
546 $code.=<<___;
        # Reload the saved registers relative to %rax instead of popping
        # them, since %rsp was realigned and possibly moved after the pushes.
547         mov     -48(%rax),%r15
548 .cfi_restore    %r15
549         mov     -40(%rax),%r14
550 .cfi_restore    %r14
551         mov     -32(%rax),%r13
552 .cfi_restore    %r13
553         mov     -24(%rax),%r12
554 .cfi_restore    %r12
555         mov     -16(%rax),%rbp
556 .cfi_restore    %rbp
557         mov     -8(%rax),%rbx
558 .cfi_restore    %rbx
559         lea     (%rax),%rsp             # restore %rsp
560 .cfi_def_cfa_register   %rsp
        # Also reached directly from the length check at the top, before any
        # register was saved; the byte count is still zero on that path.
561 .Lgcm_dec_abort:
562         mov     $ret,%rax               # return value
563         ret
564 .cfi_endproc
565 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
566 ___
567
568 $code.=<<___;
# _aesni_ctr32_6x -- encrypt six counter blocks and apply them to 96 bytes
# of data, without any hashing.
#
# Internal helper, called twice from aesni_gcm_encrypt before the stitched
# loop takes over.  Uses the same register conventions as that loop:
#   $T1       current counter block in, next counter value out
#   $counter  advanced by 6<<24; a carry out selects .Lhandle_ctr32_2
#   $key      key schedule pointer biased by +0x80
#   $rounds   the round loop runs this many times minus one
#   $const    address of the constant table (.Lone_msb at +0x20)
#   $Ii       must already hold .Lbswap_mask: the slow counter path uses it
#             without loading it (the caller loads it during setup)
#   $inp/$out both advance by 0x60
# The six result blocks are stored at the output pointer and are also left
# in $inout0-$inout5.  %r12 (round key pointer) and %r13 (round counter) are
# used as scratch, as are several of the hashing registers.
569 .type   _aesni_ctr32_6x,\@abi-omnipotent
570 .align  32
571 _aesni_ctr32_6x:
572         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
573         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
574         lea             -1($rounds),%r13
575         vmovups         0x10-0x80($key),$rndkey
576         lea             0x20-0x80($key),%r12
577         vpxor           $Z0,$T1,$inout0
578         add             \$`6<<24`,$counter
579         jc              .Lhandle_ctr32_2
            # Fast path: the next counters are byte-wise increments, each
            # xored with round key 0 as soon as its successor was derived.
580         vpaddb          $T2,$T1,$inout1
581         vpaddb          $T2,$inout1,$inout2
582         vpxor           $Z0,$inout1,$inout1
583         vpaddb          $T2,$inout2,$inout3
584         vpxor           $Z0,$inout2,$inout2
585         vpaddb          $T2,$inout3,$inout4
586         vpxor           $Z0,$inout3,$inout3
587         vpaddb          $T2,$inout4,$inout5
588         vpxor           $Z0,$inout4,$inout4
589         vpaddb          $T2,$inout5,$T1
590         vpxor           $Z0,$inout5,$inout5
591         jmp             .Loop_ctr32
592
593 .align  16
594 .Loop_ctr32:
            # One AES round on all six blocks per iteration; the following
            # round key is fetched through %r12 while %r13 counts down.
595         vaesenc         $rndkey,$inout0,$inout0
596         vaesenc         $rndkey,$inout1,$inout1
597         vaesenc         $rndkey,$inout2,$inout2
598         vaesenc         $rndkey,$inout3,$inout3
599         vaesenc         $rndkey,$inout4,$inout4
600         vaesenc         $rndkey,$inout5,$inout5
601         vmovups         (%r12),$rndkey
602         lea             0x10(%r12),%r12
603         dec             %r13d
604         jnz             .Loop_ctr32
605
            # One more regular round with the key fetched last, interleaved
            # with xoring the final round key into the six input blocks, so
            # that vaesenclast yields input xor keystream directly.
606         vmovdqu         (%r12),$Hkey            # last round key
607         vaesenc         $rndkey,$inout0,$inout0
608         vpxor           0x00($inp),$Hkey,$Z0
609         vaesenc         $rndkey,$inout1,$inout1
610         vpxor           0x10($inp),$Hkey,$Z1
611         vaesenc         $rndkey,$inout2,$inout2
612         vpxor           0x20($inp),$Hkey,$Z2
613         vaesenc         $rndkey,$inout3,$inout3
614         vpxor           0x30($inp),$Hkey,$Xi
615         vaesenc         $rndkey,$inout4,$inout4
616         vpxor           0x40($inp),$Hkey,$T2
617         vaesenc         $rndkey,$inout5,$inout5
618         vpxor           0x50($inp),$Hkey,$Hkey
619         lea             0x60($inp),$inp
620
621         vaesenclast     $Z0,$inout0,$inout0
622         vaesenclast     $Z1,$inout1,$inout1
623         vaesenclast     $Z2,$inout2,$inout2
624         vaesenclast     $Xi,$inout3,$inout3
625         vaesenclast     $T2,$inout4,$inout4
626         vaesenclast     $Hkey,$inout5,$inout5
627         vmovups         $inout0,0x00($out)
628         vmovups         $inout1,0x10($out)
629         vmovups         $inout2,0x20($out)
630         vmovups         $inout3,0x30($out)
631         vmovups         $inout4,0x40($out)
632         vmovups         $inout5,0x50($out)
633         lea             0x60($out),$out
634
635         ret
636 .align  32
637 .Lhandle_ctr32_2:
            # Slow path for the counter: byte-swap it, derive the following
            # values with 32-bit adds, swap each back and xor in round key 0,
            # then join the round loop.
638         vpshufb         $Ii,$T1,$Z2             # byte-swap counter
639         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
640         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
641         vpaddd          $Z1,$Z2,$inout2
642         vpaddd          $Z1,$inout1,$inout3
643         vpshufb         $Ii,$inout1,$inout1
644         vpaddd          $Z1,$inout2,$inout4
645         vpshufb         $Ii,$inout2,$inout2
646         vpxor           $Z0,$inout1,$inout1
647         vpaddd          $Z1,$inout3,$inout5
648         vpshufb         $Ii,$inout3,$inout3
649         vpxor           $Z0,$inout2,$inout2
650         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
651         vpshufb         $Ii,$inout4,$inout4
652         vpxor           $Z0,$inout3,$inout3
653         vpshufb         $Ii,$inout5,$inout5
654         vpxor           $Z0,$inout4,$inout4
655         vpshufb         $Ii,$T1,$T1             # next counter value
656         vpxor           $Z0,$inout5,$inout5
657         jmp     .Loop_ctr32
658 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
659
660 .globl  aesni_gcm_encrypt
661 .type   aesni_gcm_encrypt,\@function,6
662 .align  32
663 aesni_gcm_encrypt:
664 .cfi_startproc
665         xor     $ret,$ret
666
667         # We call |_aesni_ctr32_6x| twice, each call consuming 96 bytes of
668         # input. Then we call |_aesni_ctr32_ghash_6x|, which requires at
669         # least 96 more bytes of input.
670         cmp     \$0x60*3,$len                   # minimal accepted length
671         jb      .Lgcm_enc_abort
672
673         lea     (%rsp),%rax                     # save stack pointer
674 .cfi_def_cfa_register   %rax
675         push    %rbx
676 .cfi_push       %rbx
677         push    %rbp
678 .cfi_push       %rbp
679         push    %r12
680 .cfi_push       %r12
681         push    %r13
682 .cfi_push       %r13
683         push    %r14
684 .cfi_push       %r14
685         push    %r15
686 .cfi_push       %r15
687 ___
688 $code.=<<___ if ($win64);
689         lea     -0xa8(%rsp),%rsp
690         movaps  %xmm6,-0xd8(%rax)
691         movaps  %xmm7,-0xc8(%rax)
692         movaps  %xmm8,-0xb8(%rax)
693         movaps  %xmm9,-0xa8(%rax)
694         movaps  %xmm10,-0x98(%rax)
695         movaps  %xmm11,-0x88(%rax)
696         movaps  %xmm12,-0x78(%rax)
697         movaps  %xmm13,-0x68(%rax)
698         movaps  %xmm14,-0x58(%rax)
699         movaps  %xmm15,-0x48(%rax)
700 .Lgcm_enc_body:
701 ___
702 $code.=<<___;
703         vzeroupper
704
705         vmovdqu         ($ivp),$T1              # input counter value
706         add             \$-128,%rsp
707         mov             12($ivp),$counter
708         lea             .Lbswap_mask(%rip),$const
709         lea             -0x80($key),$in0        # borrow $in0
710         mov             \$0xf80,$end0           # borrow $end0
711         lea             0x80($key),$key         # size optimization
712         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
713         and             \$-128,%rsp             # ensure stack alignment
714         mov             0xf0-0x80($key),$rounds
715
716         and             $end0,$in0
717         and             %rsp,$end0
718         sub             $in0,$end0
719         jc              .Lenc_no_key_aliasing
720         cmp             \$768,$end0
721         jnc             .Lenc_no_key_aliasing
722         sub             $end0,%rsp              # avoid aliasing with key
723 .Lenc_no_key_aliasing:
724
725         lea             ($out),$in0
726
727         # |_aesni_ctr32_ghash_6x| requires |$end0| to point to 2*96 (0xc0)
728         # bytes before the end of the input. Note, in particular, that this is
729         # correct even if |$len| is not an even multiple of 96 or 16. Unlike in
730         # the decryption case, there's no caveat that |$out| must not be near
731         # the very beginning of the address space, because we know that
732         # |$len| >= 3*96 from the check above, and so we know
733         # |$out| + |$len| >= 2*96 (0xc0).
734         lea             -0xc0($out,$len),$end0
735
736         shr             \$4,$len
737
738         call            _aesni_ctr32_6x
739         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
740         vpshufb         $Ii,$inout1,$T2
741         vmovdqu         $Xi,0x70(%rsp)
742         vpshufb         $Ii,$inout2,$Z0
743         vmovdqu         $T2,0x60(%rsp)
744         vpshufb         $Ii,$inout3,$Z1
745         vmovdqu         $Z0,0x50(%rsp)
746         vpshufb         $Ii,$inout4,$Z2
747         vmovdqu         $Z1,0x40(%rsp)
748         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
749         vmovdqu         $Z2,0x30(%rsp)
750
751         call            _aesni_ctr32_6x
752
753         vmovdqu         ($Xip),$Xi              # load Xi
754         lea             0x20+0x20($Xip),$Xip    # size optimization
755         sub             \$12,$len
756         mov             \$0x60*2,$ret
757         vpshufb         $Ii,$Xi,$Xi
758
759         call            _aesni_ctr32_ghash_6x
760         vmovdqu         0x20(%rsp),$Z3          # I[5]
761          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
762         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
763         vpunpckhqdq     $Z3,$Z3,$T1
764         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
765          vmovups        $inout0,-0x60($out)     # save output
766          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
767         vpxor           $Z3,$T1,$T1
768          vmovups        $inout1,-0x50($out)
769          vpshufb        $Ii,$inout1,$inout1
770          vmovups        $inout2,-0x40($out)
771          vpshufb        $Ii,$inout2,$inout2
772          vmovups        $inout3,-0x30($out)
773          vpshufb        $Ii,$inout3,$inout3
774          vmovups        $inout4,-0x20($out)
775          vpshufb        $Ii,$inout4,$inout4
776          vmovups        $inout5,-0x10($out)
777          vpshufb        $Ii,$inout5,$inout5
778          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
779 ___
780 { my ($HK,$T3)=($rndkey,$inout0);
781
782 $code.=<<___;
783          vmovdqu        0x30(%rsp),$Z2          # I[4]
784          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
785          vpunpckhqdq    $Z2,$Z2,$T2
786         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
787          vpxor          $Z2,$T2,$T2
788         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
789         vpclmulqdq      \$0x00,$HK,$T1,$T1
790
791          vmovdqu        0x40(%rsp),$T3          # I[3]
792         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
793          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
794         vpxor           $Z1,$Z0,$Z0
795          vpunpckhqdq    $T3,$T3,$Z1
796         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
797          vpxor          $T3,$Z1,$Z1
798         vpxor           $Z3,$Z2,$Z2
799         vpclmulqdq      \$0x10,$HK,$T2,$T2
800          vmovdqu        0x50-0x20($Xip),$HK
801         vpxor           $T1,$T2,$T2
802
803          vmovdqu        0x50(%rsp),$T1          # I[2]
804         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
805          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
806         vpxor           $Z0,$Z3,$Z3
807          vpunpckhqdq    $T1,$T1,$Z0
808         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
809          vpxor          $T1,$Z0,$Z0
810         vpxor           $Z2,$T3,$T3
811         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
812         vpxor           $T2,$Z1,$Z1
813
814          vmovdqu        0x60(%rsp),$T2          # I[1]
815         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
816          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
817         vpxor           $Z3,$Z2,$Z2
818          vpunpckhqdq    $T2,$T2,$Z3
819         vpclmulqdq      \$0x11,$Ii,$T1,$T1
820          vpxor          $T2,$Z3,$Z3
821         vpxor           $T3,$T1,$T1
822         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
823          vmovdqu        0x80-0x20($Xip),$HK
824         vpxor           $Z1,$Z0,$Z0
825
826          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
827         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
828          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
829          vpunpckhqdq    $Xi,$Xi,$T3
830         vpxor           $Z2,$Z1,$Z1
831         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
832          vpxor          $Xi,$T3,$T3
833         vpxor           $T1,$T2,$T2
834         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
835         vpxor           $Z0,$Z3,$Z0
836
837         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
838          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
839          vpunpckhqdq    $inout5,$inout5,$T1
840         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
841          vpxor          $inout5,$T1,$T1
842         vpxor           $Z1,$Z2,$Z1
843         vpclmulqdq      \$0x10,$HK,$T3,$T3
844          vmovdqu        0x20-0x20($Xip),$HK
845         vpxor           $T2,$Xi,$Z3
846         vpxor           $Z0,$T3,$Z2
847
848          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
849           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
850         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
851           vpxor         $T3,$Z2,$Z2
852          vpunpckhqdq    $inout4,$inout4,$T2
853         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
854          vpxor          $inout4,$T2,$T2
855           vpslldq       \$8,$Z2,$T3
856         vpclmulqdq      \$0x00,$HK,$T1,$T1
857           vpxor         $T3,$Z1,$Xi
858           vpsrldq       \$8,$Z2,$Z2
859           vpxor         $Z2,$Z3,$Z3
860
861         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
862          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
863         vpxor           $Z0,$Z1,$Z1
864          vpunpckhqdq    $inout3,$inout3,$T3
865         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
866          vpxor          $inout3,$T3,$T3
867         vpxor           $inout5,$inout4,$inout4
868           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
869         vpclmulqdq      \$0x10,$HK,$T2,$T2
870          vmovdqu        0x50-0x20($Xip),$HK
871         vpxor           $T1,$T2,$T2
872
873         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
874          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
875         vpxor           $Z1,$Z0,$Z0
876          vpunpckhqdq    $inout2,$inout2,$T1
877         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
878          vpxor          $inout2,$T1,$T1
879         vpxor           $inout4,$inout3,$inout3
880           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
881         vpclmulqdq      \$0x00,$HK,$T3,$T3
882         vpxor           $T2,$T3,$T3
883
884           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
885           vxorps        $inout5,$Xi,$Xi
886
887         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
888          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
889         vpxor           $Z0,$Z1,$Z1
890          vpunpckhqdq    $inout1,$inout1,$T2
891         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
892          vpxor          $inout1,$T2,$T2
893           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
894         vpxor           $inout3,$inout2,$inout2
895         vpclmulqdq      \$0x10,$HK,$T1,$T1
896          vmovdqu        0x80-0x20($Xip),$HK
897         vpxor           $T3,$T1,$T1
898
899           vxorps        $Z3,$inout5,$inout5
900           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
901           vxorps        $inout5,$Xi,$Xi
902
903         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
904          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
905         vpxor           $Z1,$Z0,$Z0
906          vpunpckhqdq    $Xi,$Xi,$T3
907         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
908          vpxor          $Xi,$T3,$T3
909         vpxor           $inout2,$inout1,$inout1
910         vpclmulqdq      \$0x00,$HK,$T2,$T2
911         vpxor           $T1,$T2,$T2
912
913         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
914         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
915         vpxor           $Z0,$Z1,$Z1
916         vpclmulqdq      \$0x10,$HK,$T3,$Z2
917         vpxor           $inout1,$Z3,$Z3
918         vpxor           $T2,$Z2,$Z2
919
920         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
921         vpxor           $Z0,$Z2,$Z2
922         vpslldq         \$8,$Z2,$T1
923         vmovdqu         0x10($const),$Hkey      # .Lpoly
924         vpsrldq         \$8,$Z2,$Z2
925         vpxor           $T1,$Z1,$Xi
926         vpxor           $Z2,$Z3,$Z3
927
928         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
929         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
930         vpxor           $T2,$Xi,$Xi
931
932         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
933         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
934         vpxor           $Z3,$T2,$T2
935         vpxor           $T2,$Xi,$Xi
936 ___
937 }
938 $code.=<<___;
939         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
940         vmovdqu         $Xi,-0x40($Xip)         # output Xi
941
942         vzeroupper
943 ___
944 $code.=<<___ if ($win64);
945         movaps  -0xd8(%rax),%xmm6
946         movaps  -0xc8(%rax),%xmm7
947         movaps  -0xb8(%rax),%xmm8
948         movaps  -0xa8(%rax),%xmm9
949         movaps  -0x98(%rax),%xmm10
950         movaps  -0x88(%rax),%xmm11
951         movaps  -0x78(%rax),%xmm12
952         movaps  -0x68(%rax),%xmm13
953         movaps  -0x58(%rax),%xmm14
954         movaps  -0x48(%rax),%xmm15
955 ___
956 $code.=<<___;
957         mov     -48(%rax),%r15
958 .cfi_restore    %r15
959         mov     -40(%rax),%r14
960 .cfi_restore    %r14
961         mov     -32(%rax),%r13
962 .cfi_restore    %r13
963         mov     -24(%rax),%r12
964 .cfi_restore    %r12
965         mov     -16(%rax),%rbp
966 .cfi_restore    %rbp
967         mov     -8(%rax),%rbx
968 .cfi_restore    %rbx
969         lea     (%rax),%rsp             # restore %rsp
970 .cfi_def_cfa_register   %rsp
971 .Lgcm_enc_abort:
972         mov     $ret,%rax               # return value
973         ret
974 .cfi_endproc
975 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
976 ___
977
# Constant pool for the GCM code.  The layout is position-sensitive: the code
# above keeps a pointer to .Lbswap_mask and addresses .Lpoly as 0x10 past it,
# so the order and the 16-byte size of each entry must not change.
978 $code.=<<___;
979 .align  64
980 .Lbswap_mask:
981         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0     # vpshufb control that reverses all 16 bytes
982 .Lpoly:
        # GHASH reduction constant 0xc2000000000000000000000000000001, stored
        # least significant byte first.  The reductions visible in this file
        # multiply by the upper qword only (vpclmulqdq selector 0x10); the low
        # byte is kept at 1 so the table holds the complete constant.
983         .byte   0x1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
984 .Lone_msb:
985         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1           # 1 in byte 15 only
986 .Ltwo_lsb:
987         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0           # 2 in byte 0 only
988 .Lone_lsb:
989         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0           # 1 in byte 0 only
        # NOTE(review): the three increments above are presumably the counter
        # steps used by the CTR helpers outside this chunk - confirm there.
990 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
991 .align  64
992 ___
# Win64 only: structured-exception handler for the two GCM entry points plus
# the .pdata/.xdata records that register it.  The handler is entered with
# the usual four arguments in %rcx, %rdx, %r8 and %r9; only the CONTEXT
# pointer (%r8) and the DISPATCHER_CONTEXT pointer (%r9) are referenced.
993 if ($win64) {
994 $rec="%rcx";        # arg1, not referenced by the handler below
995 $frame="%rdx";      # arg2, not referenced by the handler below
996 $context="%r8";     # arg3, CONTEXT record of the interrupted code
997 $disp="%r9";        # arg4, DISPATCHER_CONTEXT
998
999 $code.=<<___
1000 .extern __imp_RtlVirtualUnwind
1001 .type   gcm_se_handler,\@abi-omnipotent
1002 .align  16
1003 gcm_se_handler:
        # Picks the value that represents the frame base of the interrupted
        # routine and leaves it in %rax for .Lcommon_seh_tail:
        #   Rip below HandlerData[0]       -> context->Rax, nothing else restored
        #   Rip at or past HandlerData[1]  -> context->Rsp, nothing else restored
        #   Rip in between                 -> context->Rax, and the saved GPRs and
        #                                     xmm6-15 are copied into the CONTEXT
        # It then hands the updated CONTEXT to RtlVirtualUnwind and returns
        # ExceptionContinueSearch.
1004         push    %rsi                    # preserve every register modified below
1005         push    %rdi
1006         push    %rbx
1007         push    %rbp
1008         push    %r12
1009         push    %r13
1010         push    %r14
1011         push    %r15
1012         pushfq                          # flags too, cld below changes DF
1013         sub     \$64,%rsp               # 32-byte home area + stack arguments arg5-arg8
1014
1015         mov     120($context),%rax      # pull context->Rax
1016         mov     248($context),%rbx      # pull context->Rip
1017
1018         mov     8($disp),%rsi           # disp->ImageBase
1019         mov     56($disp),%r11          # disp->HandlerData
1020
1021         mov     0(%r11),%r10d           # HandlerData[0]
1022         lea     (%rsi,%r10),%r10        # prologue label
1023         cmp     %r10,%rbx               # context->Rip<prologue label
1024         jb      .Lcommon_seh_tail
1025
1026         mov     152($context),%rax      # pull context->Rsp
1027
1028         mov     4(%r11),%r10d           # HandlerData[1]
1029         lea     (%rsi,%r10),%r10        # epilogue label
1030         cmp     %r10,%rbx               # context->Rip>=epilogue label
1031         jae     .Lcommon_seh_tail
1032
1033         mov     120($context),%rax      # pull context->Rax
1034
        # Same slots, relative to the frame base, that the function epilogue
        # reads when it restores the callee-saved registers.
1035         mov     -48(%rax),%r15
1036         mov     -40(%rax),%r14
1037         mov     -32(%rax),%r13
1038         mov     -24(%rax),%r12
1039         mov     -16(%rax),%rbp
1040         mov     -8(%rax),%rbx
1041         mov     %r15,240($context)      # restore context->R15
1042         mov     %r14,232($context)      # restore context->R14
1043         mov     %r13,224($context)      # restore context->R13
1044         mov     %r12,216($context)      # restore context->R12
1045         mov     %rbp,160($context)      # restore context->Rbp
1046         mov     %rbx,144($context)      # restore context->Rbx
1047
1048         lea     -0xd8(%rax),%rsi        # %xmm save area
1049         lea     512($context),%rdi      # & context.Xmm6
1050         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1051         .long   0xa548f3fc              # cld; rep movsq
1052
1053 .Lcommon_seh_tail:
        # %rax is the frame base chosen above.  The two loads read the slots
        # just above it.  NOTE(review): presumably these are where the Win64
        # entry sequence parked rdi/rsi - that code is not visible here.
1054         mov     8(%rax),%rdi
1055         mov     16(%rax),%rsi
1056         mov     %rax,152($context)      # restore context->Rsp
1057         mov     %rsi,168($context)      # restore context->Rsi
1058         mov     %rdi,176($context)      # restore context->Rdi
1059
1060         mov     40($disp),%rdi          # disp->ContextRecord
1061         mov     $context,%rsi           # context
1062         mov     \$154,%ecx              # sizeof(CONTEXT)
1063         .long   0xa548f3fc              # cld; rep movsq
1064
1065         mov     $disp,%rsi              # keep the dispatcher pointer, r9 is reloaded as arg4
1066         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1067         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1068         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1069         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1070         mov     40(%rsi),%r10           # disp->ContextRecord
1071         lea     56(%rsi),%r11           # &disp->HandlerData
1072         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1073         mov     %r10,32(%rsp)           # arg5
1074         mov     %r11,40(%rsp)           # arg6
1075         mov     %r12,48(%rsp)           # arg7
1076         mov     %rcx,56(%rsp)           # arg8, (NULL)
1077         call    *__imp_RtlVirtualUnwind(%rip)   # indirect call through the import pointer
1078
1079         mov     \$1,%eax                # ExceptionContinueSearch
1080         add     \$64,%rsp               # drop the argument area
1081         popfq                           # restore in reverse order of the pushes
1082         pop     %r15
1083         pop     %r14
1084         pop     %r13
1085         pop     %r12
1086         pop     %rbp
1087         pop     %rbx
1088         pop     %rdi
1089         pop     %rsi
1090         ret
1091 .size   gcm_se_handler,.-gcm_se_handler
1092
        # Function table: one entry per routine, made of begin RVA, end RVA
        # and the RVA of its unwind information.
1093 .section        .pdata
1094 .align  4
1095         .rva    .LSEH_begin_aesni_gcm_decrypt
1096         .rva    .LSEH_end_aesni_gcm_decrypt
1097         .rva    .LSEH_gcm_dec_info
1098
1099         .rva    .LSEH_begin_aesni_gcm_encrypt
1100         .rva    .LSEH_end_aesni_gcm_encrypt
1101         .rva    .LSEH_gcm_enc_info
        # Unwind information: header byte 9 is version 1 with the
        # exception-handler flag set (1 | 1<<3), followed by zero prologue
        # size, zero unwind codes and no frame register.  The handler RVA
        # comes next, then the two RVAs the handler reads as HandlerData[0]
        # and HandlerData[1].
1102 .section        .xdata
1103 .align  8
1104 .LSEH_gcm_dec_info:
1105         .byte   9,0,0,0
1106         .rva    gcm_se_handler
1107         .rva    .Lgcm_dec_body,.Lgcm_dec_abort         # HandlerData[0], HandlerData[1]
1108 .LSEH_gcm_enc_info:
1109         .byte   9,0,0,0
1110         .rva    gcm_se_handler
1111         .rva    .Lgcm_enc_body,.Lgcm_enc_abort         # HandlerData[0], HandlerData[1]
1112 ___
1113 }
1114 }}} else {{{
# Fallback branch of a condition that lies outside this chunk.  It assigns to
# the output buffer instead of appending, so the generated module consists of
# nothing but these two stubs.  Each stub clears %eax and returns.
# NOTE(review): presumably a zero return tells the caller that no data was
# processed, so it falls back to another implementation - confirm at the
# call sites.
1115 $code=<<___;    # assembler is too old
1116 .text
1117
1118 .globl  aesni_gcm_encrypt
1119 .type   aesni_gcm_encrypt,\@abi-omnipotent
1120 aesni_gcm_encrypt:
1121         xor     %eax,%eax               # return 0
1122         ret
1123 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1124
1125 .globl  aesni_gcm_decrypt
1126 .type   aesni_gcm_decrypt,\@abi-omnipotent
1127 aesni_gcm_decrypt:
1128         xor     %eax,%eax               # return 0
1129         ret
1130 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1131 ___
1132 }}}
1133
# Replace every back-ticked expression in the generated text with the result
# of evaluating it as Perl, then write the text to STDOUT.
1134 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1135
1136 print $code;
1137
# close() is where buffered-write errors are reported and, for a handle opened
# on a pipe, where a failing child process shows up.  Abort on failure so a
# truncated or missing output file cannot pass for a successful run.
1138 close STDOUT or die "error closing STDOUT: $!";