Following the license change, modify the boilerplates in crypto/modes/
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such way that its
23 # performance is rather close to the sum of its streamed components,
24 # in the context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
39 #
40 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
41 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
42
# First argument is the perlasm "flavour" (elf, macosx, mingw64, nasm,
# masm, ...), second is the output file name; a single argument that
# contains a dot is taken as the output file instead.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Windows targets need different prologue/epilogue handling below.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64 perlasm translator either next to this script or in
# the top-level perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the toolchain for AVX capability.  $avx ends up 0, 1 or 2; the
# stitched AES-NI+GHASH code in this module is only emitted when $avx>1
# (see the "if ($avx>1)" guard below).
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.20) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

# Pipe all generated code through the translator.  Check the result of
# open: a silent failure here would otherwise produce an empty output
# file and a confusing downstream build break.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
75
76 if ($avx>1) {{{
77
# Argument registers, SysV AMD64 order: input ptr, output ptr, length,
# AES key schedule, IV/counter ptr, GHASH context (Xi,H,Htbl) ptr.
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# GHASH working set in %xmm0-8: borrowed temporaries ($Ii,$T1,$T2),
# current hash-key power ($Hkey), partial products/accumulators
# ($Z0-$Z3) and the running hash value ($Xi).
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# Six parallel AES-CTR lanes plus the broadcast round key in %xmm9-15.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# Scalar registers: low counter word, AES round count, byte count
# processed (return value), constants base, input cursor, input end.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
86
######################################################################
# _aesni_ctr32_ghash_6x
#
# Inner loop of the stitched mode: each iteration encrypts six AES-CTR
# blocks while folding six previously buffered (byte-swapped) blocks
# into the GHASH accumulator $Xi.  Counter blocks are advanced with
# vpaddb against .Lone_msb; when the low 32-bit counter word overflows
# (add of 6<<24 to $counter sets CF) the .Lhandle_ctr32 path redoes the
# increments with byte-swapped vpaddd arithmetic.  Buffered input lives
# at 16+8(%rsp)..0x78+8(%rsp); $ret is advanced by 0x60 per iteration
# and %r12/%r13 are clobbered as movbe staging registers.  $in0 stops
# advancing once it reaches $end0 (the setnc/neg/and dance), so the
# final iterations re-read the tail block.  Branch targets assume the
# caller set up $const, $key, $Xip, $ivp, $rounds and $counter as in
# aesni_gcm_{en,de}crypt below.
######################################################################
$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3, $inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# aesni_gcm_decrypt: returns the number of bytes processed in %rax (0
# if $len < 0x60, i.e. fewer than six blocks).  Saves all callee-saved
# GPRs (and xmm6-15 on Win64), aligns %rsp to 128, buffers six
# byte-swapped ciphertext blocks on the stack and runs the stitched
# inner loop above.  The "no_key_aliasing" dance grows the stack
# further if the 128-byte-aligned frame would land within 768 bytes of
# the key schedule (cache-line aliasing avoidance).
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
.cfi_startproc
	xor	$ret,$ret
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 ABI: xmm6-15 are callee-saved, spill them below the GPR saves.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0
	lea		-0xc0($inp,$len),$end0
	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64: restore the spilled xmm registers before unwinding GPRs.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common epilogue: GPRs were pushed below the saved %rsp held in %rax.
$code.=<<___;
	mov	-48(%rax),%r15
.cfi_restore	%r15
	mov	-40(%rax),%r14
.cfi_restore	%r14
	mov	-32(%rax),%r13
.cfi_restore	%r13
	mov	-24(%rax),%r12
.cfi_restore	%r12
	mov	-16(%rax),%rbp
.cfi_restore	%rbp
	mov	-8(%rax),%rbx
.cfi_restore	%rbx
	lea	(%rax),%rsp		# restore %rsp
.cfi_def_cfa_register	%rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.cfi_endproc
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
541
######################################################################
# _aesni_ctr32_6x: encrypt six AES-CTR blocks (no GHASH) — used by
# aesni_gcm_encrypt to prime the pipeline with 2x6 blocks whose
# ciphertext is then hashed by _aesni_ctr32_ghash_6x.  Counter
# overflow of the low byte-swapped 32-bit word (CF from adding 6<<24
# to $counter) is handled by .Lhandle_ctr32_2 with vpaddd arithmetic.
# Clobbers %r12/%r13.
$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
.cfi_startproc
	xor	$ret,$ret
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax			# save stack pointer
.cfi_def_cfa_register	%rax
	push	%rbx
.cfi_push	%rbx
	push	%rbp
.cfi_push	%rbp
	push	%r12
.cfi_push	%r12
	push	%r13
.cfi_push	%r13
	push	%r14
.cfi_push	%r14
	push	%r15
.cfi_push	%r15
___
# Win64 ABI: spill callee-saved xmm6-15 (same frame layout as decrypt).
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
# Encrypt path: run _aesni_ctr32_6x twice to produce 12 blocks of
# ciphertext (buffered byte-swapped on the stack), then enter the
# stitched loop which hashes ciphertext as it is produced.  Note that
# here $in0/$end0 track the OUTPUT buffer, since GHASH consumes
# ciphertext on the encrypt side.
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea		($out),$in0
	lea		-0xc0($out,$len),$end0
	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	vmovdqu		($Xip),$Xi		# load Xi
	lea		0x20+0x20($Xip),$Xip	# size optimization
	sub		\$12,$len
	mov		\$0x60*2,$ret
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
741 { my ($HK,$T3)=($rndkey,$inout0);       # alias regs for the final-reduction phase
742 
# GHASH the final 12 ciphertext blocks (6 saved bswapped on the stack, 6 still
# in $inout0..5) into $Xi: two aggregated Karatsuba passes interleaved with
# loads of $Hkey^1..^6, followed by the two-phase reduction modulo the GCM
# polynomial (.Lpoly, reached via 0x10($const)).
743 $code.=<<___;
744          vmovdqu        0x30(%rsp),$Z2          # I[4]
745          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
746          vpunpckhqdq    $Z2,$Z2,$T2
747         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
748          vpxor          $Z2,$T2,$T2
749         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
750         vpclmulqdq      \$0x00,$HK,$T1,$T1
751
752          vmovdqu        0x40(%rsp),$T3          # I[3]
753         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
754          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
755         vpxor           $Z1,$Z0,$Z0
756          vpunpckhqdq    $T3,$T3,$Z1
757         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
758          vpxor          $T3,$Z1,$Z1
759         vpxor           $Z3,$Z2,$Z2
760         vpclmulqdq      \$0x10,$HK,$T2,$T2
761          vmovdqu        0x50-0x20($Xip),$HK
762         vpxor           $T1,$T2,$T2
763
764          vmovdqu        0x50(%rsp),$T1          # I[2]
765         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
766          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
767         vpxor           $Z0,$Z3,$Z3
768          vpunpckhqdq    $T1,$T1,$Z0
769         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
770          vpxor          $T1,$Z0,$Z0
771         vpxor           $Z2,$T3,$T3
772         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
773         vpxor           $T2,$Z1,$Z1
774
775          vmovdqu        0x60(%rsp),$T2          # I[1]
776         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
777          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
778         vpxor           $Z3,$Z2,$Z2
779          vpunpckhqdq    $T2,$T2,$Z3
780         vpclmulqdq      \$0x11,$Ii,$T1,$T1
781          vpxor          $T2,$Z3,$Z3
782         vpxor           $T3,$T1,$T1
783         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
784          vmovdqu        0x80-0x20($Xip),$HK
785         vpxor           $Z1,$Z0,$Z0
786
787          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
788         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
789          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
790          vpunpckhqdq    $Xi,$Xi,$T3
791         vpxor           $Z2,$Z1,$Z1
792         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
793          vpxor          $Xi,$T3,$T3
794         vpxor           $T1,$T2,$T2
795         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
796         vpxor           $Z0,$Z3,$Z0
797
798         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
799          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
800          vpunpckhqdq    $inout5,$inout5,$T1
801         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
802          vpxor          $inout5,$T1,$T1
803         vpxor           $Z1,$Z2,$Z1
804         vpclmulqdq      \$0x10,$HK,$T3,$T3
805          vmovdqu        0x20-0x20($Xip),$HK
806         vpxor           $T2,$Xi,$Z3
807         vpxor           $Z0,$T3,$Z2
808
809          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
810           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
811         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
812           vpxor         $T3,$Z2,$Z2
813          vpunpckhqdq    $inout4,$inout4,$T2
814         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
815          vpxor          $inout4,$T2,$T2
816           vpslldq       \$8,$Z2,$T3
817         vpclmulqdq      \$0x00,$HK,$T1,$T1
818           vpxor         $T3,$Z1,$Xi
819           vpsrldq       \$8,$Z2,$Z2
820           vpxor         $Z2,$Z3,$Z3
821
822         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
823          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
824         vpxor           $Z0,$Z1,$Z1
825          vpunpckhqdq    $inout3,$inout3,$T3
826         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
827          vpxor          $inout3,$T3,$T3
828         vpxor           $inout5,$inout4,$inout4
829           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
830         vpclmulqdq      \$0x10,$HK,$T2,$T2
831          vmovdqu        0x50-0x20($Xip),$HK
832         vpxor           $T1,$T2,$T2
833
834         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
835          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
836         vpxor           $Z1,$Z0,$Z0
837          vpunpckhqdq    $inout2,$inout2,$T1
838         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
839          vpxor          $inout2,$T1,$T1
840         vpxor           $inout4,$inout3,$inout3
841           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
842         vpclmulqdq      \$0x00,$HK,$T3,$T3
843         vpxor           $T2,$T3,$T3
844
845           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
846           vxorps        $inout5,$Xi,$Xi
847
848         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
849          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
850         vpxor           $Z0,$Z1,$Z1
851          vpunpckhqdq    $inout1,$inout1,$T2
852         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
853          vpxor          $inout1,$T2,$T2
854           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
855         vpxor           $inout3,$inout2,$inout2
856         vpclmulqdq      \$0x10,$HK,$T1,$T1
857          vmovdqu        0x80-0x20($Xip),$HK
858         vpxor           $T3,$T1,$T1
859
860           vxorps        $Z3,$inout5,$inout5
861           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
862           vxorps        $inout5,$Xi,$Xi
863
864         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
865          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
866         vpxor           $Z1,$Z0,$Z0
867          vpunpckhqdq    $Xi,$Xi,$T3
868         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
869          vpxor          $Xi,$T3,$T3
870         vpxor           $inout2,$inout1,$inout1
871         vpclmulqdq      \$0x00,$HK,$T2,$T2
872         vpxor           $T1,$T2,$T2
873
874         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
875         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
876         vpxor           $Z0,$Z1,$Z1
877         vpclmulqdq      \$0x10,$HK,$T3,$Z2
878         vpxor           $inout1,$Z3,$Z3
879         vpxor           $T2,$Z2,$Z2
880
881         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
882         vpxor           $Z0,$Z2,$Z2
883         vpslldq         \$8,$Z2,$T1
884         vmovdqu         0x10($const),$Hkey      # .Lpoly
885         vpsrldq         \$8,$Z2,$Z2
886         vpxor           $T1,$Z1,$Xi
887         vpxor           $Z2,$Z3,$Z3
888
889         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
890         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
891         vpxor           $T2,$Xi,$Xi
892
893         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
894         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
895         vpxor           $Z3,$T2,$T2
896         vpxor           $T2,$Xi,$Xi
897 ___
898 }
899 $code.=<<___;
900         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
901         vmovdqu         $Xi,-0x40($Xip)         # output Xi
902
903         vzeroupper                              # avoid AVX->SSE transition penalty in caller
904 ___
905 $code.=<<___ if ($win64);
906         movaps  -0xd8(%rax),%xmm6               # restore Win64 non-volatile xmm6-15
907         movaps  -0xc8(%rax),%xmm7
908         movaps  -0xb8(%rax),%xmm8
909         movaps  -0xa8(%rax),%xmm9
910         movaps  -0x98(%rax),%xmm10
911         movaps  -0x88(%rax),%xmm11
912         movaps  -0x78(%rax),%xmm12
913         movaps  -0x68(%rax),%xmm13
914         movaps  -0x58(%rax),%xmm14
915         movaps  -0x48(%rax),%xmm15
916 ___
917 $code.=<<___;
918         mov     -48(%rax),%r15                  # restore callee-saved GPRs
919 .cfi_restore    %r15
920         mov     -40(%rax),%r14
921 .cfi_restore    %r14
922         mov     -32(%rax),%r13
923 .cfi_restore    %r13
924         mov     -24(%rax),%r12
925 .cfi_restore    %r12
926         mov     -16(%rax),%rbp
927 .cfi_restore    %rbp
928         mov     -8(%rax),%rbx
929 .cfi_restore    %rbx
930         lea     (%rax),%rsp             # restore %rsp
931 .cfi_def_cfa_register   %rsp
932 .Lgcm_enc_abort:
933         mov     $ret,%rax               # return value
934         ret
935 .cfi_endproc
936 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
937 ___
938
# Constant tables shared by the encrypt and decrypt paths.
939 $code.=<<___;
940 .align  64
941 .Lbswap_mask:                           # shuffle control for big<->little endian swap
942         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
943 .Lpoly:                                 # GHASH reduction constant, 0xc2 in top byte
944         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
945 .Lone_msb:                              # big-endian counter increment of 1
946         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
947 .Ltwo_lsb:                              # little-endian increment of 2
948         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
949 .Lone_lsb:                              # little-endian increment of 1
950         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
951 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
952 .align  64
953 ___
954 if ($win64) {                           # Win64 structured-exception-handling glue
955 $rec="%rcx";
956 $frame="%rdx";
957 $context="%r8";
958 $disp="%r9";
959
960 $code.=<<___
961 .extern __imp_RtlVirtualUnwind
962 .type   gcm_se_handler,\@abi-omnipotent
963 .align  16
964 gcm_se_handler:
965         push    %rsi
966         push    %rdi
967         push    %rbx
968         push    %rbp
969         push    %r12
970         push    %r13
971         push    %r14
972         push    %r15
973         pushfq
974         sub     \$64,%rsp
975
976         mov     120($context),%rax      # pull context->Rax
977         mov     248($context),%rbx      # pull context->Rip
978
979         mov     8($disp),%rsi           # disp->ImageBase
980         mov     56($disp),%r11          # disp->HandlerData
981
982         mov     0(%r11),%r10d           # HandlerData[0]
983         lea     (%rsi,%r10),%r10        # prologue label
984         cmp     %r10,%rbx               # context->Rip<prologue label
985         jb      .Lcommon_seh_tail
986
987         mov     152($context),%rax      # pull context->Rsp
988
989         mov     4(%r11),%r10d           # HandlerData[1]
990         lea     (%rsi,%r10),%r10        # epilogue label
991         cmp     %r10,%rbx               # context->Rip>=epilogue label
992         jae     .Lcommon_seh_tail
993
994         mov     120($context),%rax      # pull context->Rax
995
996         mov     -48(%rax),%r15          # recover callee-saved regs from frame
997         mov     -40(%rax),%r14
998         mov     -32(%rax),%r13
999         mov     -24(%rax),%r12
1000         mov     -16(%rax),%rbp
1001         mov     -8(%rax),%rbx
1002         mov     %r15,240($context)     # and reflect them into the CONTEXT record
1003         mov     %r14,232($context)
1004         mov     %r13,224($context)
1005         mov     %r12,216($context)
1006         mov     %rbp,160($context)
1007         mov     %rbx,144($context)
1008
1009         lea     -0xd8(%rax),%rsi        # %xmm save area
1010         lea     512($context),%rdi      # & context.Xmm6
1011         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1012         .long   0xa548f3fc              # cld; rep movsq
1013
1014 .Lcommon_seh_tail:
1015         mov     8(%rax),%rdi
1016         mov     16(%rax),%rsi
1017         mov     %rax,152($context)      # restore context->Rsp
1018         mov     %rsi,168($context)      # restore context->Rsi
1019         mov     %rdi,176($context)      # restore context->Rdi
1020
1021         mov     40($disp),%rdi          # disp->ContextRecord
1022         mov     $context,%rsi           # context
1023         mov     \$154,%ecx              # sizeof(CONTEXT)
1024         .long   0xa548f3fc              # cld; rep movsq
1025
1026         mov     $disp,%rsi
1027         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1028         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1029         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1030         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1031         mov     40(%rsi),%r10           # disp->ContextRecord
1032         lea     56(%rsi),%r11           # &disp->HandlerData
1033         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1034         mov     %r10,32(%rsp)           # arg5
1035         mov     %r11,40(%rsp)           # arg6
1036         mov     %r12,48(%rsp)           # arg7
1037         mov     %rcx,56(%rsp)           # arg8, (NULL)
1038         call    *__imp_RtlVirtualUnwind(%rip)
1039
1040         mov     \$1,%eax                # ExceptionContinueSearch
1041         add     \$64,%rsp
1042         popfq
1043         pop     %r15
1044         pop     %r14
1045         pop     %r13
1046         pop     %r12
1047         pop     %rbp
1048         pop     %rbx
1049         pop     %rdi
1050         pop     %rsi
1051         ret
1052 .size   gcm_se_handler,.-gcm_se_handler
1053
1054 .section        .pdata
1055 .align  4
1056         .rva    .LSEH_begin_aesni_gcm_decrypt
1057         .rva    .LSEH_end_aesni_gcm_decrypt
1058         .rva    .LSEH_gcm_dec_info
1059
1060         .rva    .LSEH_begin_aesni_gcm_encrypt
1061         .rva    .LSEH_end_aesni_gcm_encrypt
1062         .rva    .LSEH_gcm_enc_info
1063 .section        .xdata
1064 .align  8
1065 .LSEH_gcm_dec_info:
1066         .byte   9,0,0,0
1067         .rva    gcm_se_handler
1068         .rva    .Lgcm_dec_body,.Lgcm_dec_abort
1069 .LSEH_gcm_enc_info:
1070         .byte   9,0,0,0
1071         .rva    gcm_se_handler
1072         .rva    .Lgcm_enc_body,.Lgcm_enc_abort
1073 ___
1074 }
1075 }}} else {{{                            # assembler lacks AVX/PCLMULQDQ support
1076 $code=<<___;    # assembler is too old
1077 .text
1078
1079 .globl  aesni_gcm_encrypt
1080 .type   aesni_gcm_encrypt,\@abi-omnipotent
1081 aesni_gcm_encrypt:
1082         xor     %eax,%eax               # return 0: nothing processed
1083         ret
1084 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1085
1086 .globl  aesni_gcm_decrypt
1087 .type   aesni_gcm_decrypt,\@abi-omnipotent
1088 aesni_gcm_decrypt:
1089         xor     %eax,%eax               # return 0: nothing processed
1090         ret
1091 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1092 ___
1093 }}}
1094
1095 # Expand `...` constructs (compile-time arithmetic) and emit the assembly.
1095 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1096 
1097 print $code;
1098 
1099 # Check the close: a failed flush (full disk, broken pipe) would otherwise
1099 # silently yield truncated assembly output.
1099 close STDOUT or die "error closing STDOUT: $!";