Undo commit cd359b2
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such way that its
23 # performance is rather close to the sum of its streamed components,
24 # in the context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
39 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
40
41 $flavour = shift;
42 $output  = shift;
43 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
44
45 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
46
47 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
48 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
49 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
50 die "can't locate x86_64-xlate.pl";
51
52 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
53                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
54         $avx = ($1>=2.20) + ($1>=2.22);
55 }
56
57 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
58             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
59         $avx = ($1>=2.09) + ($1>=2.10);
60 }
61
62 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
63             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
64         $avx = ($1>=10) + ($1>=11);
65 }
66
67 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
68         $avx = ($2>=3.0) + ($2>3.0);
69 }
70
71 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
72 *STDOUT=*OUT;
73
74 if ($avx>1) {{{
75
76 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
77
78 ($Ii,$T1,$T2,$Hkey,
79  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
80
81 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
82
83 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
84
85 $code=<<___;
86 .text
87
88 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
89 .align  32
90 _aesni_ctr32_ghash_6x:
91         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
92         sub             \$6,$len
93         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
94         vmovdqu         0x00-0x80($key),$rndkey
95         vpaddb          $T2,$T1,$inout1
96         vpaddb          $T2,$inout1,$inout2
97         vpaddb          $T2,$inout2,$inout3
98         vpaddb          $T2,$inout3,$inout4
99         vpaddb          $T2,$inout4,$inout5
100         vpxor           $rndkey,$T1,$inout0
101         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
102         jmp             .Loop6x
103
104 .align  32
105 .Loop6x:
106         add             \$`6<<24`,$counter
107         jc              .Lhandle_ctr32          # discard $inout[1-5]?
108         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
109           vpaddb        $T2,$inout5,$T1         # next counter value
110           vpxor         $rndkey,$inout1,$inout1
111           vpxor         $rndkey,$inout2,$inout2
112
113 .Lresume_ctr32:
114         vmovdqu         $T1,($ivp)              # save next counter value
115         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
116           vpxor         $rndkey,$inout3,$inout3
117           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
118         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
119         xor             %r12,%r12
120         cmp             $in0,$end0
121
122           vaesenc       $T2,$inout0,$inout0
123         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
124           vpxor         $rndkey,$inout4,$inout4
125         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
126           vaesenc       $T2,$inout1,$inout1
127           vpxor         $rndkey,$inout5,$inout5
128         setnc           %r12b
129         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
130           vaesenc       $T2,$inout2,$inout2
131         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
132         neg             %r12
133           vaesenc       $T2,$inout3,$inout3
134          vpxor          $Z1,$Z2,$Z2
135         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
136          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
137           vaesenc       $T2,$inout4,$inout4
138          vpxor          $Z1,$T1,$Z0
139         and             \$0x60,%r12
140           vmovups       0x20-0x80($key),$rndkey
141         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
142           vaesenc       $T2,$inout5,$inout5
143
144         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
145         lea             ($in0,%r12),$in0
146           vaesenc       $rndkey,$inout0,$inout0
147          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
148         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
149          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
150           vaesenc       $rndkey,$inout1,$inout1
151         movbe           0x58($in0),%r13
152           vaesenc       $rndkey,$inout2,$inout2
153         movbe           0x50($in0),%r12
154           vaesenc       $rndkey,$inout3,$inout3
155         mov             %r13,0x20+8(%rsp)
156           vaesenc       $rndkey,$inout4,$inout4
157         mov             %r12,0x28+8(%rsp)
158         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
159           vaesenc       $rndkey,$inout5,$inout5
160
161           vmovups       0x30-0x80($key),$rndkey
162          vpxor          $T1,$Z2,$Z2
163         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
164           vaesenc       $rndkey,$inout0,$inout0
165          vpxor          $T2,$Z2,$Z2
166         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
167           vaesenc       $rndkey,$inout1,$inout1
168          vpxor          $Hkey,$Z3,$Z3
169         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
170           vaesenc       $rndkey,$inout2,$inout2
171         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
172          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
173           vaesenc       $rndkey,$inout3,$inout3
174           vaesenc       $rndkey,$inout4,$inout4
175          vpxor          $T1,$Z0,$Z0
176         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
177           vaesenc       $rndkey,$inout5,$inout5
178
179           vmovups       0x40-0x80($key),$rndkey
180          vpxor          $T2,$Z2,$Z2
181         vpclmulqdq      \$0x00,$T1,$Ii,$T2
182           vaesenc       $rndkey,$inout0,$inout0
183          vpxor          $Hkey,$Z2,$Z2
184         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
185           vaesenc       $rndkey,$inout1,$inout1
186         movbe           0x48($in0),%r13
187          vpxor          $Z1,$Z3,$Z3
188         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
189           vaesenc       $rndkey,$inout2,$inout2
190         movbe           0x40($in0),%r12
191         vpclmulqdq      \$0x11,$T1,$Ii,$T1
192          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
193           vaesenc       $rndkey,$inout3,$inout3
194         mov             %r13,0x30+8(%rsp)
195           vaesenc       $rndkey,$inout4,$inout4
196         mov             %r12,0x38+8(%rsp)
197          vpxor          $T2,$Z0,$Z0
198         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
199           vaesenc       $rndkey,$inout5,$inout5
200
201           vmovups       0x50-0x80($key),$rndkey
202          vpxor          $Hkey,$Z2,$Z2
203         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
204           vaesenc       $rndkey,$inout0,$inout0
205          vpxor          $Z1,$Z2,$Z2
206         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
207           vaesenc       $rndkey,$inout1,$inout1
208         movbe           0x38($in0),%r13
209          vpxor          $T1,$Z3,$Z3
210         vpclmulqdq      \$0x01,$T2,$Ii,$T1
211          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
212           vaesenc       $rndkey,$inout2,$inout2
213         movbe           0x30($in0),%r12
214         vpclmulqdq      \$0x11,$T2,$Ii,$T2
215           vaesenc       $rndkey,$inout3,$inout3
216         mov             %r13,0x40+8(%rsp)
217           vaesenc       $rndkey,$inout4,$inout4
218         mov             %r12,0x48+8(%rsp)
219          vpxor          $Hkey,$Z0,$Z0
220          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
221           vaesenc       $rndkey,$inout5,$inout5
222
223           vmovups       0x60-0x80($key),$rndkey
224          vpxor          $Z1,$Z2,$Z2
225         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
226           vaesenc       $rndkey,$inout0,$inout0
227          vpxor          $T1,$Z2,$Z2
228         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
229           vaesenc       $rndkey,$inout1,$inout1
230         movbe           0x28($in0),%r13
231          vpxor          $T2,$Z3,$Z3
232         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
233           vaesenc       $rndkey,$inout2,$inout2
234         movbe           0x20($in0),%r12
235         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
236           vaesenc       $rndkey,$inout3,$inout3
237         mov             %r13,0x50+8(%rsp)
238           vaesenc       $rndkey,$inout4,$inout4
239         mov             %r12,0x58+8(%rsp)
240         vpxor           $Z1,$Z2,$Z2
241           vaesenc       $rndkey,$inout5,$inout5
242         vpxor           $T1,$Z2,$Z2
243
244           vmovups       0x70-0x80($key),$rndkey
245         vpslldq         \$8,$Z2,$Z1
246         vpxor           $T2,$Z0,$Z0
247         vmovdqu         0x10($const),$Hkey      # .Lpoly
248
249           vaesenc       $rndkey,$inout0,$inout0
250         vpxor           $Xi,$Z3,$Z3
251           vaesenc       $rndkey,$inout1,$inout1
252         vpxor           $Z1,$Z0,$Z0
253         movbe           0x18($in0),%r13
254           vaesenc       $rndkey,$inout2,$inout2
255         movbe           0x10($in0),%r12
256         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
257         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
258         mov             %r13,0x60+8(%rsp)
259           vaesenc       $rndkey,$inout3,$inout3
260         mov             %r12,0x68+8(%rsp)
261           vaesenc       $rndkey,$inout4,$inout4
262           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
263           vaesenc       $rndkey,$inout5,$inout5
264
265           vaesenc       $T1,$inout0,$inout0
266           vmovups       0x90-0x80($key),$rndkey
267           vaesenc       $T1,$inout1,$inout1
268         vpsrldq         \$8,$Z2,$Z2
269           vaesenc       $T1,$inout2,$inout2
270         vpxor           $Z2,$Z3,$Z3
271           vaesenc       $T1,$inout3,$inout3
272         vpxor           $Ii,$Z0,$Z0
273         movbe           0x08($in0),%r13
274           vaesenc       $T1,$inout4,$inout4
275         movbe           0x00($in0),%r12
276           vaesenc       $T1,$inout5,$inout5
277           vmovups       0xa0-0x80($key),$T1
278           cmp           \$11,$rounds
279           jb            .Lenc_tail              # 128-bit key
280
281           vaesenc       $rndkey,$inout0,$inout0
282           vaesenc       $rndkey,$inout1,$inout1
283           vaesenc       $rndkey,$inout2,$inout2
284           vaesenc       $rndkey,$inout3,$inout3
285           vaesenc       $rndkey,$inout4,$inout4
286           vaesenc       $rndkey,$inout5,$inout5
287
288           vaesenc       $T1,$inout0,$inout0
289           vaesenc       $T1,$inout1,$inout1
290           vaesenc       $T1,$inout2,$inout2
291           vaesenc       $T1,$inout3,$inout3
292           vaesenc       $T1,$inout4,$inout4
293           vmovups       0xb0-0x80($key),$rndkey
294           vaesenc       $T1,$inout5,$inout5
295           vmovups       0xc0-0x80($key),$T1
296           je            .Lenc_tail              # 192-bit key
297
298           vaesenc       $rndkey,$inout0,$inout0
299           vaesenc       $rndkey,$inout1,$inout1
300           vaesenc       $rndkey,$inout2,$inout2
301           vaesenc       $rndkey,$inout3,$inout3
302           vaesenc       $rndkey,$inout4,$inout4
303           vaesenc       $rndkey,$inout5,$inout5
304
305           vaesenc       $T1,$inout0,$inout0
306           vaesenc       $T1,$inout1,$inout1
307           vaesenc       $T1,$inout2,$inout2
308           vaesenc       $T1,$inout3,$inout3
309           vaesenc       $T1,$inout4,$inout4
310           vmovups       0xd0-0x80($key),$rndkey
311           vaesenc       $T1,$inout5,$inout5
312           vmovups       0xe0-0x80($key),$T1
313           jmp           .Lenc_tail              # 256-bit key
314
315 .align  32
316 .Lhandle_ctr32:
317         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
318           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
319           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
320           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
321           vpaddd        $Z1,$Z2,$inout2
322         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
323           vpaddd        $Z1,$inout1,$inout3
324           vpshufb       $Ii,$inout1,$inout1
325           vpaddd        $Z1,$inout2,$inout4
326           vpshufb       $Ii,$inout2,$inout2
327           vpxor         $rndkey,$inout1,$inout1
328           vpaddd        $Z1,$inout3,$inout5
329           vpshufb       $Ii,$inout3,$inout3
330           vpxor         $rndkey,$inout2,$inout2
331           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
332           vpshufb       $Ii,$inout4,$inout4
333           vpshufb       $Ii,$inout5,$inout5
334           vpshufb       $Ii,$T1,$T1             # next counter value
335         jmp             .Lresume_ctr32
336
337 .align  32
338 .Lenc_tail:
339           vaesenc       $rndkey,$inout0,$inout0
340         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
341         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
342           vaesenc       $rndkey,$inout1,$inout1
343         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
344           vpxor         0x00($inp),$T1,$T2
345           vaesenc       $rndkey,$inout2,$inout2
346           vpxor         0x10($inp),$T1,$Ii
347           vaesenc       $rndkey,$inout3,$inout3
348           vpxor         0x20($inp),$T1,$Z1
349           vaesenc       $rndkey,$inout4,$inout4
350           vpxor         0x30($inp),$T1,$Z2
351           vaesenc       $rndkey,$inout5,$inout5
352           vpxor         0x40($inp),$T1,$Z3
353           vpxor         0x50($inp),$T1,$Hkey
354           vmovdqu       ($ivp),$T1              # load next counter value
355
356           vaesenclast   $T2,$inout0,$inout0
357           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
358           vaesenclast   $Ii,$inout1,$inout1
359          vpaddb         $T2,$T1,$Ii
360         mov             %r13,0x70+8(%rsp)
361         lea             0x60($inp),$inp
362           vaesenclast   $Z1,$inout2,$inout2
363          vpaddb         $T2,$Ii,$Z1
364         mov             %r12,0x78+8(%rsp)
365         lea             0x60($out),$out
366           vmovdqu       0x00-0x80($key),$rndkey
367           vaesenclast   $Z2,$inout3,$inout3
368          vpaddb         $T2,$Z1,$Z2
369           vaesenclast   $Z3, $inout4,$inout4
370          vpaddb         $T2,$Z2,$Z3
371           vaesenclast   $Hkey,$inout5,$inout5
372          vpaddb         $T2,$Z3,$Hkey
373
374         add             \$0x60,$ret
375         sub             \$0x6,$len
376         jc              .L6x_done
377
378           vmovups       $inout0,-0x60($out)     # save output
379          vpxor          $rndkey,$T1,$inout0
380           vmovups       $inout1,-0x50($out)
381          vmovdqa        $Ii,$inout1             # 0 latency
382           vmovups       $inout2,-0x40($out)
383          vmovdqa        $Z1,$inout2             # 0 latency
384           vmovups       $inout3,-0x30($out)
385          vmovdqa        $Z2,$inout3             # 0 latency
386           vmovups       $inout4,-0x20($out)
387          vmovdqa        $Z3,$inout4             # 0 latency
388           vmovups       $inout5,-0x10($out)
389          vmovdqa        $Hkey,$inout5           # 0 latency
390         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
391         jmp             .Loop6x
392
393 .L6x_done:
394         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
395         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
396
397         ret
398 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
399 ___
400 ######################################################################
401 #
402 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
403 #               const AES_KEY *key, unsigned char iv[16],
404 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
405 $code.=<<___;
406 .globl  aesni_gcm_decrypt
407 .type   aesni_gcm_decrypt,\@function,6
408 .align  32
409 aesni_gcm_decrypt:
410 .cfi_startproc
411         xor     $ret,$ret
412         cmp     \$0x60,$len                     # minimal accepted length
413         jb      .Lgcm_dec_abort
414
415         lea     (%rsp),%rax                     # save stack pointer
416 .cfi_def_cfa_register   %rax
417         push    %rbx
418 .cfi_push       %rbx
419         push    %rbp
420 .cfi_push       %rbp
421         push    %r12
422 .cfi_push       %r12
423         push    %r13
424 .cfi_push       %r13
425         push    %r14
426 .cfi_push       %r14
427         push    %r15
428 .cfi_push       %r15
429 ___
430 $code.=<<___ if ($win64);
431         lea     -0xa8(%rsp),%rsp
432         movaps  %xmm6,-0xd8(%rax)
433         movaps  %xmm7,-0xc8(%rax)
434         movaps  %xmm8,-0xb8(%rax)
435         movaps  %xmm9,-0xa8(%rax)
436         movaps  %xmm10,-0x98(%rax)
437         movaps  %xmm11,-0x88(%rax)
438         movaps  %xmm12,-0x78(%rax)
439         movaps  %xmm13,-0x68(%rax)
440         movaps  %xmm14,-0x58(%rax)
441         movaps  %xmm15,-0x48(%rax)
442 .Lgcm_dec_body:
443 ___
444 $code.=<<___;
445         vzeroupper
446
447         vmovdqu         ($ivp),$T1              # input counter value
448         add             \$-128,%rsp
449         mov             12($ivp),$counter
450         lea             .Lbswap_mask(%rip),$const
451         lea             -0x80($key),$in0        # borrow $in0
452         mov             \$0xf80,$end0           # borrow $end0
453         vmovdqu         ($Xip),$Xi              # load Xi
454         and             \$-128,%rsp             # ensure stack alignment
455         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
456         lea             0x80($key),$key         # size optimization
457         lea             0x20+0x20($Xip),$Xip    # size optimization
458         mov             0xf0-0x80($key),$rounds
459         vpshufb         $Ii,$Xi,$Xi
460
461         and             $end0,$in0
462         and             %rsp,$end0
463         sub             $in0,$end0
464         jc              .Ldec_no_key_aliasing
465         cmp             \$768,$end0
466         jnc             .Ldec_no_key_aliasing
467         sub             $end0,%rsp              # avoid aliasing with key
468 .Ldec_no_key_aliasing:
469
470         vmovdqu         0x50($inp),$Z3          # I[5]
471         lea             ($inp),$in0
472         vmovdqu         0x40($inp),$Z0
473         lea             -0xc0($inp,$len),$end0
474         vmovdqu         0x30($inp),$Z1
475         shr             \$4,$len
476         xor             $ret,$ret
477         vmovdqu         0x20($inp),$Z2
478          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
479         vmovdqu         0x10($inp),$T2
480          vpshufb        $Ii,$Z0,$Z0
481         vmovdqu         ($inp),$Hkey
482          vpshufb        $Ii,$Z1,$Z1
483         vmovdqu         $Z0,0x30(%rsp)
484          vpshufb        $Ii,$Z2,$Z2
485         vmovdqu         $Z1,0x40(%rsp)
486          vpshufb        $Ii,$T2,$T2
487         vmovdqu         $Z2,0x50(%rsp)
488          vpshufb        $Ii,$Hkey,$Hkey
489         vmovdqu         $T2,0x60(%rsp)
490         vmovdqu         $Hkey,0x70(%rsp)
491
492         call            _aesni_ctr32_ghash_6x
493
494         vmovups         $inout0,-0x60($out)     # save output
495         vmovups         $inout1,-0x50($out)
496         vmovups         $inout2,-0x40($out)
497         vmovups         $inout3,-0x30($out)
498         vmovups         $inout4,-0x20($out)
499         vmovups         $inout5,-0x10($out)
500
501         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
502         vmovdqu         $Xi,-0x40($Xip)         # output Xi
503
504         vzeroupper
505 ___
506 $code.=<<___ if ($win64);
507         movaps  -0xd8(%rax),%xmm6
508         movaps  -0xc8(%rax),%xmm7
509         movaps  -0xb8(%rax),%xmm8
510         movaps  -0xa8(%rax),%xmm9
511         movaps  -0x98(%rax),%xmm10
512         movaps  -0x88(%rax),%xmm11
513         movaps  -0x78(%rax),%xmm12
514         movaps  -0x68(%rax),%xmm13
515         movaps  -0x58(%rax),%xmm14
516         movaps  -0x48(%rax),%xmm15
517 ___
518 $code.=<<___;
519         mov     -48(%rax),%r15
520 .cfi_restore    %r15
521         mov     -40(%rax),%r14
522 .cfi_restore    %r14
523         mov     -32(%rax),%r13
524 .cfi_restore    %r13
525         mov     -24(%rax),%r12
526 .cfi_restore    %r12
527         mov     -16(%rax),%rbp
528 .cfi_restore    %rbp
529         mov     -8(%rax),%rbx
530 .cfi_restore    %rbx
531         lea     (%rax),%rsp             # restore %rsp
532 .cfi_def_cfa_register   %rsp
533 .Lgcm_dec_abort:
534         mov     $ret,%rax               # return value
535         ret
536 .cfi_endproc
537 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
538 ___
539
540 $code.=<<___;
541 .type   _aesni_ctr32_6x,\@abi-omnipotent
542 .align  32
543 _aesni_ctr32_6x:
544         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
545         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
546         lea             -1($rounds),%r13
547         vmovups         0x10-0x80($key),$rndkey
548         lea             0x20-0x80($key),%r12
549         vpxor           $Z0,$T1,$inout0
550         add             \$`6<<24`,$counter
551         jc              .Lhandle_ctr32_2
552         vpaddb          $T2,$T1,$inout1
553         vpaddb          $T2,$inout1,$inout2
554         vpxor           $Z0,$inout1,$inout1
555         vpaddb          $T2,$inout2,$inout3
556         vpxor           $Z0,$inout2,$inout2
557         vpaddb          $T2,$inout3,$inout4
558         vpxor           $Z0,$inout3,$inout3
559         vpaddb          $T2,$inout4,$inout5
560         vpxor           $Z0,$inout4,$inout4
561         vpaddb          $T2,$inout5,$T1
562         vpxor           $Z0,$inout5,$inout5
563         jmp             .Loop_ctr32
564
565 .align  16
566 .Loop_ctr32:
567         vaesenc         $rndkey,$inout0,$inout0
568         vaesenc         $rndkey,$inout1,$inout1
569         vaesenc         $rndkey,$inout2,$inout2
570         vaesenc         $rndkey,$inout3,$inout3
571         vaesenc         $rndkey,$inout4,$inout4
572         vaesenc         $rndkey,$inout5,$inout5
573         vmovups         (%r12),$rndkey
574         lea             0x10(%r12),%r12
575         dec             %r13d
576         jnz             .Loop_ctr32
577
578         vmovdqu         (%r12),$Hkey            # last round key
579         vaesenc         $rndkey,$inout0,$inout0
580         vpxor           0x00($inp),$Hkey,$Z0
581         vaesenc         $rndkey,$inout1,$inout1
582         vpxor           0x10($inp),$Hkey,$Z1
583         vaesenc         $rndkey,$inout2,$inout2
584         vpxor           0x20($inp),$Hkey,$Z2
585         vaesenc         $rndkey,$inout3,$inout3
586         vpxor           0x30($inp),$Hkey,$Xi
587         vaesenc         $rndkey,$inout4,$inout4
588         vpxor           0x40($inp),$Hkey,$T2
589         vaesenc         $rndkey,$inout5,$inout5
590         vpxor           0x50($inp),$Hkey,$Hkey
591         lea             0x60($inp),$inp
592
593         vaesenclast     $Z0,$inout0,$inout0
594         vaesenclast     $Z1,$inout1,$inout1
595         vaesenclast     $Z2,$inout2,$inout2
596         vaesenclast     $Xi,$inout3,$inout3
597         vaesenclast     $T2,$inout4,$inout4
598         vaesenclast     $Hkey,$inout5,$inout5
599         vmovups         $inout0,0x00($out)
600         vmovups         $inout1,0x10($out)
601         vmovups         $inout2,0x20($out)
602         vmovups         $inout3,0x30($out)
603         vmovups         $inout4,0x40($out)
604         vmovups         $inout5,0x50($out)
605         lea             0x60($out),$out
606
607         ret
608 .align  32
609 .Lhandle_ctr32_2:
610         vpshufb         $Ii,$T1,$Z2             # byte-swap counter
611         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
612         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
613         vpaddd          $Z1,$Z2,$inout2
614         vpaddd          $Z1,$inout1,$inout3
615         vpshufb         $Ii,$inout1,$inout1
616         vpaddd          $Z1,$inout2,$inout4
617         vpshufb         $Ii,$inout2,$inout2
618         vpxor           $Z0,$inout1,$inout1
619         vpaddd          $Z1,$inout3,$inout5
620         vpshufb         $Ii,$inout3,$inout3
621         vpxor           $Z0,$inout2,$inout2
622         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
623         vpshufb         $Ii,$inout4,$inout4
624         vpxor           $Z0,$inout3,$inout3
625         vpshufb         $Ii,$inout5,$inout5
626         vpxor           $Z0,$inout4,$inout4
627         vpshufb         $Ii,$T1,$T1             # next counter value
628         vpxor           $Z0,$inout5,$inout5
629         jmp     .Loop_ctr32
630 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
631
632 .globl  aesni_gcm_encrypt
633 .type   aesni_gcm_encrypt,\@function,6
634 .align  32
635 aesni_gcm_encrypt:
636 .cfi_startproc
637         xor     $ret,$ret
638         cmp     \$0x60*3,$len                   # minimal accepted length
639         jb      .Lgcm_enc_abort
640
641         lea     (%rsp),%rax                     # save stack pointer
642 .cfi_def_cfa_register   %rax
643         push    %rbx
644 .cfi_push       %rbx
645         push    %rbp
646 .cfi_push       %rbp
647         push    %r12
648 .cfi_push       %r12
649         push    %r13
650 .cfi_push       %r13
651         push    %r14
652 .cfi_push       %r14
653         push    %r15
654 .cfi_push       %r15
655 ___
656 $code.=<<___ if ($win64);
657         lea     -0xa8(%rsp),%rsp
658         movaps  %xmm6,-0xd8(%rax)
659         movaps  %xmm7,-0xc8(%rax)
660         movaps  %xmm8,-0xb8(%rax)
661         movaps  %xmm9,-0xa8(%rax)
662         movaps  %xmm10,-0x98(%rax)
663         movaps  %xmm11,-0x88(%rax)
664         movaps  %xmm12,-0x78(%rax)
665         movaps  %xmm13,-0x68(%rax)
666         movaps  %xmm14,-0x58(%rax)
667         movaps  %xmm15,-0x48(%rax)
668 .Lgcm_enc_body:
669 ___
670 $code.=<<___;
671         vzeroupper
672
673         vmovdqu         ($ivp),$T1              # input counter value
674         add             \$-128,%rsp
675         mov             12($ivp),$counter
676         lea             .Lbswap_mask(%rip),$const
677         lea             -0x80($key),$in0        # borrow $in0
678         mov             \$0xf80,$end0           # borrow $end0
679         lea             0x80($key),$key         # size optimization
680         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
681         and             \$-128,%rsp             # ensure stack alignment
682         mov             0xf0-0x80($key),$rounds
683
684         and             $end0,$in0
685         and             %rsp,$end0
686         sub             $in0,$end0
687         jc              .Lenc_no_key_aliasing
688         cmp             \$768,$end0
689         jnc             .Lenc_no_key_aliasing
690         sub             $end0,%rsp              # avoid aliasing with key
691 .Lenc_no_key_aliasing:
692
693         lea             ($out),$in0
694         lea             -0xc0($out,$len),$end0
695         shr             \$4,$len
696
697         call            _aesni_ctr32_6x
698         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
699         vpshufb         $Ii,$inout1,$T2
700         vmovdqu         $Xi,0x70(%rsp)
701         vpshufb         $Ii,$inout2,$Z0
702         vmovdqu         $T2,0x60(%rsp)
703         vpshufb         $Ii,$inout3,$Z1
704         vmovdqu         $Z0,0x50(%rsp)
705         vpshufb         $Ii,$inout4,$Z2
706         vmovdqu         $Z1,0x40(%rsp)
707         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
708         vmovdqu         $Z2,0x30(%rsp)
709
710         call            _aesni_ctr32_6x
711
712         vmovdqu         ($Xip),$Xi              # load Xi
713         lea             0x20+0x20($Xip),$Xip    # size optimization
714         sub             \$12,$len
715         mov             \$0x60*2,$ret
716         vpshufb         $Ii,$Xi,$Xi
717
718         call            _aesni_ctr32_ghash_6x
719         vmovdqu         0x20(%rsp),$Z3          # I[5]
720          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
721         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
722         vpunpckhqdq     $Z3,$Z3,$T1
723         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
724          vmovups        $inout0,-0x60($out)     # save output
725          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
726         vpxor           $Z3,$T1,$T1
727          vmovups        $inout1,-0x50($out)
728          vpshufb        $Ii,$inout1,$inout1
729          vmovups        $inout2,-0x40($out)
730          vpshufb        $Ii,$inout2,$inout2
731          vmovups        $inout3,-0x30($out)
732          vpshufb        $Ii,$inout3,$inout3
733          vmovups        $inout4,-0x20($out)
734          vpshufb        $Ii,$inout4,$inout4
735          vmovups        $inout5,-0x10($out)
736          vpshufb        $Ii,$inout5,$inout5
737          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
738 ___
739 { my ($HK,$T3)=($rndkey,$inout0);
740
741 $code.=<<___;
742          vmovdqu        0x30(%rsp),$Z2          # I[4]
743          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
744          vpunpckhqdq    $Z2,$Z2,$T2
745         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
746          vpxor          $Z2,$T2,$T2
747         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
748         vpclmulqdq      \$0x00,$HK,$T1,$T1
749
750          vmovdqu        0x40(%rsp),$T3          # I[3]
751         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
752          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
753         vpxor           $Z1,$Z0,$Z0
754          vpunpckhqdq    $T3,$T3,$Z1
755         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
756          vpxor          $T3,$Z1,$Z1
757         vpxor           $Z3,$Z2,$Z2
758         vpclmulqdq      \$0x10,$HK,$T2,$T2
759          vmovdqu        0x50-0x20($Xip),$HK
760         vpxor           $T1,$T2,$T2
761
762          vmovdqu        0x50(%rsp),$T1          # I[2]
763         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
764          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
765         vpxor           $Z0,$Z3,$Z3
766          vpunpckhqdq    $T1,$T1,$Z0
767         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
768          vpxor          $T1,$Z0,$Z0
769         vpxor           $Z2,$T3,$T3
770         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
771         vpxor           $T2,$Z1,$Z1
772
773          vmovdqu        0x60(%rsp),$T2          # I[1]
774         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
775          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
776         vpxor           $Z3,$Z2,$Z2
777          vpunpckhqdq    $T2,$T2,$Z3
778         vpclmulqdq      \$0x11,$Ii,$T1,$T1
779          vpxor          $T2,$Z3,$Z3
780         vpxor           $T3,$T1,$T1
781         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
782          vmovdqu        0x80-0x20($Xip),$HK
783         vpxor           $Z1,$Z0,$Z0
784
785          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
786         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
787          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
788          vpunpckhqdq    $Xi,$Xi,$T3
789         vpxor           $Z2,$Z1,$Z1
790         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
791          vpxor          $Xi,$T3,$T3
792         vpxor           $T1,$T2,$T2
793         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
794         vpxor           $Z0,$Z3,$Z0
795
796         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
797          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
798          vpunpckhqdq    $inout5,$inout5,$T1
799         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
800          vpxor          $inout5,$T1,$T1
801         vpxor           $Z1,$Z2,$Z1
802         vpclmulqdq      \$0x10,$HK,$T3,$T3
803          vmovdqu        0x20-0x20($Xip),$HK
804         vpxor           $T2,$Xi,$Z3
805         vpxor           $Z0,$T3,$Z2
806
807          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
808           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
809         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
810           vpxor         $T3,$Z2,$Z2
811          vpunpckhqdq    $inout4,$inout4,$T2
812         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
813          vpxor          $inout4,$T2,$T2
814           vpslldq       \$8,$Z2,$T3
815         vpclmulqdq      \$0x00,$HK,$T1,$T1
816           vpxor         $T3,$Z1,$Xi
817           vpsrldq       \$8,$Z2,$Z2
818           vpxor         $Z2,$Z3,$Z3
819
820         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
821          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
822         vpxor           $Z0,$Z1,$Z1
823          vpunpckhqdq    $inout3,$inout3,$T3
824         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
825          vpxor          $inout3,$T3,$T3
826         vpxor           $inout5,$inout4,$inout4
827           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
828         vpclmulqdq      \$0x10,$HK,$T2,$T2
829          vmovdqu        0x50-0x20($Xip),$HK
830         vpxor           $T1,$T2,$T2
831
832         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
833          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
834         vpxor           $Z1,$Z0,$Z0
835          vpunpckhqdq    $inout2,$inout2,$T1
836         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
837          vpxor          $inout2,$T1,$T1
838         vpxor           $inout4,$inout3,$inout3
839           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
840         vpclmulqdq      \$0x00,$HK,$T3,$T3
841         vpxor           $T2,$T3,$T3
842
843           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
844           vxorps        $inout5,$Xi,$Xi
845
846         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
847          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
848         vpxor           $Z0,$Z1,$Z1
849          vpunpckhqdq    $inout1,$inout1,$T2
850         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
851          vpxor          $inout1,$T2,$T2
852           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
853         vpxor           $inout3,$inout2,$inout2
854         vpclmulqdq      \$0x10,$HK,$T1,$T1
855          vmovdqu        0x80-0x20($Xip),$HK
856         vpxor           $T3,$T1,$T1
857
858           vxorps        $Z3,$inout5,$inout5
859           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
860           vxorps        $inout5,$Xi,$Xi
861
862         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
863          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
864         vpxor           $Z1,$Z0,$Z0
865          vpunpckhqdq    $Xi,$Xi,$T3
866         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
867          vpxor          $Xi,$T3,$T3
868         vpxor           $inout2,$inout1,$inout1
869         vpclmulqdq      \$0x00,$HK,$T2,$T2
870         vpxor           $T1,$T2,$T2
871
872         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
873         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
874         vpxor           $Z0,$Z1,$Z1
875         vpclmulqdq      \$0x10,$HK,$T3,$Z2
876         vpxor           $inout1,$Z3,$Z3
877         vpxor           $T2,$Z2,$Z2
878
879         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
880         vpxor           $Z0,$Z2,$Z2
881         vpslldq         \$8,$Z2,$T1
882         vmovdqu         0x10($const),$Hkey      # .Lpoly
883         vpsrldq         \$8,$Z2,$Z2
884         vpxor           $T1,$Z1,$Xi
885         vpxor           $Z2,$Z3,$Z3
886
887         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
888         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
889         vpxor           $T2,$Xi,$Xi
890
891         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
892         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
893         vpxor           $Z3,$T2,$T2
894         vpxor           $T2,$Xi,$Xi
895 ___
896 }
897 $code.=<<___;
898         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
899         vmovdqu         $Xi,-0x40($Xip)         # output Xi
900
901         vzeroupper
902 ___
903 $code.=<<___ if ($win64);
904         movaps  -0xd8(%rax),%xmm6
905         movaps  -0xc8(%rax),%xmm7
906         movaps  -0xb8(%rax),%xmm8
907         movaps  -0xa8(%rax),%xmm9
908         movaps  -0x98(%rax),%xmm10
909         movaps  -0x88(%rax),%xmm11
910         movaps  -0x78(%rax),%xmm12
911         movaps  -0x68(%rax),%xmm13
912         movaps  -0x58(%rax),%xmm14
913         movaps  -0x48(%rax),%xmm15
914 ___
915 $code.=<<___;
916         mov     -48(%rax),%r15
917 .cfi_restore    %r15
918         mov     -40(%rax),%r14
919 .cfi_restore    %r14
920         mov     -32(%rax),%r13
921 .cfi_restore    %r13
922         mov     -24(%rax),%r12
923 .cfi_restore    %r12
924         mov     -16(%rax),%rbp
925 .cfi_restore    %rbp
926         mov     -8(%rax),%rbx
927 .cfi_restore    %rbx
928         lea     (%rax),%rsp             # restore %rsp
929 .cfi_def_cfa_register   %rsp
930 .Lgcm_enc_abort:
931         mov     $ret,%rax               # return value
932         ret
933 .cfi_endproc
934 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
935 ___
936
937 $code.=<<___;
938 .align  64
939 .Lbswap_mask:
940         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
941 .Lpoly:
942         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
943 .Lone_msb:
944         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
945 .Ltwo_lsb:
946         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
947 .Lone_lsb:
948         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
949 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
950 .align  64
951 ___
952 if ($win64) {
953 $rec="%rcx";
954 $frame="%rdx";
955 $context="%r8";
956 $disp="%r9";
957
958 $code.=<<___
959 .extern __imp_RtlVirtualUnwind
960 .type   gcm_se_handler,\@abi-omnipotent
961 .align  16
962 gcm_se_handler:
963         push    %rsi
964         push    %rdi
965         push    %rbx
966         push    %rbp
967         push    %r12
968         push    %r13
969         push    %r14
970         push    %r15
971         pushfq
972         sub     \$64,%rsp
973
974         mov     120($context),%rax      # pull context->Rax
975         mov     248($context),%rbx      # pull context->Rip
976
977         mov     8($disp),%rsi           # disp->ImageBase
978         mov     56($disp),%r11          # disp->HandlerData
979
980         mov     0(%r11),%r10d           # HandlerData[0]
981         lea     (%rsi,%r10),%r10        # prologue label
982         cmp     %r10,%rbx               # context->Rip<prologue label
983         jb      .Lcommon_seh_tail
984
985         mov     152($context),%rax      # pull context->Rsp
986
987         mov     4(%r11),%r10d           # HandlerData[1]
988         lea     (%rsi,%r10),%r10        # epilogue label
989         cmp     %r10,%rbx               # context->Rip>=epilogue label
990         jae     .Lcommon_seh_tail
991
992         mov     120($context),%rax      # pull context->Rax
993
994         mov     -48(%rax),%r15
995         mov     -40(%rax),%r14
996         mov     -32(%rax),%r13
997         mov     -24(%rax),%r12
998         mov     -16(%rax),%rbp
999         mov     -8(%rax),%rbx
1000         mov     %r15,240($context)
1001         mov     %r14,232($context)
1002         mov     %r13,224($context)
1003         mov     %r12,216($context)
1004         mov     %rbp,160($context)
1005         mov     %rbx,144($context)
1006
1007         lea     -0xd8(%rax),%rsi        # %xmm save area
1008         lea     512($context),%rdi      # & context.Xmm6
1009         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1010         .long   0xa548f3fc              # cld; rep movsq
1011
1012 .Lcommon_seh_tail:
1013         mov     8(%rax),%rdi
1014         mov     16(%rax),%rsi
1015         mov     %rax,152($context)      # restore context->Rsp
1016         mov     %rsi,168($context)      # restore context->Rsi
1017         mov     %rdi,176($context)      # restore context->Rdi
1018
1019         mov     40($disp),%rdi          # disp->ContextRecord
1020         mov     $context,%rsi           # context
1021         mov     \$154,%ecx              # sizeof(CONTEXT)
1022         .long   0xa548f3fc              # cld; rep movsq
1023
1024         mov     $disp,%rsi
1025         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1026         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1027         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1028         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1029         mov     40(%rsi),%r10           # disp->ContextRecord
1030         lea     56(%rsi),%r11           # &disp->HandlerData
1031         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1032         mov     %r10,32(%rsp)           # arg5
1033         mov     %r11,40(%rsp)           # arg6
1034         mov     %r12,48(%rsp)           # arg7
1035         mov     %rcx,56(%rsp)           # arg8, (NULL)
1036         call    *__imp_RtlVirtualUnwind(%rip)
1037
1038         mov     \$1,%eax                # ExceptionContinueSearch
1039         add     \$64,%rsp
1040         popfq
1041         pop     %r15
1042         pop     %r14
1043         pop     %r13
1044         pop     %r12
1045         pop     %rbp
1046         pop     %rbx
1047         pop     %rdi
1048         pop     %rsi
1049         ret
1050 .size   gcm_se_handler,.-gcm_se_handler
1051
1052 .section        .pdata
1053 .align  4
1054         .rva    .LSEH_begin_aesni_gcm_decrypt
1055         .rva    .LSEH_end_aesni_gcm_decrypt
1056         .rva    .LSEH_gcm_dec_info
1057
1058         .rva    .LSEH_begin_aesni_gcm_encrypt
1059         .rva    .LSEH_end_aesni_gcm_encrypt
1060         .rva    .LSEH_gcm_enc_info
1061 .section        .xdata
1062 .align  8
1063 .LSEH_gcm_dec_info:
1064         .byte   9,0,0,0
1065         .rva    gcm_se_handler
1066         .rva    .Lgcm_dec_body,.Lgcm_dec_abort
1067 .LSEH_gcm_enc_info:
1068         .byte   9,0,0,0
1069         .rva    gcm_se_handler
1070         .rva    .Lgcm_enc_body,.Lgcm_enc_abort
1071 ___
1072 }
1073 }}} else {{{
1074 $code=<<___;    # assembler is too old
1075 .text
1076
1077 .globl  aesni_gcm_encrypt
1078 .type   aesni_gcm_encrypt,\@abi-omnipotent
1079 aesni_gcm_encrypt:
1080         xor     %eax,%eax
1081         ret
1082 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1083
1084 .globl  aesni_gcm_decrypt
1085 .type   aesni_gcm_decrypt,\@abi-omnipotent
1086 aesni_gcm_decrypt:
1087         xor     %eax,%eax
1088         ret
1089 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1090 ___
1091 }}}
1092
1093 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1094
1095 print $code;
1096
1097 close STDOUT;