aes/asm/aesni-x86_64.pl: further optimization for Atom Silvermont.
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 #
11 # AES-NI-CTR+GHASH stitch.
12 #
13 # February 2013
14 #
15 # OpenSSL GCM implementation is organized in such a way that its
16 # performance is rather close to the sum of its streamed components,
17 # in the context parallelized AES-NI CTR and modulo-scheduled
18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19 # was observed to perform significantly better than the sum of the
20 # components on contemporary CPUs, the effort was deemed impossible to
21 # justify. This module is based on combination of Intel submissions,
22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23 # Locktyukhin of Intel Corp. who verified that it reduces shuffle
24 # pressure with notable relative improvement, achieving 1.0 cycle per
25 # byte processed with 128-bit key on Haswell processor.
26 #
27 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
28 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
29
30 $flavour = shift;       # first argument: perlasm flavour, or the output file when it contains a dot
31 $output  = shift;       # second argument: output file name
32 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
33 # Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an .asm output name.
34 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
35 # Locate the x86_64-xlate.pl translator next to this script or under ../../perlasm/.
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39 die "can't locate x86_64-xlate.pl";
40 # Probe the assembler: $avx ends up unset/0, 1 or 2; the code below is only generated when $avx>1.
41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43         $avx = ($1>=2.19) + ($1>=2.22);
44 }
45 # NOTE(review): versions are compared as decimal numbers, so "2.9" would rank above "2.19".
46 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48         $avx = ($1>=2.09) + ($1>=2.10);
49 }
50
51 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
53         $avx = ($1>=10) + ($1>=11);
54 }
55 # Send everything printed to STDOUT through the translator; die if the pipe cannot be opened.
56 open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!";
57 *STDOUT=*OUT;
58
59 if ($avx>1) {{{
60
61 ($inp,$out,$len,$key,$ivp,$Xip)=map("%$_",qw(rdi rsi rdx rcx r8 r9));   # function arguments, in order
62
63 ($Ii,$T1,$T2,$Hkey)   = map("%xmm$_",(0..3));
64 ($Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(4..8));
65
66 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5) = map("%xmm$_",(9..14)); $rndkey = "%xmm15";
67
68 ($counter,$rounds)=("%ebx","%ebp"); ($ret,$const,$in0,$end0)=map("%r$_",(10,11,14,15));
69
70 $code=<<___;    # stitched 6-block AES-CTR + GHASH helper, called by both public entry points
71 .text
72 # In: $T1=counter block, $Xi=hash, $Z3=I[5], $len=blocks left; I[4..0] sit at 0x30..0x70 above the caller %rsp.
73 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
74 .align  32
75 _aesni_ctr32_ghash_6x:
76         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
77         sub             \$6,$len                # pre-subtract one batch; the loop ends when the sub at .Lenc_tail borrows
78         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
79         vmovdqu         0x00-0x80($key),$rndkey
80         vpaddb          $T2,$T1,$inout1         # $inout1 = $T1 + .Lone_msb, and so on up to $inout5
81         vpaddb          $T2,$inout1,$inout2
82         vpaddb          $T2,$inout2,$inout3
83         vpaddb          $T2,$inout3,$inout4
84         vpaddb          $T2,$inout4,$inout5
85         vpxor           $rndkey,$T1,$inout0     # xor the first round key into block 0
86         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
87         jmp             .Loop6x
88
89 .align  32
90 .Loop6x:
91         add             \$6<<24,$counter        # +6 in the top byte, i.e. byte 15 of the counter block
92         jc              .Lhandle_ctr32          # discard $inout[1-5]?
93         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
94           vpaddb        $T2,$inout5,$T1         # next counter value
95           vpxor         $rndkey,$inout1,$inout1
96           vpxor         $rndkey,$inout2,$inout2
97 # Stack offsets carry an extra +8 here because the call pushed a return address below the caller's slots.
98 .Lresume_ctr32:
99         vmovdqu         $T1,($ivp)              # save next counter value
100         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
101           vpxor         $rndkey,$inout3,$inout3
102           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
103         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
104         xor             %r12,%r12
105         cmp             $in0,$end0              # CF stays clear while $in0 has not passed $end0
106
107           vaesenc       $T2,$inout0,$inout0
108         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
109           vpxor         $rndkey,$inout4,$inout4
110         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
111           vaesenc       $T2,$inout1,$inout1
112           vpxor         $rndkey,$inout5,$inout5
113         setnc           %r12b                   # 1 if the cmp above did not borrow
114         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
115           vaesenc       $T2,$inout2,$inout2
116         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
117         neg             %r12                    # 1 -> all ones, 0 stays 0
118           vaesenc       $T2,$inout3,$inout3
119          vpxor          $Z1,$Z2,$Z2
120         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
121          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
122           vaesenc       $T2,$inout4,$inout4
123          vpxor          $Z1,$T1,$Z0
124         and             \$0x60,%r12             # step = 0x60 or 0
125           vmovups       0x20-0x80($key),$rndkey
126         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
127           vaesenc       $T2,$inout5,$inout5
128
129         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
130         lea             ($in0,%r12),$in0        # advance $in0 by one batch unless it is already past $end0
131           vaesenc       $rndkey,$inout0,$inout0
132          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
133         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
134          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
135           vaesenc       $rndkey,$inout1,$inout1
136         movbe           0x58($in0),%r13         # byte-reversed load of bytes 0x58..0x5f at $in0
137           vaesenc       $rndkey,$inout2,$inout2
138         movbe           0x50($in0),%r12
139           vaesenc       $rndkey,$inout3,$inout3
140         mov             %r13,0x20+8(%rsp)
141           vaesenc       $rndkey,$inout4,$inout4
142         mov             %r12,0x28+8(%rsp)       # slot 0x20 now holds the byte-swapped block at 0x50($in0)
143         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
144           vaesenc       $rndkey,$inout5,$inout5
145
146           vmovups       0x30-0x80($key),$rndkey
147          vpxor          $T1,$Z2,$Z2
148         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
149           vaesenc       $rndkey,$inout0,$inout0
150          vpxor          $T2,$Z2,$Z2
151         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
152           vaesenc       $rndkey,$inout1,$inout1
153          vpxor          $Hkey,$Z3,$Z3
154         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
155           vaesenc       $rndkey,$inout2,$inout2
156         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
157          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
158           vaesenc       $rndkey,$inout3,$inout3
159           vaesenc       $rndkey,$inout4,$inout4
160          vpxor          $T1,$Z0,$Z0
161         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
162           vaesenc       $rndkey,$inout5,$inout5
163
164           vmovups       0x40-0x80($key),$rndkey
165          vpxor          $T2,$Z2,$Z2
166         vpclmulqdq      \$0x00,$T1,$Ii,$T2
167           vaesenc       $rndkey,$inout0,$inout0
168          vpxor          $Hkey,$Z2,$Z2
169         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
170           vaesenc       $rndkey,$inout1,$inout1
171         movbe           0x48($in0),%r13
172          vpxor          $Z1,$Z3,$Z3
173         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
174           vaesenc       $rndkey,$inout2,$inout2
175         movbe           0x40($in0),%r12
176         vpclmulqdq      \$0x11,$T1,$Ii,$T1
177          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
178           vaesenc       $rndkey,$inout3,$inout3
179         mov             %r13,0x30+8(%rsp)
180           vaesenc       $rndkey,$inout4,$inout4
181         mov             %r12,0x38+8(%rsp)
182          vpxor          $T2,$Z0,$Z0
183         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
184           vaesenc       $rndkey,$inout5,$inout5
185
186           vmovups       0x50-0x80($key),$rndkey
187          vpxor          $Hkey,$Z2,$Z2
188         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
189           vaesenc       $rndkey,$inout0,$inout0
190          vpxor          $Z1,$Z2,$Z2
191         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
192           vaesenc       $rndkey,$inout1,$inout1
193         movbe           0x38($in0),%r13
194          vpxor          $T1,$Z3,$Z3
195         vpclmulqdq      \$0x01,$T2,$Ii,$T1
196          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
197           vaesenc       $rndkey,$inout2,$inout2
198         movbe           0x30($in0),%r12
199         vpclmulqdq      \$0x11,$T2,$Ii,$T2
200           vaesenc       $rndkey,$inout3,$inout3
201         mov             %r13,0x40+8(%rsp)
202           vaesenc       $rndkey,$inout4,$inout4
203         mov             %r12,0x48+8(%rsp)
204          vpxor          $Hkey,$Z0,$Z0
205          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
206           vaesenc       $rndkey,$inout5,$inout5
207
208           vmovups       0x60-0x80($key),$rndkey
209          vpxor          $Z1,$Z2,$Z2
210         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
211           vaesenc       $rndkey,$inout0,$inout0
212          vpxor          $T1,$Z2,$Z2
213         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
214           vaesenc       $rndkey,$inout1,$inout1
215         movbe           0x28($in0),%r13
216          vpxor          $T2,$Z3,$Z3
217         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
218           vaesenc       $rndkey,$inout2,$inout2
219         movbe           0x20($in0),%r12
220         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
221           vaesenc       $rndkey,$inout3,$inout3
222         mov             %r13,0x50+8(%rsp)
223           vaesenc       $rndkey,$inout4,$inout4
224         mov             %r12,0x58+8(%rsp)
225         vpxor           $Z1,$Z2,$Z2
226           vaesenc       $rndkey,$inout5,$inout5
227         vpxor           $T1,$Z2,$Z2
228
229           vmovups       0x70-0x80($key),$rndkey
230         vpslldq         \$8,$Z2,$Z1
231         vpxor           $T2,$Z0,$Z0
232         vmovdqu         0x10($const),$Hkey      # .Lpoly
233
234           vaesenc       $rndkey,$inout0,$inout0
235         vpxor           $Xi,$Z3,$Z3
236           vaesenc       $rndkey,$inout1,$inout1
237         vpxor           $Z1,$Z0,$Z0
238         movbe           0x18($in0),%r13
239           vaesenc       $rndkey,$inout2,$inout2
240         movbe           0x10($in0),%r12
241         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
242         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
243         mov             %r13,0x60+8(%rsp)
244           vaesenc       $rndkey,$inout3,$inout3
245         mov             %r12,0x68+8(%rsp)
246           vaesenc       $rndkey,$inout4,$inout4
247           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
248           vaesenc       $rndkey,$inout5,$inout5
249
250           vaesenc       $T1,$inout0,$inout0
251           vmovups       0x90-0x80($key),$rndkey
252           vaesenc       $T1,$inout1,$inout1
253         vpsrldq         \$8,$Z2,$Z2
254           vaesenc       $T1,$inout2,$inout2
255         vpxor           $Z2,$Z3,$Z3
256           vaesenc       $T1,$inout3,$inout3
257         vpxor           $Ii,$Z0,$Z0
258         movbe           0x08($in0),%r13
259           vaesenc       $T1,$inout4,$inout4
260         movbe           0x00($in0),%r12
261           vaesenc       $T1,$inout5,$inout5
262           vmovups       0xa0-0x80($key),$T1
263           cmp           \$11,$rounds            # nothing below touches flags, so the later je reuses this compare
264           jb            .Lenc_tail              # 128-bit key
265
266           vaesenc       $rndkey,$inout0,$inout0
267           vaesenc       $rndkey,$inout1,$inout1
268           vaesenc       $rndkey,$inout2,$inout2
269           vaesenc       $rndkey,$inout3,$inout3
270           vaesenc       $rndkey,$inout4,$inout4
271           vaesenc       $rndkey,$inout5,$inout5
272
273           vaesenc       $T1,$inout0,$inout0
274           vaesenc       $T1,$inout1,$inout1
275           vaesenc       $T1,$inout2,$inout2
276           vaesenc       $T1,$inout3,$inout3
277           vaesenc       $T1,$inout4,$inout4
278           vmovups       0xb0-0x80($key),$rndkey
279           vaesenc       $T1,$inout5,$inout5
280           vmovups       0xc0-0x80($key),$T1
281           je            .Lenc_tail              # 192-bit key
282
283           vaesenc       $rndkey,$inout0,$inout0
284           vaesenc       $rndkey,$inout1,$inout1
285           vaesenc       $rndkey,$inout2,$inout2
286           vaesenc       $rndkey,$inout3,$inout3
287           vaesenc       $rndkey,$inout4,$inout4
288           vaesenc       $rndkey,$inout5,$inout5
289
290           vaesenc       $T1,$inout0,$inout0
291           vaesenc       $T1,$inout1,$inout1
292           vaesenc       $T1,$inout2,$inout2
293           vaesenc       $T1,$inout3,$inout3
294           vaesenc       $T1,$inout4,$inout4
295           vmovups       0xd0-0x80($key),$rndkey
296           vaesenc       $T1,$inout5,$inout5
297           vmovups       0xe0-0x80($key),$T1
298           jmp           .Lenc_tail              # 256-bit key
299
300 .align  32
301 .Lhandle_ctr32:                                 # taken when the add in .Loop6x carried: redo the increments as 32-bit adds
302         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
303           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
304           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
305           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
306           vpaddd        $Z1,$Z2,$inout2
307         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
308           vpaddd        $Z1,$inout1,$inout3
309           vpshufb       $Ii,$inout1,$inout1
310           vpaddd        $Z1,$inout2,$inout4
311           vpshufb       $Ii,$inout2,$inout2
312           vpxor         $rndkey,$inout1,$inout1 # only blocks 1 and 2 get the round key here; 3..5 get it after the jump
313           vpaddd        $Z1,$inout3,$inout5
314           vpshufb       $Ii,$inout3,$inout3
315           vpxor         $rndkey,$inout2,$inout2
316           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
317           vpshufb       $Ii,$inout4,$inout4
318           vpshufb       $Ii,$inout5,$inout5
319           vpshufb       $Ii,$T1,$T1             # next counter value
320         jmp             .Lresume_ctr32
321
322 .align  32
323 .Lenc_tail:
324           vaesenc       $rndkey,$inout0,$inout0
325         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
326         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
327           vaesenc       $rndkey,$inout1,$inout1
328         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
329           vpxor         0x00($inp),$T1,$T2      # input ^ last round key, so vaesenclast also applies the data xor
330           vaesenc       $rndkey,$inout2,$inout2
331           vpxor         0x10($inp),$T1,$Ii
332           vaesenc       $rndkey,$inout3,$inout3
333           vpxor         0x20($inp),$T1,$Z1
334           vaesenc       $rndkey,$inout4,$inout4
335           vpxor         0x30($inp),$T1,$Z2
336           vaesenc       $rndkey,$inout5,$inout5
337           vpxor         0x40($inp),$T1,$Z3
338           vpxor         0x50($inp),$T1,$Hkey
339           vmovdqu       ($ivp),$T1              # load next counter value
340
341           vaesenclast   $T2,$inout0,$inout0
342           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
343           vaesenclast   $Ii,$inout1,$inout1
344          vpaddb         $T2,$T1,$Ii
345         mov             %r13,0x70+8(%rsp)
346         lea             0x60($inp),$inp
347           vaesenclast   $Z1,$inout2,$inout2
348          vpaddb         $T2,$Ii,$Z1
349         mov             %r12,0x78+8(%rsp)
350         lea             0x60($out),$out
351           vmovdqu       0x00-0x80($key),$rndkey
352           vaesenclast   $Z2,$inout3,$inout3
353          vpaddb         $T2,$Z1,$Z2
354           vaesenclast   $Z3, $inout4,$inout4
355          vpaddb         $T2,$Z2,$Z3
356           vaesenclast   $Hkey,$inout5,$inout5
357          vpaddb         $T2,$Z3,$Hkey
358
359         add             \$0x60,$ret             # count another 0x60 bytes as processed
360         sub             \$0x6,$len
361         jc              .L6x_done               # borrow: fewer than 6 blocks are left
362
363           vmovups       $inout0,-0x60($out)     # save output
364          vpxor          $rndkey,$T1,$inout0
365           vmovups       $inout1,-0x50($out)
366          vmovdqa        $Ii,$inout1             # 0 latency
367           vmovups       $inout2,-0x40($out)
368          vmovdqa        $Z1,$inout2             # 0 latency
369           vmovups       $inout3,-0x30($out)
370          vmovdqa        $Z2,$inout3             # 0 latency
371           vmovups       $inout4,-0x20($out)
372          vmovdqa        $Z3,$inout4             # 0 latency
373           vmovups       $inout5,-0x10($out)
374          vmovdqa        $Hkey,$inout5           # 0 latency
375         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
376         jmp             .Loop6x
377
378 .L6x_done:                                      # the last batch is left in inout0-5 and is not stored here
379         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
380         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
381
382         ret
383 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
384 ___
385 ######################################################################
386 # aesni_gcm_decrypt returns the byte count it processed (a multiple of 0x60), or 0 when len < 0x60.
387 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
388 #               const AES_KEY *key, unsigned char iv[16],
389 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
390 $code.=<<___;
391 .globl  aesni_gcm_decrypt
392 .type   aesni_gcm_decrypt,\@function,6
393 .align  32
394 aesni_gcm_decrypt:
395         xor     $ret,$ret                       # return value for the abort path
396         cmp     \$0x60,$len                     # minimal accepted length
397         jb      .Lgcm_dec_abort
398
399         lea     (%rsp),%rax                     # save stack pointer
400         push    %rbx                            # saved registers are reloaded relative to %rax in the epilogue
401         push    %rbp
402         push    %r12
403         push    %r13
404         push    %r14
405         push    %r15
406 ___
407 $code.=<<___ if ($win64);       # Win64 only: spill %xmm6-%xmm15 below the six pushed registers
408         lea     -0xa8(%rsp),%rsp
409         movaps  %xmm6,-0xd8(%rax)               # slots are 0x10 apart, from -0xd8 up to -0x48
410         movaps  %xmm7,-0xc8(%rax)
411         movaps  %xmm8,-0xb8(%rax)
412         movaps  %xmm9,-0xa8(%rax)
413         movaps  %xmm10,-0x98(%rax)
414         movaps  %xmm11,-0x88(%rax)
415         movaps  %xmm12,-0x78(%rax)
416         movaps  %xmm13,-0x68(%rax)
417         movaps  %xmm14,-0x58(%rax)
418         movaps  %xmm15,-0x48(%rax)
419 .Lgcm_dec_body:
420 ___
421 $code.=<<___;   # decrypt body: set up registers and stack slots, run the stitched loop, store the tail
422         vzeroupper
423
424         vmovdqu         ($ivp),$T1              # input counter value
425         add             \$-128,%rsp             # reserve 128 bytes of scratch
426         mov             12($ivp),$counter       # last four bytes of the counter block
427         lea             .Lbswap_mask(%rip),$const
428         lea             -0x80($key),$in0        # borrow $in0
429         mov             \$0xf80,$end0           # borrow $end0
430         vmovdqu         ($Xip),$Xi              # load Xi
431         and             \$-128,%rsp             # ensure stack alignment
432         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
433         lea             0x80($key),$key         # size optimization
434         lea             0x20+0x20($Xip),$Xip    # size optimization
435         mov             0xf0-0x80($key),$rounds
436         vpshufb         $Ii,$Xi,$Xi
437
438         and             $end0,$in0
439         and             %rsp,$end0
440         sub             $in0,$end0
441         jc              .Ldec_no_key_aliasing
442         cmp             \$768,$end0
443         jnc             .Ldec_no_key_aliasing
444         sub             $end0,%rsp              # avoid aliasing with key
445 .Ldec_no_key_aliasing:
446
447         vmovdqu         0x50($inp),$Z3          # I[5]
448         lea             ($inp),$in0             # $in0 is where the loop reads the blocks to hash
449         vmovdqu         0x40($inp),$Z0
450         lea             -0xc0($inp,$len),$end0  # $end0 = input end minus two 0x60-byte batches
451         vmovdqu         0x30($inp),$Z1
452         shr             \$4,$len                # bytes -> 16-byte blocks
453         xor             $ret,$ret               # bytes processed so far
454         vmovdqu         0x20($inp),$Z2
455          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
456         vmovdqu         0x10($inp),$T2
457          vpshufb        $Ii,$Z0,$Z0
458         vmovdqu         ($inp),$Hkey
459          vpshufb        $Ii,$Z1,$Z1
460         vmovdqu         $Z0,0x30(%rsp)          # byte-swapped input blocks 4..0 go to slots 0x30..0x70
461          vpshufb        $Ii,$Z2,$Z2
462         vmovdqu         $Z1,0x40(%rsp)
463          vpshufb        $Ii,$T2,$T2
464         vmovdqu         $Z2,0x50(%rsp)
465          vpshufb        $Ii,$Hkey,$Hkey
466         vmovdqu         $T2,0x60(%rsp)
467         vmovdqu         $Hkey,0x70(%rsp)
468
469         call            _aesni_ctr32_ghash_6x
470
471         vmovups         $inout0,-0x60($out)     # save output
472         vmovups         $inout1,-0x50($out)     # $out was already advanced past this batch by the helper
473         vmovups         $inout2,-0x40($out)
474         vmovups         $inout3,-0x30($out)
475         vmovups         $inout4,-0x20($out)
476         vmovups         $inout5,-0x10($out)
477
478         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
479         vmovdqu         $Xi,-0x40($Xip)         # output Xi
480
481         vzeroupper
482 ___
483 $code.=<<___ if ($win64);       # Win64 only: reload %xmm6-%xmm15 from the slots the prologue filled
484         movaps  -0xd8(%rax),%xmm6
485         movaps  -0xc8(%rax),%xmm7               # own slot; must not be reloaded from the %xmm6 slot at -0xd8
486         movaps  -0xb8(%rax),%xmm8
487         movaps  -0xa8(%rax),%xmm9
488         movaps  -0x98(%rax),%xmm10
489         movaps  -0x88(%rax),%xmm11
490         movaps  -0x78(%rax),%xmm12
491         movaps  -0x68(%rax),%xmm13
492         movaps  -0x58(%rax),%xmm14
493         movaps  -0x48(%rax),%xmm15
494 ___
495 $code.=<<___;   # common epilogue: reload the pushed registers via %rax, the stack pointer saved at entry
496         mov     -48(%rax),%r15
497         mov     -40(%rax),%r14
498         mov     -32(%rax),%r13
499         mov     -24(%rax),%r12
500         mov     -16(%rax),%rbp
501         mov     -8(%rax),%rbx
502         lea     (%rax),%rsp             # restore %rsp
503 .Lgcm_dec_abort:
504         mov     $ret,%rax               # return value
505         ret
506 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
507 ___
508
509 $code.=<<___;
510 .type   _aesni_ctr32_6x,\@abi-omnipotent
511 .align  32
512 _aesni_ctr32_6x:
513         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
514         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
515         lea             -1($rounds),%r13        # .Loop_ctr32 runs rounds-1 times
516         vmovups         0x10-0x80($key),$rndkey
517         lea             0x20-0x80($key),%r12    # %r12 walks the remaining round keys
518         vpxor           $Z0,$T1,$inout0
519         add             \$6<<24,$counter        # +6 in the top byte, i.e. byte 15 of the counter block
520         jc              .Lhandle_ctr32_2        # carry: byte-wise increments below would be wrong
521         vpaddb          $T2,$T1,$inout1
522         vpaddb          $T2,$inout1,$inout2
523         vpxor           $Z0,$inout1,$inout1
524         vpaddb          $T2,$inout2,$inout3
525         vpxor           $Z0,$inout2,$inout2
526         vpaddb          $T2,$inout3,$inout4
527         vpxor           $Z0,$inout3,$inout3
528         vpaddb          $T2,$inout4,$inout5
529         vpxor           $Z0,$inout4,$inout4
530         vpaddb          $T2,$inout5,$T1         # $T1 = counter for the next batch
531         vpxor           $Z0,$inout5,$inout5
532         jmp             .Loop_ctr32
533
534 .align  16
535 .Loop_ctr32:
536         vaesenc         $rndkey,$inout0,$inout0
537         vaesenc         $rndkey,$inout1,$inout1
538         vaesenc         $rndkey,$inout2,$inout2
539         vaesenc         $rndkey,$inout3,$inout3
540         vaesenc         $rndkey,$inout4,$inout4
541         vaesenc         $rndkey,$inout5,$inout5
542         vmovups         (%r12),$rndkey          # fetch the key for the next round
543         lea             0x10(%r12),%r12
544         dec             %r13d
545         jnz             .Loop_ctr32
546
547         vmovdqu         (%r12),$Hkey            # last round key
548         vaesenc         $rndkey,$inout0,$inout0
549         vpxor           0x00($inp),$Hkey,$Z0    # input ^ last round key, so vaesenclast also applies the data xor
550         vaesenc         $rndkey,$inout1,$inout1
551         vpxor           0x10($inp),$Hkey,$Z1
552         vaesenc         $rndkey,$inout2,$inout2
553         vpxor           0x20($inp),$Hkey,$Z2
554         vaesenc         $rndkey,$inout3,$inout3
555         vpxor           0x30($inp),$Hkey,$Xi
556         vaesenc         $rndkey,$inout4,$inout4
557         vpxor           0x40($inp),$Hkey,$T2
558         vaesenc         $rndkey,$inout5,$inout5
559         vpxor           0x50($inp),$Hkey,$Hkey
560         lea             0x60($inp),$inp
561
562         vaesenclast     $Z0,$inout0,$inout0
563         vaesenclast     $Z1,$inout1,$inout1
564         vaesenclast     $Z2,$inout2,$inout2
565         vaesenclast     $Xi,$inout3,$inout3
566         vaesenclast     $T2,$inout4,$inout4
567         vaesenclast     $Hkey,$inout5,$inout5
568         vmovups         $inout0,0x00($out)      # store the six output blocks and step past them
569         vmovups         $inout1,0x10($out)
570         vmovups         $inout2,0x20($out)
571         vmovups         $inout3,0x30($out)
572         vmovups         $inout4,0x40($out)
573         vmovups         $inout5,0x50($out)
574         lea             0x60($out),$out
575
576         ret
577 .align  32
578 .Lhandle_ctr32_2:                               # uses $Ii as the byte-swap mask; it is not loaded in this routine
579         vpshufb         $Ii,$T1,$Z2             # byte-swap counter
580         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
581         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
582         vpaddd          $Z1,$Z2,$inout2
583         vpaddd          $Z1,$inout1,$inout3
584         vpshufb         $Ii,$inout1,$inout1
585         vpaddd          $Z1,$inout2,$inout4
586         vpshufb         $Ii,$inout2,$inout2
587         vpxor           $Z0,$inout1,$inout1
588         vpaddd          $Z1,$inout3,$inout5
589         vpshufb         $Ii,$inout3,$inout3
590         vpxor           $Z0,$inout2,$inout2
591         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
592         vpshufb         $Ii,$inout4,$inout4
593         vpxor           $Z0,$inout3,$inout3
594         vpshufb         $Ii,$inout5,$inout5
595         vpxor           $Z0,$inout4,$inout4
596         vpshufb         $Ii,$T1,$T1             # next counter value
597         vpxor           $Z0,$inout5,$inout5
598         jmp     .Loop_ctr32
599 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
600
601 .globl  aesni_gcm_encrypt
602 .type   aesni_gcm_encrypt,\@function,6
603 .align  32
604 aesni_gcm_encrypt:
605         xor     $ret,$ret
606         cmp     \$0x60*3,$len                   # minimal accepted length
607         jb      .Lgcm_enc_abort
608
609         lea     (%rsp),%rax                     # save stack pointer
610         push    %rbx
611         push    %rbp
612         push    %r12
613         push    %r13
614         push    %r14
615         push    %r15
616 ___
617 $code.=<<___ if ($win64);
618         lea     -0xa8(%rsp),%rsp
619         movaps  %xmm6,-0xd8(%rax)
620         movaps  %xmm7,-0xc8(%rax)
621         movaps  %xmm8,-0xb8(%rax)
622         movaps  %xmm9,-0xa8(%rax)
623         movaps  %xmm10,-0x98(%rax)
624         movaps  %xmm11,-0x88(%rax)
625         movaps  %xmm12,-0x78(%rax)
626         movaps  %xmm13,-0x68(%rax)
627         movaps  %xmm14,-0x58(%rax)
628         movaps  %xmm15,-0x48(%rax)
629 .Lgcm_enc_body:
630 ___
631 $code.=<<___;
632         vzeroupper
633
634         vmovdqu         ($ivp),$T1              # input counter value
635         add             \$-128,%rsp
636         mov             12($ivp),$counter
637         lea             .Lbswap_mask(%rip),$const
638         lea             -0x80($key),$in0        # borrow $in0
639         mov             \$0xf80,$end0           # borrow $end0
640         lea             0x80($key),$key         # size optimization
641         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
642         and             \$-128,%rsp             # ensure stack alignment
643         mov             0xf0-0x80($key),$rounds
644
645         and             $end0,$in0
646         and             %rsp,$end0
647         sub             $in0,$end0
648         jc              .Lenc_no_key_aliasing
649         cmp             \$768,$end0
650         jnc             .Lenc_no_key_aliasing
651         sub             $end0,%rsp              # avoid aliasing with key
652 .Lenc_no_key_aliasing:
653
654         lea             ($out),$in0
655         lea             -0xc0($out,$len),$end0
656         shr             \$4,$len
657
658         call            _aesni_ctr32_6x
659         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
660         vpshufb         $Ii,$inout1,$T2
661         vmovdqu         $Xi,0x70(%rsp)
662         vpshufb         $Ii,$inout2,$Z0
663         vmovdqu         $T2,0x60(%rsp)
664         vpshufb         $Ii,$inout3,$Z1
665         vmovdqu         $Z0,0x50(%rsp)
666         vpshufb         $Ii,$inout4,$Z2
667         vmovdqu         $Z1,0x40(%rsp)
668         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
669         vmovdqu         $Z2,0x30(%rsp)
670
671         call            _aesni_ctr32_6x
672
673         vmovdqu         ($Xip),$Xi              # load Xi
674         lea             0x20+0x20($Xip),$Xip    # size optimization
675         sub             \$12,$len
676         mov             \$0x60*2,$ret
677         vpshufb         $Ii,$Xi,$Xi
678
679         call            _aesni_ctr32_ghash_6x
680         vmovdqu         0x20(%rsp),$Z3          # I[5]
681          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
682         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
683         vpunpckhqdq     $Z3,$Z3,$T1
684         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
685          vmovups        $inout0,-0x60($out)     # save output
686          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
687         vpxor           $Z3,$T1,$T1
688          vmovups        $inout1,-0x50($out)
689          vpshufb        $Ii,$inout1,$inout1
690          vmovups        $inout2,-0x40($out)
691          vpshufb        $Ii,$inout2,$inout2
692          vmovups        $inout3,-0x30($out)
693          vpshufb        $Ii,$inout3,$inout3
694          vmovups        $inout4,-0x20($out)
695          vpshufb        $Ii,$inout4,$inout4
696          vmovups        $inout5,-0x10($out)
697          vpshufb        $Ii,$inout5,$inout5
698          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
699 ___
700 { my ($HK,$T3)=($rndkey,$inout0);
701
702 $code.=<<___;
703          vmovdqu        0x30(%rsp),$Z2          # I[4]
704          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
705          vpunpckhqdq    $Z2,$Z2,$T2
706         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
707          vpxor          $Z2,$T2,$T2
708         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
709         vpclmulqdq      \$0x00,$HK,$T1,$T1
710
711          vmovdqu        0x40(%rsp),$T3          # I[3]
712         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
713          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
714         vpxor           $Z1,$Z0,$Z0
715          vpunpckhqdq    $T3,$T3,$Z1
716         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
717          vpxor          $T3,$Z1,$Z1
718         vpxor           $Z3,$Z2,$Z2
719         vpclmulqdq      \$0x10,$HK,$T2,$T2
720          vmovdqu        0x50-0x20($Xip),$HK
721         vpxor           $T1,$T2,$T2
722
723          vmovdqu        0x50(%rsp),$T1          # I[2]
724         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
725          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
726         vpxor           $Z0,$Z3,$Z3
727          vpunpckhqdq    $T1,$T1,$Z0
728         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
729          vpxor          $T1,$Z0,$Z0
730         vpxor           $Z2,$T3,$T3
731         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
732         vpxor           $T2,$Z1,$Z1
733
734          vmovdqu        0x60(%rsp),$T2          # I[1]
735         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
736          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
737         vpxor           $Z3,$Z2,$Z2
738          vpunpckhqdq    $T2,$T2,$Z3
739         vpclmulqdq      \$0x11,$Ii,$T1,$T1
740          vpxor          $T2,$Z3,$Z3
741         vpxor           $T3,$T1,$T1
742         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
743          vmovdqu        0x80-0x20($Xip),$HK
744         vpxor           $Z1,$Z0,$Z0
745
746          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
747         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
748          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
749          vpunpckhqdq    $Xi,$Xi,$T3
750         vpxor           $Z2,$Z1,$Z1
751         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
752          vpxor          $Xi,$T3,$T3
753         vpxor           $T1,$T2,$T2
754         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
755         vpxor           $Z0,$Z3,$Z0
756
757         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
758          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
759          vpunpckhqdq    $inout5,$inout5,$T1
760         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
761          vpxor          $inout5,$T1,$T1
762         vpxor           $Z1,$Z2,$Z1
763         vpclmulqdq      \$0x10,$HK,$T3,$T3
764          vmovdqu        0x20-0x20($Xip),$HK
765         vpxor           $T2,$Xi,$Z3
766         vpxor           $Z0,$T3,$Z2
767
768          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
769           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
770         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
771           vpxor         $T3,$Z2,$Z2
772          vpunpckhqdq    $inout4,$inout4,$T2
773         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
774          vpxor          $inout4,$T2,$T2
775           vpslldq       \$8,$Z2,$T3
776         vpclmulqdq      \$0x00,$HK,$T1,$T1
777           vpxor         $T3,$Z1,$Xi
778           vpsrldq       \$8,$Z2,$Z2
779           vpxor         $Z2,$Z3,$Z3
780
781         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
782          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
783         vpxor           $Z0,$Z1,$Z1
784          vpunpckhqdq    $inout3,$inout3,$T3
785         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
786          vpxor          $inout3,$T3,$T3
787         vpxor           $inout5,$inout4,$inout4
788           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
789         vpclmulqdq      \$0x10,$HK,$T2,$T2
790          vmovdqu        0x50-0x20($Xip),$HK
791         vpxor           $T1,$T2,$T2
792
793         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
794          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
795         vpxor           $Z1,$Z0,$Z0
796          vpunpckhqdq    $inout2,$inout2,$T1
797         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
798          vpxor          $inout2,$T1,$T1
799         vpxor           $inout4,$inout3,$inout3
800           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
801         vpclmulqdq      \$0x00,$HK,$T3,$T3
802         vpxor           $T2,$T3,$T3
803
804           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
805           vxorps        $inout5,$Xi,$Xi
806
807         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
808          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
809         vpxor           $Z0,$Z1,$Z1
810          vpunpckhqdq    $inout1,$inout1,$T2
811         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
812          vpxor          $inout1,$T2,$T2
813           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
814         vpxor           $inout3,$inout2,$inout2
815         vpclmulqdq      \$0x10,$HK,$T1,$T1
816          vmovdqu        0x80-0x20($Xip),$HK
817         vpxor           $T3,$T1,$T1
818
819           vxorps        $Z3,$inout5,$inout5
820           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
821           vxorps        $inout5,$Xi,$Xi
822
823         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
824          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
825         vpxor           $Z1,$Z0,$Z0
826          vpunpckhqdq    $Xi,$Xi,$T3
827         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
828          vpxor          $Xi,$T3,$T3
829         vpxor           $inout2,$inout1,$inout1
830         vpclmulqdq      \$0x00,$HK,$T2,$T2
831         vpxor           $T1,$T2,$T2
832
833         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
834         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
835         vpxor           $Z0,$Z1,$Z1
836         vpclmulqdq      \$0x10,$HK,$T3,$Z2
837         vpxor           $inout1,$Z3,$Z3
838         vpxor           $T2,$Z2,$Z2
839
840         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
841         vpxor           $Z0,$Z2,$Z2
842         vpslldq         \$8,$Z2,$T1
843         vmovdqu         0x10($const),$Hkey      # .Lpoly
844         vpsrldq         \$8,$Z2,$Z2
845         vpxor           $T1,$Z1,$Xi
846         vpxor           $Z2,$Z3,$Z3
847
848         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
849         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
850         vpxor           $T2,$Xi,$Xi
851
852         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
853         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
854         vpxor           $Z3,$T2,$T2
855         vpxor           $T2,$Xi,$Xi
856 ___
857 }
858 $code.=<<___;
859         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
860         vmovdqu         $Xi,-0x40($Xip)         # output Xi
861
862         vzeroupper
863 ___
864 $code.=<<___ if ($win64);
865         movaps  -0xd8(%rax),%xmm6
866         movaps  -0xc8(%rax),%xmm7
867         movaps  -0xb8(%rax),%xmm8
868         movaps  -0xa8(%rax),%xmm9
869         movaps  -0x98(%rax),%xmm10
870         movaps  -0x88(%rax),%xmm11
871         movaps  -0x78(%rax),%xmm12
872         movaps  -0x68(%rax),%xmm13
873         movaps  -0x58(%rax),%xmm14
874         movaps  -0x48(%rax),%xmm15
875 ___
876 $code.=<<___;
877         mov     -48(%rax),%r15
878         mov     -40(%rax),%r14
879         mov     -32(%rax),%r13
880         mov     -24(%rax),%r12
881         mov     -16(%rax),%rbp
882         mov     -8(%rax),%rbx
883         lea     (%rax),%rsp             # restore %rsp
884 .Lgcm_enc_abort:
885         mov     $ret,%rax               # return value
886         ret
887 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
888 ___
889
890 $code.=<<___;     # constant table; the code above reaches .Lbswap_mask at ($const) and .Lpoly at 0x10($const)
891 .align  64
892 .Lbswap_mask:                   # vpshufb index vector 15..0: reverses the 16 bytes of a register
893         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
894 .Lpoly:                         # reduction constant: 0xc2 in the most significant byte
895         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2      # NOTE(review): visible vpclmulqdq uses select only the high qword (0x10); confirm byte 0 is meant to be 0
896 .Lone_msb:                      # value 1 in byte 15; users are outside this chunk
897         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
898 .Ltwo_lsb:                      # value 2 in byte 0; users are outside this chunk
899         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
900 .Lone_lsb:                      # value 1 in byte 0; users are outside this chunk
901         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
902 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
903 .align  64
904 ___
905 if ($win64) {       # Win64 only: emit the exception handler plus .pdata/.xdata records for both GCM entry points
906 $rec="%rcx";        # handler argument 1 (not referenced by the handler body below)
907 $frame="%rdx";      # handler argument 2 (not referenced by the handler body below)
908 $context="%r8";     # handler argument 3: the CONTEXT record the handler reads and patches
909 $disp="%r9";        # handler argument 4: dispatcher context (ImageBase, HandlerData, ContextRecord, ...)
910 # The heredoc below is the last statement of this block, hence no terminating ';'.
911 $code.=<<___
912 .extern __imp_RtlVirtualUnwind
913 .type   gcm_se_handler,\@abi-omnipotent
914 .align  16
915 gcm_se_handler:
916         push    %rsi                    # preserve every register the handler overwrites
917         push    %rdi
918         push    %rbx
919         push    %rbp
920         push    %r12
921         push    %r13
922         push    %r14
923         push    %r15
924         pushfq                          # flags too: the cld encoded below changes DF
925         sub     \$64,%rsp               # outgoing argument area for the RtlVirtualUnwind call (slots 32..56 written below)
926
927         mov     120($context),%rax      # pull context->Rax
928         mov     248($context),%rbx      # pull context->Rip
929
930         mov     8($disp),%rsi           # disp->ImageBase
931         mov     56($disp),%r11          # disp->HandlerData
932
933         mov     0(%r11),%r10d           # HandlerData[0]
934         lea     (%rsi,%r10),%r10        # prologue label
935         cmp     %r10,%rbx               # context->Rip<prologue label
936         jb      .Lcommon_seh_tail       # fault before the body label: finish with rax = context->Rax
937
938         mov     152($context),%rax      # pull context->Rsp
939
940         mov     4(%r11),%r10d           # HandlerData[1]
941         lea     (%rsi,%r10),%r10        # epilogue label
942         cmp     %r10,%rbx               # context->Rip>=epilogue label
943         jae     .Lcommon_seh_tail       # fault at or past the abort label: finish with rax = context->Rsp
944         # fault inside the body: saved registers are read at negative offsets from context->Rax
945         mov     120($context),%rax      # pull context->Rax
946
947         mov     -48(%rax),%r15          # same slots the function epilogue reloads from
948         mov     -40(%rax),%r14
949         mov     -32(%rax),%r13
950         mov     -24(%rax),%r12
951         mov     -16(%rax),%rbp
952         mov     -8(%rax),%rbx
953         mov     %r15,240($context)      # restore context->R15
954         mov     %r14,232($context)      # restore context->R14
955         mov     %r13,224($context)      # restore context->R13
956         mov     %r12,216($context)      # restore context->R12
957         mov     %rbp,160($context)      # restore context->Rbp
958         mov     %rbx,144($context)      # restore context->Rbx
959
960         lea     -0xd8(%rax),%rsi        # %xmm save area
961         lea     512($context),%rdi      # & context.Xmm6
962         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
963         .long   0xa548f3fc              # cld; rep movsq
964
965 .Lcommon_seh_tail:
966         mov     8(%rax),%rdi            # NOTE(review): presumably slots holding the interrupted function rdi/rsi; confirm against its prologue
967         mov     16(%rax),%rsi
968         mov     %rax,152($context)      # restore context->Rsp
969         mov     %rsi,168($context)      # restore context->Rsi
970         mov     %rdi,176($context)      # restore context->Rdi
971         # copy the patched context over disp->ContextRecord
972         mov     40($disp),%rdi          # disp->ContextRecord
973         mov     $context,%rsi           # context
974         mov     \$154,%ecx              # sizeof(CONTEXT), expressed in qwords for rep movsq
975         .long   0xa548f3fc              # cld; rep movsq
976         # call RtlVirtualUnwind: four register arguments, four more in stack slots 32..56
977         mov     $disp,%rsi
978         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
979         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
980         mov     0(%rsi),%r8             # arg3, disp->ControlPc
981         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
982         mov     40(%rsi),%r10           # disp->ContextRecord
983         lea     56(%rsi),%r11           # &disp->HandlerData
984         lea     24(%rsi),%r12           # &disp->EstablisherFrame
985         mov     %r10,32(%rsp)           # arg5
986         mov     %r11,40(%rsp)           # arg6
987         mov     %r12,48(%rsp)           # arg7
988         mov     %rcx,56(%rsp)           # arg8, (NULL)
989         call    *__imp_RtlVirtualUnwind(%rip)
990
991         mov     \$1,%eax                # ExceptionContinueSearch
992         add     \$64,%rsp               # drop the argument area, then undo the pushes in reverse order
993         popfq
994         pop     %r15
995         pop     %r14
996         pop     %r13
997         pop     %r12
998         pop     %rbp
999         pop     %rbx
1000         pop     %rdi
1001         pop     %rsi
1002         ret
1003 .size   gcm_se_handler,.-gcm_se_handler
1004
1005 .section        .pdata
1006 .align  4
1007         .rva    .LSEH_begin_aesni_gcm_decrypt   # function table entry: start,
1008         .rva    .LSEH_end_aesni_gcm_decrypt     # end,
1009         .rva    .LSEH_gcm_dec_info              # and its unwind info record in .xdata
1010
1011         .rva    .LSEH_begin_aesni_gcm_encrypt   # same triple for the encrypt entry point
1012         .rva    .LSEH_end_aesni_gcm_encrypt
1013         .rva    .LSEH_gcm_enc_info
1014 .section        .xdata
1015 .align  8
1016 .LSEH_gcm_dec_info:
1017         .byte   9,0,0,0                         # UNWIND_INFO header: version 1, flags 1 (handler present), no unwind codes
1018         .rva    gcm_se_handler                  # handler invoked for faults in this function
1019         .rva    .Lgcm_dec_body,.Lgcm_dec_abort  # HandlerData[0], HandlerData[1] read by the handler
1020 .LSEH_gcm_enc_info:
1021         .byte   9,0,0,0                         # same header as the decrypt record
1022         .rva    gcm_se_handler
1023         .rva    .Lgcm_enc_body,.Lgcm_enc_abort  # HandlerData[0], HandlerData[1] read by the handler
1024 ___
1025 }
1026 }}} else {{{
1027 $code=<<___;    # assembler is too old: assign (not append) stubs so both symbols still exist
1028 .text
1029
1030 .globl  aesni_gcm_encrypt
1031 .type   aesni_gcm_encrypt,\@abi-omnipotent
1032 aesni_gcm_encrypt:
1033         xor     %eax,%eax               # stub returns 0; presumably read by callers as no bytes processed - confirm
1034         ret
1035 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1036
1037 .globl  aesni_gcm_decrypt
1038 .type   aesni_gcm_decrypt,\@abi-omnipotent
1039 aesni_gcm_decrypt:
1040         xor     %eax,%eax               # stub returns 0, same as the encrypt stub
1041         ret
1042 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1043 ___
1044 }}}
1045
1046 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace every `...` span in $code with the value of eval'ing its contents
1047 # Write the finished text to STDOUT in one go.
1048 print $code;
1049 # An unchecked close would hide a failed write or flush, leaving truncated output behind a success exit status.
1050 close STDOUT or die "error closing STDOUT: $!";