#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context, parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with a MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp., who verified that it reduces shuffle
# pressure with a notable relative improvement, achieving 1.0 cycle per
# byte processed with a 128-bit key on a Haswell processor.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
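#
# The stitched loop, _aesni_ctr32_ghash_6x, keeps six AES-CTR blocks
# in flight while hashing the six ciphertext blocks produced on the
# previous iteration, using the aggregated-reduction identity with
# pre-computed powers of H taken from Xip->Htbl:
#
#	Xi <- (Xi^C1)*H^6 ^ C2*H^5 ^ C3*H^4 ^ C4*H^3 ^ C5*H^2 ^ C6*H
#
# with multiplication in GF(2^128), so that the fold modulo the GCM
# polynomial is performed only once per six blocks.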

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /LLVM ([3-9]\.[0-9]+)/) {
	$avx = ($1>=3.0) + ($1>=3.1);
}

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

if ($avx>1) {{{

($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");

$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	add		\$6<<24,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
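	# \$counter holds the last 32 bits of the big-endian counter
	# block, loaded without byte swap, so the counter's least
	# significant byte sits in bits 31:24; adding 6<<24 advances
	# it by 6, and a carry means the byte-wise vpaddb increments
	# would wrap, in which case .Lhandle_ctr32 recomputes all six
	# counters with byte-swapped dword additions instead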
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3,$inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
#		const AES_KEY *key, unsigned char iv[16],
#		struct { u128 Xi,H,Htbl[9]; } *Xip);
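#
# Both entry points process whole multiples of 96 bytes and return
# the number of bytes actually handled, or 0 when len is below the
# minimum (0x60 for decrypt, 0x60*3 for encrypt, the latter because
# hashing runs six blocks behind encryption); the caller is expected
# to finish the remainder generically. A sketch of the intended
# calling pattern, modelled on OpenSSL's crypto/modes/gcm128.c
# (names hypothetical):
#
#	size_t bulk = aesni_gcm_encrypt(in,out,len,key,ivec,Xi);
#	in += bulk; out += bulk; len -= bulk;
#	/* ...then encrypt and hash the remaining len bytes block-wise */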
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
	xor	$ret,$ret
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

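	# the next few instructions compare the within-page offsets
	# (mask 0xf80, i.e. 128-byte granularity) of the key schedule
	# and of the stack frame, and when the frame sits less than
	# 768 bytes above the key schedule in offset space, %rsp is
	# lowered by the difference, apparently a defensive measure
	# against 4K-aliasing stalls between stack stores and round
	# key loads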
	and	$end0,$in0
	and	%rsp,$end0
	sub	$in0,$end0
	jc	.Ldec_no_key_aliasing
	cmp	\$768,$end0
	jnc	.Ldec_no_key_aliasing
	sub	$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0
	lea		-0xc0($inp,$len),$end0
	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

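	# the six bswapped ciphertext blocks are now parked high to
	# low: I[0], the oldest, at 0x70(%rsp) down to I[4] at
	# 0x30(%rsp), with I[5] handed over in $Z3; inside the callee
	# the same slots read as 0x30+8..0x70+8(%rsp), the +8
	# accounting for the return address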
	call		_aesni_ctr32_ghash_6x

	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___

$code.=<<___;
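# _aesni_ctr32_6x encrypts six counter blocks and XORs them onto six
# input blocks, with no GHASH work; aesni_gcm_encrypt calls it twice
# to produce the first twelve ciphertext blocks and prime the
# stitched loop, in which hashing runs six blocks behind encryption.
# Hence the 0x60*3 minimal length and the initial $ret of 0x60*2
# below.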
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$6<<24,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
	xor	$ret,$ret
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

	and	$end0,$in0
	and	%rsp,$end0
	sub	$in0,$end0
	jc	.Lenc_no_key_aliasing
	cmp	\$768,$end0
	jnc	.Lenc_no_key_aliasing
	sub	$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea		($out),$in0
	lea		-0xc0($out,$len),$end0
	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	vmovdqu		($Xip),$Xi		# load Xi
	lea		0x20+0x20($Xip),$Xip	# size optimization
	sub		\$12,$len
	mov		\$0x60*2,$ret
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
{ my ($HK,$T3)=($rndkey,$inout0);

$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Xip),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
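	# the two vpalignr/vpclmulqdq/vpxor rounds above are the usual
	# two-phase folding of the 256-bit product modulo the GCM
	# polynomial: each phase multiplies the low 64-bit half of the
	# accumulator by the reflected reduction constant kept in
	# .Lpoly (0xc2 in the most significant byte) and xors the
	# result back onto the high half, cf. [2] in the header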
___
}
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___

$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
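# .Lpoly carries the reflected GCM reduction constant; .Lone_msb
# increments the counter in its big-endian (wire) form, while
# .Lone_lsb and .Ltwo_lsb increment it after the byte swap in the
# .Lhandle_ctr32 paths.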
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
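# gcm_se_handler is the usual perlasm SEH handler: if RIP lies
# between the prologue and epilogue labels recorded in HandlerData,
# it restores %rbx, %rbp, %r12-%r15 and %xmm6-%xmm15 from the save
# area anchored at the saved stack pointer in context->Rax, then
# hands off to RtlVirtualUnwind.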
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}

$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

close STDOUT;