Skylake performance results.
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 #
11 # AES-NI-CTR+GHASH stitch.
12 #
13 # February 2013
14 #
15 # OpenSSL GCM implementation is organized in such a way that its
16 # performance is rather close to the sum of its streamed components,
17 # in the context parallelized AES-NI CTR and modulo-scheduled
18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19 # was observed to perform significantly better than the sum of the
20 # components on contemporary CPUs, the effort was deemed impossible to
21 # justify. This module is based on combination of Intel submissions,
22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23 # Locktyukhin of Intel Corp. who verified that it reduces shuffle
24 # pressure with notable relative improvement, achieving 1.0 cycle per
25 # byte processed with 128-bit key on Haswell processor, 0.74 - on
26 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
27 # measurements for favourable packet size, one divisible by 96.
28 # Applications using the EVP interface will observe a few percent
29 # worse performance.]
30 #
31 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
32 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
33
34 $flavour = shift;    # first command-line argument: target flavour (matched against nasm/masm/mingw64 below)
35 $output  = shift;    # second command-line argument: output file name
36 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }    # a lone argument containing a dot is taken as the output file
37 # Win64 conventions are selected for nasm/masm/mingw64 flavours or when the output file ends in .asm.
38 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
39 # Locate x86_64-xlate.pl: first in this script's own directory (taken from $0), then in ../../perlasm/.
40 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
41 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
42 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
43 die "can't locate x86_64-xlate.pl";
44 # $avx ends up 0, 1 or 2: one point per version threshold passed. Code is generated further down only when $avx>1. First probe: GNU as, through $ENV{CC}.
45 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
46                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
47         $avx = ($1>=2.19) + ($1>=2.22);
48 }
49 # Second probe, only if nothing was detected yet: nasm on Win64 builds.
50 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
51             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
52         $avx = ($1>=2.09) + ($1>=2.10);
53 }
54 # Third probe: ml64 (masm) on Win64 builds; only the major version is compared.
55 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
56             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
57         $avx = ($1>=10) + ($1>=11);
58 }
59 # Last probe: a clang/LLVM-based $ENV{CC}; the version is the second capture group, hence $2.
60 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
61         $avx = ($2>=3.0) + ($2>3.0);
62 }
63 # Pipe everything this script prints through x86_64-xlate.pl, run under the same perl binary ($^X). Abort if the pipe cannot be started instead of silently producing no output.
64 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
65 *STDOUT=*OUT;   # from here on, plain print goes to the translator
66
67 if ($avx>1) {{{
68
69 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");     # registers holding the six arguments of aesni_gcm_[en|de]crypt, in prototype order
70 # %xmm0-%xmm8: GHASH input block ($Ii), temporaries, hash key power, partial products ($Z0-$Z3) and the running hash ($Xi).
71 ($Ii,$T1,$T2,$Hkey,
72  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
73 # %xmm9-%xmm14: the six blocks being encrypted in parallel; %xmm15: current AES round key.
74 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
75 # Scalar state: 32-bit counter word, AES round count, return value, constants-table pointer, and the $in0/$end0 input window.
76 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
77
78 $code=<<___;
79 .text
80 # _aesni_ctr32_ghash_6x: shared inner loop. Each .Loop6x pass runs the AES rounds on six counter blocks while multiplying six byte-swapped blocks (one in $Z3, the rest in the 0x30+8(%rsp)..0x70+8(%rsp) slots) into the GHASH value $Xi.
81 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
82 .align  32
83 _aesni_ctr32_ghash_6x:
84         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
85         sub             \$6,$len                # $len is in 16-byte blocks (callers shift it right by 4)
86         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
87         vmovdqu         0x00-0x80($key),$rndkey
88         vpaddb          $T2,$T1,$inout1
89         vpaddb          $T2,$inout1,$inout2
90         vpaddb          $T2,$inout2,$inout3
91         vpaddb          $T2,$inout3,$inout4
92         vpaddb          $T2,$inout4,$inout5
93         vpxor           $rndkey,$T1,$inout0
94         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
95         jmp             .Loop6x
96
97 .align  32
98 .Loop6x:
99         add             \$`6<<24`,$counter      # add 6 to the top byte of $counter; carry means that byte wrapped
100         jc              .Lhandle_ctr32          # discard $inout[1-5]?
101         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
102           vpaddb        $T2,$inout5,$T1         # next counter value
103           vpxor         $rndkey,$inout1,$inout1
104           vpxor         $rndkey,$inout2,$inout2
105
106 .Lresume_ctr32:
107         vmovdqu         $T1,($ivp)              # save next counter value
108         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
109           vpxor         $rndkey,$inout3,$inout3
110           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
111         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
112         xor             %r12,%r12
113         cmp             $in0,$end0              # carry stays clear while $in0 has not passed $end0
114
115           vaesenc       $T2,$inout0,$inout0
116         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
117           vpxor         $rndkey,$inout4,$inout4
118         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
119           vaesenc       $T2,$inout1,$inout1
120           vpxor         $rndkey,$inout5,$inout5
121         setnc           %r12b
122         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
123           vaesenc       $T2,$inout2,$inout2
124         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
125         neg             %r12
126           vaesenc       $T2,$inout3,$inout3
127          vpxor          $Z1,$Z2,$Z2
128         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
129          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
130           vaesenc       $T2,$inout4,$inout4
131          vpxor          $Z1,$T1,$Z0
132         and             \$0x60,%r12             # %r12 = 0x60 if the compare above left carry clear, else 0
133           vmovups       0x20-0x80($key),$rndkey
134         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
135           vaesenc       $T2,$inout5,$inout5
136
137         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
138         lea             ($in0,%r12),$in0        # step $in0 forward by 0x60 unless it is already past $end0
139           vaesenc       $rndkey,$inout0,$inout0
140          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
141         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
142          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
143           vaesenc       $rndkey,$inout1,$inout1
144         movbe           0x58($in0),%r13         # byte-reversed loads from $in0 refill the stack slots for the next pass
145           vaesenc       $rndkey,$inout2,$inout2
146         movbe           0x50($in0),%r12
147           vaesenc       $rndkey,$inout3,$inout3
148         mov             %r13,0x20+8(%rsp)       # this slot is reloaded as I[5] before looping
149           vaesenc       $rndkey,$inout4,$inout4
150         mov             %r12,0x28+8(%rsp)
151         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
152           vaesenc       $rndkey,$inout5,$inout5
153
154           vmovups       0x30-0x80($key),$rndkey
155          vpxor          $T1,$Z2,$Z2
156         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
157           vaesenc       $rndkey,$inout0,$inout0
158          vpxor          $T2,$Z2,$Z2
159         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
160           vaesenc       $rndkey,$inout1,$inout1
161          vpxor          $Hkey,$Z3,$Z3
162         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
163           vaesenc       $rndkey,$inout2,$inout2
164         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
165          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
166           vaesenc       $rndkey,$inout3,$inout3
167           vaesenc       $rndkey,$inout4,$inout4
168          vpxor          $T1,$Z0,$Z0
169         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
170           vaesenc       $rndkey,$inout5,$inout5
171
172           vmovups       0x40-0x80($key),$rndkey
173          vpxor          $T2,$Z2,$Z2
174         vpclmulqdq      \$0x00,$T1,$Ii,$T2
175           vaesenc       $rndkey,$inout0,$inout0
176          vpxor          $Hkey,$Z2,$Z2
177         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
178           vaesenc       $rndkey,$inout1,$inout1
179         movbe           0x48($in0),%r13
180          vpxor          $Z1,$Z3,$Z3
181         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
182           vaesenc       $rndkey,$inout2,$inout2
183         movbe           0x40($in0),%r12
184         vpclmulqdq      \$0x11,$T1,$Ii,$T1
185          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
186           vaesenc       $rndkey,$inout3,$inout3
187         mov             %r13,0x30+8(%rsp)
188           vaesenc       $rndkey,$inout4,$inout4
189         mov             %r12,0x38+8(%rsp)
190          vpxor          $T2,$Z0,$Z0
191         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
192           vaesenc       $rndkey,$inout5,$inout5
193
194           vmovups       0x50-0x80($key),$rndkey
195          vpxor          $Hkey,$Z2,$Z2
196         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
197           vaesenc       $rndkey,$inout0,$inout0
198          vpxor          $Z1,$Z2,$Z2
199         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
200           vaesenc       $rndkey,$inout1,$inout1
201         movbe           0x38($in0),%r13
202          vpxor          $T1,$Z3,$Z3
203         vpclmulqdq      \$0x01,$T2,$Ii,$T1
204          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
205           vaesenc       $rndkey,$inout2,$inout2
206         movbe           0x30($in0),%r12
207         vpclmulqdq      \$0x11,$T2,$Ii,$T2
208           vaesenc       $rndkey,$inout3,$inout3
209         mov             %r13,0x40+8(%rsp)
210           vaesenc       $rndkey,$inout4,$inout4
211         mov             %r12,0x48+8(%rsp)
212          vpxor          $Hkey,$Z0,$Z0
213          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
214           vaesenc       $rndkey,$inout5,$inout5
215
216           vmovups       0x60-0x80($key),$rndkey
217          vpxor          $Z1,$Z2,$Z2
218         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
219           vaesenc       $rndkey,$inout0,$inout0
220          vpxor          $T1,$Z2,$Z2
221         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
222           vaesenc       $rndkey,$inout1,$inout1
223         movbe           0x28($in0),%r13
224          vpxor          $T2,$Z3,$Z3
225         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
226           vaesenc       $rndkey,$inout2,$inout2
227         movbe           0x20($in0),%r12
228         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
229           vaesenc       $rndkey,$inout3,$inout3
230         mov             %r13,0x50+8(%rsp)
231           vaesenc       $rndkey,$inout4,$inout4
232         mov             %r12,0x58+8(%rsp)
233         vpxor           $Z1,$Z2,$Z2
234           vaesenc       $rndkey,$inout5,$inout5
235         vpxor           $T1,$Z2,$Z2
236
237           vmovups       0x70-0x80($key),$rndkey
238         vpslldq         \$8,$Z2,$Z1
239         vpxor           $T2,$Z0,$Z0
240         vmovdqu         0x10($const),$Hkey      # .Lpoly
241
242           vaesenc       $rndkey,$inout0,$inout0
243         vpxor           $Xi,$Z3,$Z3
244           vaesenc       $rndkey,$inout1,$inout1
245         vpxor           $Z1,$Z0,$Z0
246         movbe           0x18($in0),%r13
247           vaesenc       $rndkey,$inout2,$inout2
248         movbe           0x10($in0),%r12
249         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
250         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
251         mov             %r13,0x60+8(%rsp)
252           vaesenc       $rndkey,$inout3,$inout3
253         mov             %r12,0x68+8(%rsp)
254           vaesenc       $rndkey,$inout4,$inout4
255           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
256           vaesenc       $rndkey,$inout5,$inout5
257
258           vaesenc       $T1,$inout0,$inout0
259           vmovups       0x90-0x80($key),$rndkey
260           vaesenc       $T1,$inout1,$inout1
261         vpsrldq         \$8,$Z2,$Z2
262           vaesenc       $T1,$inout2,$inout2
263         vpxor           $Z2,$Z3,$Z3
264           vaesenc       $T1,$inout3,$inout3
265         vpxor           $Ii,$Z0,$Z0
266         movbe           0x08($in0),%r13
267           vaesenc       $T1,$inout4,$inout4
268         movbe           0x00($in0),%r12
269           vaesenc       $T1,$inout5,$inout5
270           vmovups       0xa0-0x80($key),$T1
271           cmp           \$11,$rounds
272           jb            .Lenc_tail              # 128-bit key
273
274           vaesenc       $rndkey,$inout0,$inout0
275           vaesenc       $rndkey,$inout1,$inout1
276           vaesenc       $rndkey,$inout2,$inout2
277           vaesenc       $rndkey,$inout3,$inout3
278           vaesenc       $rndkey,$inout4,$inout4
279           vaesenc       $rndkey,$inout5,$inout5
280
281           vaesenc       $T1,$inout0,$inout0
282           vaesenc       $T1,$inout1,$inout1
283           vaesenc       $T1,$inout2,$inout2
284           vaesenc       $T1,$inout3,$inout3
285           vaesenc       $T1,$inout4,$inout4
286           vmovups       0xb0-0x80($key),$rndkey
287           vaesenc       $T1,$inout5,$inout5
288           vmovups       0xc0-0x80($key),$T1
289           je            .Lenc_tail              # 192-bit key
290
291           vaesenc       $rndkey,$inout0,$inout0
292           vaesenc       $rndkey,$inout1,$inout1
293           vaesenc       $rndkey,$inout2,$inout2
294           vaesenc       $rndkey,$inout3,$inout3
295           vaesenc       $rndkey,$inout4,$inout4
296           vaesenc       $rndkey,$inout5,$inout5
297
298           vaesenc       $T1,$inout0,$inout0
299           vaesenc       $T1,$inout1,$inout1
300           vaesenc       $T1,$inout2,$inout2
301           vaesenc       $T1,$inout3,$inout3
302           vaesenc       $T1,$inout4,$inout4
303           vmovups       0xd0-0x80($key),$rndkey
304           vaesenc       $T1,$inout5,$inout5
305           vmovups       0xe0-0x80($key),$T1
306           jmp           .Lenc_tail              # 256-bit key
307 # Slow path taken when the counter byte wrapped: byte-swap the counter, add with 32-bit lanes, then swap each block back.
308 .align  32
309 .Lhandle_ctr32:
310         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
311           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
312           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
313           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
314           vpaddd        $Z1,$Z2,$inout2
315         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
316           vpaddd        $Z1,$inout1,$inout3
317           vpshufb       $Ii,$inout1,$inout1
318           vpaddd        $Z1,$inout2,$inout4
319           vpshufb       $Ii,$inout2,$inout2
320           vpxor         $rndkey,$inout1,$inout1
321           vpaddd        $Z1,$inout3,$inout5
322           vpshufb       $Ii,$inout3,$inout3
323           vpxor         $rndkey,$inout2,$inout2
324           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
325           vpshufb       $Ii,$inout4,$inout4
326           vpshufb       $Ii,$inout5,$inout5
327           vpshufb       $Ii,$T1,$T1             # next counter value
328         jmp             .Lresume_ctr32
329 # Common tail for all key sizes: $T1 holds the last round key, which is xored with the input so that vaesenclast yields the output blocks directly.
330 .align  32
331 .Lenc_tail:
332           vaesenc       $rndkey,$inout0,$inout0
333         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
334         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
335           vaesenc       $rndkey,$inout1,$inout1
336         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
337           vpxor         0x00($inp),$T1,$T2
338           vaesenc       $rndkey,$inout2,$inout2
339           vpxor         0x10($inp),$T1,$Ii
340           vaesenc       $rndkey,$inout3,$inout3
341           vpxor         0x20($inp),$T1,$Z1
342           vaesenc       $rndkey,$inout4,$inout4
343           vpxor         0x30($inp),$T1,$Z2
344           vaesenc       $rndkey,$inout5,$inout5
345           vpxor         0x40($inp),$T1,$Z3
346           vpxor         0x50($inp),$T1,$Hkey
347           vmovdqu       ($ivp),$T1              # load next counter value
348
349           vaesenclast   $T2,$inout0,$inout0
350           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
351           vaesenclast   $Ii,$inout1,$inout1
352          vpaddb         $T2,$T1,$Ii
353         mov             %r13,0x70+8(%rsp)
354         lea             0x60($inp),$inp
355           vaesenclast   $Z1,$inout2,$inout2
356          vpaddb         $T2,$Ii,$Z1
357         mov             %r12,0x78+8(%rsp)
358         lea             0x60($out),$out
359           vmovdqu       0x00-0x80($key),$rndkey
360           vaesenclast   $Z2,$inout3,$inout3
361          vpaddb         $T2,$Z1,$Z2
362           vaesenclast   $Z3, $inout4,$inout4
363          vpaddb         $T2,$Z2,$Z3
364           vaesenclast   $Hkey,$inout5,$inout5
365          vpaddb         $T2,$Z3,$Hkey
366
367         add             \$0x60,$ret             # another 0x60 bytes accounted for in the return value
368         sub             \$0x6,$len
369         jc              .L6x_done               # fewer than six blocks were left: stop
370
371           vmovups       $inout0,-0x60($out)     # save output
372          vpxor          $rndkey,$T1,$inout0
373           vmovups       $inout1,-0x50($out)
374          vmovdqa        $Ii,$inout1             # 0 latency
375           vmovups       $inout2,-0x40($out)
376          vmovdqa        $Z1,$inout2             # 0 latency
377           vmovups       $inout3,-0x30($out)
378          vmovdqa        $Z2,$inout3             # 0 latency
379           vmovups       $inout4,-0x20($out)
380          vmovdqa        $Z3,$inout4             # 0 latency
381           vmovups       $inout5,-0x10($out)
382          vmovdqa        $Hkey,$inout5           # 0 latency
383         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
384         jmp             .Loop6x
385 # On exit the last six output blocks are still in $inout0-$inout5; both callers store them after the call.
386 .L6x_done:
387         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
388         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
389
390         ret
391 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
392 ___
393 ######################################################################
394 # aesni_gcm_decrypt below works in whole groups of six 16-byte blocks (0x60 bytes) and returns the number of bytes it consumed; a request shorter than 0x60 bytes returns 0 without touching anything.
395 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
396 #               const AES_KEY *key, unsigned char iv[16],
397 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
398 $code.=<<___;
399 .globl  aesni_gcm_decrypt
400 .type   aesni_gcm_decrypt,\@function,6
401 .align  32
402 aesni_gcm_decrypt:
403         xor     $ret,$ret
404         cmp     \$0x60,$len                     # minimal accepted length
405         jb      .Lgcm_dec_abort
406
407         lea     (%rsp),%rax                     # save stack pointer
408         push    %rbx
409         push    %rbp
410         push    %r12
411         push    %r13
412         push    %r14
413         push    %r15
414 ___
415 $code.=<<___ if ($win64);
416         lea     -0xa8(%rsp),%rsp                # room for xmm6-xmm15 below the six pushed registers
417         movaps  %xmm6,-0xd8(%rax)
418         movaps  %xmm7,-0xc8(%rax)
419         movaps  %xmm8,-0xb8(%rax)
420         movaps  %xmm9,-0xa8(%rax)
421         movaps  %xmm10,-0x98(%rax)
422         movaps  %xmm11,-0x88(%rax)
423         movaps  %xmm12,-0x78(%rax)
424         movaps  %xmm13,-0x68(%rax)
425         movaps  %xmm14,-0x58(%rax)
426         movaps  %xmm15,-0x48(%rax)
427 .Lgcm_dec_body:
428 ___
429 $code.=<<___;
430         vzeroupper
431
432         vmovdqu         ($ivp),$T1              # input counter value
433         add             \$-128,%rsp             # 128 bytes of scratch for the byte-swapped input slots
434         mov             12($ivp),$counter       # last four bytes of the counter block
435         lea             .Lbswap_mask(%rip),$const
436         lea             -0x80($key),$in0        # borrow $in0
437         mov             \$0xf80,$end0           # borrow $end0
438         vmovdqu         ($Xip),$Xi              # load Xi
439         and             \$-128,%rsp             # ensure stack alignment
440         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
441         lea             0x80($key),$key         # size optimization
442         lea             0x20+0x20($Xip),$Xip    # size optimization
443         mov             0xf0-0x80($key),$rounds
444         vpshufb         $Ii,$Xi,$Xi
445
446         and             $end0,$in0
447         and             %rsp,$end0
448         sub             $in0,$end0
449         jc              .Ldec_no_key_aliasing
450         cmp             \$768,$end0
451         jnc             .Ldec_no_key_aliasing
452         sub             $end0,%rsp              # avoid aliasing with key
453 .Ldec_no_key_aliasing:
454
455         vmovdqu         0x50($inp),$Z3          # I[5]
456         lea             ($inp),$in0             # $in0: where the callee fetches byte-swapped input for later passes
457         vmovdqu         0x40($inp),$Z0
458         lea             -0xc0($inp,$len),$end0  # bound past which the callee stops advancing $in0
459         vmovdqu         0x30($inp),$Z1
460         shr             \$4,$len                # bytes to 16-byte blocks
461         xor             $ret,$ret
462         vmovdqu         0x20($inp),$Z2
463          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
464         vmovdqu         0x10($inp),$T2
465          vpshufb        $Ii,$Z0,$Z0
466         vmovdqu         ($inp),$Hkey
467          vpshufb        $Ii,$Z1,$Z1
468         vmovdqu         $Z0,0x30(%rsp)          # slots are read at +8 by the callee, after call pushed the return address
469          vpshufb        $Ii,$Z2,$Z2
470         vmovdqu         $Z1,0x40(%rsp)
471          vpshufb        $Ii,$T2,$T2
472         vmovdqu         $Z2,0x50(%rsp)
473          vpshufb        $Ii,$Hkey,$Hkey
474         vmovdqu         $T2,0x60(%rsp)
475         vmovdqu         $Hkey,0x70(%rsp)
476
477         call            _aesni_ctr32_ghash_6x
478
479         vmovups         $inout0,-0x60($out)     # save output
480         vmovups         $inout1,-0x50($out)
481         vmovups         $inout2,-0x40($out)
482         vmovups         $inout3,-0x30($out)
483         vmovups         $inout4,-0x20($out)
484         vmovups         $inout5,-0x10($out)
485
486         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
487         vmovdqu         $Xi,-0x40($Xip)         # output Xi
488
489         vzeroupper
490 ___
491 $code.=<<___ if ($win64);
492         movaps  -0xd8(%rax),%xmm6
493         movaps  -0xc8(%rax),%xmm7               # own slot, matching the prologue save (was read from the xmm6 slot)
494         movaps  -0xb8(%rax),%xmm8
495         movaps  -0xa8(%rax),%xmm9
496         movaps  -0x98(%rax),%xmm10
497         movaps  -0x88(%rax),%xmm11
498         movaps  -0x78(%rax),%xmm12
499         movaps  -0x68(%rax),%xmm13
500         movaps  -0x58(%rax),%xmm14
501         movaps  -0x48(%rax),%xmm15
502 ___
503 $code.=<<___;
504         mov     -48(%rax),%r15                  # pushed registers are reloaded relative to the saved stack pointer
505         mov     -40(%rax),%r14
506         mov     -32(%rax),%r13
507         mov     -24(%rax),%r12
508         mov     -16(%rax),%rbp
509         mov     -8(%rax),%rbx
510         lea     (%rax),%rsp             # restore %rsp
511 .Lgcm_dec_abort:
512         mov     $ret,%rax               # return value
513         ret
514 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
515 ___
516
517 $code.=<<___;
518 .type   _aesni_ctr32_6x,\@abi-omnipotent
519 .align  32
520 _aesni_ctr32_6x:
521         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
522         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
523         lea             -1($rounds),%r13        # number of .Loop_ctr32 passes
524         vmovups         0x10-0x80($key),$rndkey
525         lea             0x20-0x80($key),%r12    # %r12 walks the remaining round keys
526         vpxor           $Z0,$T1,$inout0
527         add             \$`6<<24`,$counter      # add 6 to the top byte of $counter; carry means that byte wrapped
528         jc              .Lhandle_ctr32_2
529         vpaddb          $T2,$T1,$inout1
530         vpaddb          $T2,$inout1,$inout2
531         vpxor           $Z0,$inout1,$inout1
532         vpaddb          $T2,$inout2,$inout3
533         vpxor           $Z0,$inout2,$inout2
534         vpaddb          $T2,$inout3,$inout4
535         vpxor           $Z0,$inout3,$inout3
536         vpaddb          $T2,$inout4,$inout5
537         vpxor           $Z0,$inout4,$inout4
538         vpaddb          $T2,$inout5,$T1         # counter value for the next call
539         vpxor           $Z0,$inout5,$inout5
540         jmp             .Loop_ctr32
541 # One AES round over all six blocks per pass, loading the following round key as it goes.
542 .align  16
543 .Loop_ctr32:
544         vaesenc         $rndkey,$inout0,$inout0
545         vaesenc         $rndkey,$inout1,$inout1
546         vaesenc         $rndkey,$inout2,$inout2
547         vaesenc         $rndkey,$inout3,$inout3
548         vaesenc         $rndkey,$inout4,$inout4
549         vaesenc         $rndkey,$inout5,$inout5
550         vmovups         (%r12),$rndkey
551         lea             0x10(%r12),%r12
552         dec             %r13d
553         jnz             .Loop_ctr32
554 # The last round key is xored with the input first, so vaesenclast produces the output blocks directly.
555         vmovdqu         (%r12),$Hkey            # last round key
556         vaesenc         $rndkey,$inout0,$inout0
557         vpxor           0x00($inp),$Hkey,$Z0
558         vaesenc         $rndkey,$inout1,$inout1
559         vpxor           0x10($inp),$Hkey,$Z1
560         vaesenc         $rndkey,$inout2,$inout2
561         vpxor           0x20($inp),$Hkey,$Z2
562         vaesenc         $rndkey,$inout3,$inout3
563         vpxor           0x30($inp),$Hkey,$Xi
564         vaesenc         $rndkey,$inout4,$inout4
565         vpxor           0x40($inp),$Hkey,$T2
566         vaesenc         $rndkey,$inout5,$inout5
567         vpxor           0x50($inp),$Hkey,$Hkey
568         lea             0x60($inp),$inp
569
570         vaesenclast     $Z0,$inout0,$inout0
571         vaesenclast     $Z1,$inout1,$inout1
572         vaesenclast     $Z2,$inout2,$inout2
573         vaesenclast     $Xi,$inout3,$inout3
574         vaesenclast     $T2,$inout4,$inout4
575         vaesenclast     $Hkey,$inout5,$inout5
576         vmovups         $inout0,0x00($out)
577         vmovups         $inout1,0x10($out)
578         vmovups         $inout2,0x20($out)
579         vmovups         $inout3,0x30($out)
580         vmovups         $inout4,0x40($out)
581         vmovups         $inout5,0x50($out)
582         lea             0x60($out),$out
583 # Output blocks also remain in $inout0-$inout5 on return; $inp and $out have each moved on by 0x60.
584         ret
585 .align  32
586 .Lhandle_ctr32_2:
587         vpshufb         $Ii,$T1,$Z2             # byte-swap counter
588         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
589         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
590         vpaddd          $Z1,$Z2,$inout2
591         vpaddd          $Z1,$inout1,$inout3
592         vpshufb         $Ii,$inout1,$inout1
593         vpaddd          $Z1,$inout2,$inout4
594         vpshufb         $Ii,$inout2,$inout2
595         vpxor           $Z0,$inout1,$inout1
596         vpaddd          $Z1,$inout3,$inout5
597         vpshufb         $Ii,$inout3,$inout3
598         vpxor           $Z0,$inout2,$inout2
599         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
600         vpshufb         $Ii,$inout4,$inout4
601         vpxor           $Z0,$inout3,$inout3
602         vpshufb         $Ii,$inout5,$inout5
603         vpxor           $Z0,$inout4,$inout4
604         vpshufb         $Ii,$T1,$T1             # next counter value
605         vpxor           $Z0,$inout5,$inout5
606         jmp     .Loop_ctr32
607 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
608
609 .globl  aesni_gcm_encrypt
610 .type   aesni_gcm_encrypt,\@function,6
611 .align  32
612 aesni_gcm_encrypt:
613         xor     $ret,$ret
614         cmp     \$0x60*3,$len                   # minimal accepted length
615         jb      .Lgcm_enc_abort
616
617         lea     (%rsp),%rax                     # save stack pointer
618         push    %rbx
619         push    %rbp
620         push    %r12
621         push    %r13
622         push    %r14
623         push    %r15
624 ___
625 $code.=<<___ if ($win64);
626         lea     -0xa8(%rsp),%rsp
627         movaps  %xmm6,-0xd8(%rax)
628         movaps  %xmm7,-0xc8(%rax)
629         movaps  %xmm8,-0xb8(%rax)
630         movaps  %xmm9,-0xa8(%rax)
631         movaps  %xmm10,-0x98(%rax)
632         movaps  %xmm11,-0x88(%rax)
633         movaps  %xmm12,-0x78(%rax)
634         movaps  %xmm13,-0x68(%rax)
635         movaps  %xmm14,-0x58(%rax)
636         movaps  %xmm15,-0x48(%rax)
637 .Lgcm_enc_body:
638 ___
639 $code.=<<___;
640         vzeroupper
641
642         vmovdqu         ($ivp),$T1              # input counter value
643         add             \$-128,%rsp
644         mov             12($ivp),$counter
645         lea             .Lbswap_mask(%rip),$const
646         lea             -0x80($key),$in0        # borrow $in0
647         mov             \$0xf80,$end0           # borrow $end0
648         lea             0x80($key),$key         # size optimization
649         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
650         and             \$-128,%rsp             # ensure stack alignment
651         mov             0xf0-0x80($key),$rounds
652
653         and             $end0,$in0
654         and             %rsp,$end0
655         sub             $in0,$end0
656         jc              .Lenc_no_key_aliasing
657         cmp             \$768,$end0
658         jnc             .Lenc_no_key_aliasing
659         sub             $end0,%rsp              # avoid aliasing with key
660 .Lenc_no_key_aliasing:
661
662         lea             ($out),$in0
663         lea             -0xc0($out,$len),$end0
664         shr             \$4,$len
665
666         call            _aesni_ctr32_6x
667         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
668         vpshufb         $Ii,$inout1,$T2
669         vmovdqu         $Xi,0x70(%rsp)
670         vpshufb         $Ii,$inout2,$Z0
671         vmovdqu         $T2,0x60(%rsp)
672         vpshufb         $Ii,$inout3,$Z1
673         vmovdqu         $Z0,0x50(%rsp)
674         vpshufb         $Ii,$inout4,$Z2
675         vmovdqu         $Z1,0x40(%rsp)
676         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
677         vmovdqu         $Z2,0x30(%rsp)
678
679         call            _aesni_ctr32_6x
680
681         vmovdqu         ($Xip),$Xi              # load Xi
682         lea             0x20+0x20($Xip),$Xip    # size optimization
683         sub             \$12,$len
684         mov             \$0x60*2,$ret
685         vpshufb         $Ii,$Xi,$Xi
686
687         call            _aesni_ctr32_ghash_6x
688         vmovdqu         0x20(%rsp),$Z3          # I[5]
689          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
690         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
691         vpunpckhqdq     $Z3,$Z3,$T1
692         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
693          vmovups        $inout0,-0x60($out)     # save output
694          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
695         vpxor           $Z3,$T1,$T1
696          vmovups        $inout1,-0x50($out)
697          vpshufb        $Ii,$inout1,$inout1
698          vmovups        $inout2,-0x40($out)
699          vpshufb        $Ii,$inout2,$inout2
700          vmovups        $inout3,-0x30($out)
701          vpshufb        $Ii,$inout3,$inout3
702          vmovups        $inout4,-0x20($out)
703          vpshufb        $Ii,$inout4,$inout4
704          vmovups        $inout5,-0x10($out)
705          vpshufb        $Ii,$inout5,$inout5
706          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
707 ___
708 { my ($HK,$T3)=($rndkey,$inout0);
709
710 $code.=<<___;
711          vmovdqu        0x30(%rsp),$Z2          # I[4]
712          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
713          vpunpckhqdq    $Z2,$Z2,$T2
714         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
715          vpxor          $Z2,$T2,$T2
716         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
717         vpclmulqdq      \$0x00,$HK,$T1,$T1
718
719          vmovdqu        0x40(%rsp),$T3          # I[3]
720         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
721          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
722         vpxor           $Z1,$Z0,$Z0
723          vpunpckhqdq    $T3,$T3,$Z1
724         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
725          vpxor          $T3,$Z1,$Z1
726         vpxor           $Z3,$Z2,$Z2
727         vpclmulqdq      \$0x10,$HK,$T2,$T2
728          vmovdqu        0x50-0x20($Xip),$HK
729         vpxor           $T1,$T2,$T2
730
731          vmovdqu        0x50(%rsp),$T1          # I[2]
732         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
733          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
734         vpxor           $Z0,$Z3,$Z3
735          vpunpckhqdq    $T1,$T1,$Z0
736         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
737          vpxor          $T1,$Z0,$Z0
738         vpxor           $Z2,$T3,$T3
739         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
740         vpxor           $T2,$Z1,$Z1
741
742          vmovdqu        0x60(%rsp),$T2          # I[1]
743         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
744          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
745         vpxor           $Z3,$Z2,$Z2
746          vpunpckhqdq    $T2,$T2,$Z3
747         vpclmulqdq      \$0x11,$Ii,$T1,$T1
748          vpxor          $T2,$Z3,$Z3
749         vpxor           $T3,$T1,$T1
750         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
751          vmovdqu        0x80-0x20($Xip),$HK
752         vpxor           $Z1,$Z0,$Z0
753
754          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
755         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
756          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
757          vpunpckhqdq    $Xi,$Xi,$T3
758         vpxor           $Z2,$Z1,$Z1
759         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
760          vpxor          $Xi,$T3,$T3
761         vpxor           $T1,$T2,$T2
762         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
763         vpxor           $Z0,$Z3,$Z0
764
765         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
766          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
767          vpunpckhqdq    $inout5,$inout5,$T1
768         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
769          vpxor          $inout5,$T1,$T1
770         vpxor           $Z1,$Z2,$Z1
771         vpclmulqdq      \$0x10,$HK,$T3,$T3
772          vmovdqu        0x20-0x20($Xip),$HK
773         vpxor           $T2,$Xi,$Z3
774         vpxor           $Z0,$T3,$Z2
775
776          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
777           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
778         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
779           vpxor         $T3,$Z2,$Z2
780          vpunpckhqdq    $inout4,$inout4,$T2
781         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
782          vpxor          $inout4,$T2,$T2
783           vpslldq       \$8,$Z2,$T3
784         vpclmulqdq      \$0x00,$HK,$T1,$T1
785           vpxor         $T3,$Z1,$Xi
786           vpsrldq       \$8,$Z2,$Z2
787           vpxor         $Z2,$Z3,$Z3
788
789         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
790          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
791         vpxor           $Z0,$Z1,$Z1
792          vpunpckhqdq    $inout3,$inout3,$T3
793         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
794          vpxor          $inout3,$T3,$T3
795         vpxor           $inout5,$inout4,$inout4
796           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
797         vpclmulqdq      \$0x10,$HK,$T2,$T2
798          vmovdqu        0x50-0x20($Xip),$HK
799         vpxor           $T1,$T2,$T2
800
801         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
802          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
803         vpxor           $Z1,$Z0,$Z0
804          vpunpckhqdq    $inout2,$inout2,$T1
805         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
806          vpxor          $inout2,$T1,$T1
807         vpxor           $inout4,$inout3,$inout3
808           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
809         vpclmulqdq      \$0x00,$HK,$T3,$T3
810         vpxor           $T2,$T3,$T3
811
812           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
813           vxorps        $inout5,$Xi,$Xi
814
815         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
816          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
817         vpxor           $Z0,$Z1,$Z1
818          vpunpckhqdq    $inout1,$inout1,$T2
819         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
820          vpxor          $inout1,$T2,$T2
821           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
822         vpxor           $inout3,$inout2,$inout2
823         vpclmulqdq      \$0x10,$HK,$T1,$T1
824          vmovdqu        0x80-0x20($Xip),$HK
825         vpxor           $T3,$T1,$T1
826
827           vxorps        $Z3,$inout5,$inout5
828           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
829           vxorps        $inout5,$Xi,$Xi
830
831         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
832          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
833         vpxor           $Z1,$Z0,$Z0
834          vpunpckhqdq    $Xi,$Xi,$T3
835         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
836          vpxor          $Xi,$T3,$T3
837         vpxor           $inout2,$inout1,$inout1
838         vpclmulqdq      \$0x00,$HK,$T2,$T2
839         vpxor           $T1,$T2,$T2
840
841         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
842         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
843         vpxor           $Z0,$Z1,$Z1
844         vpclmulqdq      \$0x10,$HK,$T3,$Z2
845         vpxor           $inout1,$Z3,$Z3
846         vpxor           $T2,$Z2,$Z2
847
848         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
849         vpxor           $Z0,$Z2,$Z2
850         vpslldq         \$8,$Z2,$T1
851         vmovdqu         0x10($const),$Hkey      # .Lpoly
852         vpsrldq         \$8,$Z2,$Z2
853         vpxor           $T1,$Z1,$Xi
854         vpxor           $Z2,$Z3,$Z3
855
856         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
857         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
858         vpxor           $T2,$Xi,$Xi
859
860         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
861         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
862         vpxor           $Z3,$T2,$T2
863         vpxor           $T2,$Xi,$Xi
864 ___
865 }
866 $code.=<<___;
867         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
868         vmovdqu         $Xi,-0x40($Xip)         # output Xi
869
870         vzeroupper
871 ___
872 $code.=<<___ if ($win64);
873         movaps  -0xd8(%rax),%xmm6
874         movaps  -0xc8(%rax),%xmm7
875         movaps  -0xb8(%rax),%xmm8
876         movaps  -0xa8(%rax),%xmm9
877         movaps  -0x98(%rax),%xmm10
878         movaps  -0x88(%rax),%xmm11
879         movaps  -0x78(%rax),%xmm12
880         movaps  -0x68(%rax),%xmm13
881         movaps  -0x58(%rax),%xmm14
882         movaps  -0x48(%rax),%xmm15
883 ___
884 $code.=<<___;
885         mov     -48(%rax),%r15
886         mov     -40(%rax),%r14
887         mov     -32(%rax),%r13
888         mov     -24(%rax),%r12
889         mov     -16(%rax),%rbp
890         mov     -8(%rax),%rbx
891         lea     (%rax),%rsp             # restore %rsp
892 .Lgcm_enc_abort:
893         mov     $ret,%rax               # return value
894         ret
895 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
896 ___
897
# Constant pool appended after the code: five consecutive 16-byte tables
# starting on a 64-byte boundary, followed by an identification string.
# The encrypt tail addresses .Lbswap_mask as ($const) and .Lpoly as
# 0x10($const), so the first two tables must stay adjacent and in this order.
# .Lpoly is the multiplier used by the two reduction phases of the hash.
# .Lone_msb has the value 1 in its last byte, .Ltwo_lsb the value 2 in its
# first byte and .Lone_lsb the value 1 in its first byte; presumably these
# are counter increments -- their users are outside this part of the file
# (TODO confirm).
# "\@" stops the interpolating heredoc from treating "@openssl" as an array.
898 $code.=<<___;
899 .align  64
900 .Lbswap_mask:
901         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
902 .Lpoly:
903         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
904 .Lone_msb:
905         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
906 .Ltwo_lsb:
907         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
908 .Lone_lsb:
909         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
910 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
911 .align  64
912 ___
# Win64 only: emit the exception handler gcm_se_handler together with the
# .pdata/.xdata records that attach it to aesni_gcm_decrypt and
# aesni_gcm_encrypt.
913 if ($win64) {
# Registers in which the handler receives its four arguments.  Only $context
# and $disp are referenced in the handler text below; $rec and $frame are
# assigned but not used there.
914 $rec="%rcx";
915 $frame="%rdx";
916 $context="%r8";
917 $disp="%r9";
918
# What the handler below does, keyed on context->Rip:
#  - below HandlerData[0] (the .Lgcm_*_body label from .xdata): %rax keeps
#    context->Rax and control goes straight to the common tail;
#  - at or above HandlerData[1] (the .Lgcm_*_abort label): %rax is loaded
#    with context->Rsp and control goes to the common tail;
#  - in between: %rax is context->Rax again; %r15..%rbx are reloaded from
#    -48..-8(%rax) and written into the CONTEXT, then 20 quadwords (ten %xmm
#    registers) are copied from -0xd8(%rax) to context+512.  These are the
#    same offsets the encrypt epilogue restores from.
# The common tail picks %rdi/%rsi up from 8(%rax)/16(%rax), records %rax as
# context->Rsp, copies the 154-quadword CONTEXT to disp->ContextRecord,
# calls RtlVirtualUnwind and returns 1 (ExceptionContinueSearch).
# The heredoc statement has no trailing ';' -- Perl allows that for the last
# statement in a block.
919 $code.=<<___
920 .extern __imp_RtlVirtualUnwind
921 .type   gcm_se_handler,\@abi-omnipotent
922 .align  16
923 gcm_se_handler:
924         push    %rsi
925         push    %rdi
926         push    %rbx
927         push    %rbp
928         push    %r12
929         push    %r13
930         push    %r14
931         push    %r15
932         pushfq
933         sub     \$64,%rsp
934
935         mov     120($context),%rax      # pull context->Rax
936         mov     248($context),%rbx      # pull context->Rip
937
938         mov     8($disp),%rsi           # disp->ImageBase
939         mov     56($disp),%r11          # disp->HandlerData
940
941         mov     0(%r11),%r10d           # HandlerData[0]
942         lea     (%rsi,%r10),%r10        # prologue label
943         cmp     %r10,%rbx               # context->Rip<prologue label
944         jb      .Lcommon_seh_tail
945
946         mov     152($context),%rax      # pull context->Rsp
947
948         mov     4(%r11),%r10d           # HandlerData[1]
949         lea     (%rsi,%r10),%r10        # epilogue label
950         cmp     %r10,%rbx               # context->Rip>=epilogue label
951         jae     .Lcommon_seh_tail
952
953         mov     120($context),%rax      # pull context->Rax
954
955         mov     -48(%rax),%r15
956         mov     -40(%rax),%r14
957         mov     -32(%rax),%r13
958         mov     -24(%rax),%r12
959         mov     -16(%rax),%rbp
960         mov     -8(%rax),%rbx
961         mov     %r15,240($context)
962         mov     %r14,232($context)
963         mov     %r13,224($context)
964         mov     %r12,216($context)
965         mov     %rbp,160($context)
966         mov     %rbx,144($context)
967
968         lea     -0xd8(%rax),%rsi        # %xmm save area
969         lea     512($context),%rdi      # & context.Xmm6
970         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
971         .long   0xa548f3fc              # cld; rep movsq
972
973 .Lcommon_seh_tail:
974         mov     8(%rax),%rdi
975         mov     16(%rax),%rsi
976         mov     %rax,152($context)      # restore context->Rsp
977         mov     %rsi,168($context)      # restore context->Rsi
978         mov     %rdi,176($context)      # restore context->Rdi
979
980         mov     40($disp),%rdi          # disp->ContextRecord
981         mov     $context,%rsi           # context
982         mov     \$154,%ecx              # sizeof(CONTEXT)
983         .long   0xa548f3fc              # cld; rep movsq
984
985         mov     $disp,%rsi
986         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
987         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
988         mov     0(%rsi),%r8             # arg3, disp->ControlPc
989         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
990         mov     40(%rsi),%r10           # disp->ContextRecord
991         lea     56(%rsi),%r11           # &disp->HandlerData
992         lea     24(%rsi),%r12           # &disp->EstablisherFrame
993         mov     %r10,32(%rsp)           # arg5
994         mov     %r11,40(%rsp)           # arg6
995         mov     %r12,48(%rsp)           # arg7
996         mov     %rcx,56(%rsp)           # arg8, (NULL)
997         call    *__imp_RtlVirtualUnwind(%rip)
998
999         mov     \$1,%eax                # ExceptionContinueSearch
1000         add     \$64,%rsp
1001         popfq
1002         pop     %r15
1003         pop     %r14
1004         pop     %r13
1005         pop     %r12
1006         pop     %rbp
1007         pop     %rbx
1008         pop     %rdi
1009         pop     %rsi
1010         ret
1011 .size   gcm_se_handler,.-gcm_se_handler
1012
1013 .section        .pdata
1014 .align  4
1015         .rva    .LSEH_begin_aesni_gcm_decrypt
1016         .rva    .LSEH_end_aesni_gcm_decrypt
1017         .rva    .LSEH_gcm_dec_info
1018
1019         .rva    .LSEH_begin_aesni_gcm_encrypt
1020         .rva    .LSEH_end_aesni_gcm_encrypt
1021         .rva    .LSEH_gcm_enc_info
1022 .section        .xdata
1023 .align  8
1024 .LSEH_gcm_dec_info:
1025         .byte   9,0,0,0
1026         .rva    gcm_se_handler
1027         .rva    .Lgcm_dec_body,.Lgcm_dec_abort
1028 .LSEH_gcm_enc_info:
1029         .byte   9,0,0,0
1030         .rva    gcm_se_handler
1031         .rva    .Lgcm_enc_body,.Lgcm_enc_abort
1032 ___
1033 }
1034 }}} else {{{
# Fallback output: aesni_gcm_encrypt and aesni_gcm_decrypt are still defined
# as global symbols, but each only clears %eax and returns, so both always
# yield 0.
# NOTE(review): callers presumably read 0 as "nothing was processed" and use
# another code path -- confirm against the callers.
# $code is assigned here (=), not appended to (.=).
1035 $code=<<___;    # assembler is too old
1036 .text
1037
1038 .globl  aesni_gcm_encrypt
1039 .type   aesni_gcm_encrypt,\@abi-omnipotent
1040 aesni_gcm_encrypt:
1041         xor     %eax,%eax
1042         ret
1043 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1044
1045 .globl  aesni_gcm_decrypt
1046 .type   aesni_gcm_decrypt,\@abi-omnipotent
1047 aesni_gcm_decrypt:
1048         xor     %eax,%eax
1049         ret
1050 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1051 ___
1052 }}}
1052 }}}
1053
# Post-process and emit the generated assembly.
#
# 1. Every `...` span in $code is replaced by the result of evaluating its
#    contents as a Perl expression.
# 2. The text is written to STDOUT and STDOUT is closed explicitly.
#
# Both the write and the close are checked: buffered write errors (full
# disk, broken pipe) often only surface at close time, and ignoring them
# would let the script exit with status 0 after producing truncated output.
# NOTE(review): if STDOUT was redirected into a pipe earlier in the script
# (not visible in this part of the file), close also fails when the process
# at the other end exits non-zero; $? is included in the message for that
# case because $! is not meaningful then.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print($code) or die "error writing output: $!";

close STDOUT or die "error closing STDOUT: $! (status $?)";