837c06266c79afdbaf7359d7ead7f4143436da05
[openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 #
18 # AES-NI-CTR+GHASH stitch.
19 #
20 # February 2013
21 #
22 # OpenSSL GCM implementation is organized in such a way that its
23 # performance is rather close to the sum of its streamed components,
24 # in this context parallelized AES-NI CTR and modulo-scheduled
25 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
26 # was observed to perform significantly better than the sum of the
27 # components on contemporary CPUs, the effort was deemed impossible to
28 # justify. This module is based on combination of Intel submissions,
29 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
30 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
31 # pressure with notable relative improvement, achieving 1.0 cycle per
32 # byte processed with 128-bit key on Haswell processor, 0.74 - on
33 # Broadwell, 0.63 - on Skylake... [Mentioned results are raw profiled
34 # measurements for favourable packet size, one divisible by 96.
35 # Applications using the EVP interface will observe a few percent
36 # worse performance.]
37 #
38 # Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
39 #
40 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
41 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
42
43 # $output is the last argument if it looks like a file (it has an extension)
44 # $flavour is the first argument if it doesn't look like a file
45 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
46 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
47 # Win64 conventions are selected by a nasm/masm/mingw64 flavour or by an output file ending in .asm.
48 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
49 # Locate the perlasm translator: first next to this script, then in ../../perlasm/ relative to it.
50 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;  # $dir = directory part of this script's path (with trailing separator)
51 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
52 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
53 die "can't locate x86_64-xlate.pl";
54 # Probe the toolchain and set $avx to 0, 1 or 2 from its version; the code below is only emitted when $avx>1.
55 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
56                 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
57         $avx = ($1>=2.20) + ($1>=2.22);
58 }
59 # NASM and MASM are consulted only for Win64 builds, and only if no earlier probe set $avx.
60 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
61             `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
62         $avx = ($1>=2.09) + ($1>=2.10);
63 }
64
65 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
66             `ml64 2>&1` =~ /Version ([0-9]+)\./) {
67         $avx = ($1>=10) + ($1>=11);
68 }
69 # clang/LLVM: accept multi-digit major versions (10.0 and later) and vendor-prefixed banners such as "Apple clang version".
70 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
71         $avx = ($2>=3.0) + ($2>3.0);  # 3.0 -> 1, anything newer -> 2, older -> 0
72 }
73 # Run x86_64-xlate.pl under the current perl ($^X), passing it the flavour and output name, and feed it via a pipe.
74 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
75     or die "can't call $xlate: $!";
76 *STDOUT=*OUT;  # from here on, everything printed to STDOUT goes to the translator
77
78 if ($avx>1) {{{
79 # The six function arguments, in the order of the aesni_gcm_[en|de]crypt prototype (System V argument registers).
80 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
81 # %xmm0-%xmm8: GHASH working set (input block, temporaries, hash key power, partial products, hash state Xi).
82 ($Ii,$T1,$T2,$Hkey,
83  $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
84 # %xmm9-%xmm14: the six counter/output blocks in flight; %xmm15: current AES round key.
85 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
86 # Scalars: 32-bit counter word, round count, byte-count return value, constant-table pointer, hash-input cursor and its limit.
87 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
88
89 $code=<<___;
90 .text
91 # _aesni_ctr32_ghash_6x: helper called by both entry points; each .Loop6x pass runs the AES rounds for six counter blocks interleaved with GHASH over six already byte-swapped blocks (I[5] in $Z3, I[4]..I[0] in stack slots).
92 .type   _aesni_ctr32_ghash_6x,\@abi-omnipotent
93 .align  32
94 _aesni_ctr32_ghash_6x:
95 .cfi_startproc
96         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
97         sub             \$6,$len                # $len counts 16-byte blocks (callers shift it right by 4)
98         vpxor           $Z0,$Z0,$Z0             # $Z0   = 0
99         vmovdqu         0x00-0x80($key),$rndkey
100         vpaddb          $T2,$T1,$inout1
101         vpaddb          $T2,$inout1,$inout2
102         vpaddb          $T2,$inout2,$inout3
103         vpaddb          $T2,$inout3,$inout4
104         vpaddb          $T2,$inout4,$inout5
105         vpxor           $rndkey,$T1,$inout0
106         vmovdqu         $Z0,16+8(%rsp)          # "$Z3" = 0
107         jmp             .Loop6x
108 # Fast path bumps only the last byte of the counter block (top byte of $counter) via vpaddb; a carry out of that byte diverts to .Lhandle_ctr32.
109 .align  32
110 .Loop6x:
111         add             \$`6<<24`,$counter
112         jc              .Lhandle_ctr32          # discard $inout[1-5]?
113         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
114           vpaddb        $T2,$inout5,$T1         # next counter value
115           vpxor         $rndkey,$inout1,$inout1
116           vpxor         $rndkey,$inout2,$inout2
117 # Both the fast path and .Lhandle_ctr32 arrive here with $T1 holding the next counter value.
118 .Lresume_ctr32:
119         vmovdqu         $T1,($ivp)              # save next counter value
120         vpclmulqdq      \$0x10,$Hkey,$Z3,$Z1
121           vpxor         $rndkey,$inout3,$inout3
122           vmovups       0x10-0x80($key),$T2     # borrow $T2 for $rndkey
123         vpclmulqdq      \$0x01,$Hkey,$Z3,$Z2
124         xor             %r12,%r12
125         cmp             $in0,$end0              # CF stays clear while $in0 has not passed $end0
126 # %r12 is turned into a stride below (setnc/neg/and): 0x60 while $in0 has not passed $end0, otherwise 0.
127           vaesenc       $T2,$inout0,$inout0
128         vmovdqu         0x30+8(%rsp),$Ii        # I[4]
129           vpxor         $rndkey,$inout4,$inout4
130         vpclmulqdq      \$0x00,$Hkey,$Z3,$T1
131           vaesenc       $T2,$inout1,$inout1
132           vpxor         $rndkey,$inout5,$inout5
133         setnc           %r12b
134         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
135           vaesenc       $T2,$inout2,$inout2
136         vmovdqu         0x10-0x20($Xip),$Hkey   # $Hkey^2
137         neg             %r12
138           vaesenc       $T2,$inout3,$inout3
139          vpxor          $Z1,$Z2,$Z2
140         vpclmulqdq      \$0x00,$Hkey,$Ii,$Z1
141          vpxor          $Z0,$Xi,$Xi             # modulo-scheduled
142           vaesenc       $T2,$inout4,$inout4
143          vpxor          $Z1,$T1,$Z0
144         and             \$0x60,%r12
145           vmovups       0x20-0x80($key),$rndkey
146         vpclmulqdq      \$0x10,$Hkey,$Ii,$T1
147           vaesenc       $T2,$inout5,$inout5
148 # The movbe loads from here on fetch six blocks byte-swapped from the hash-input cursor and park them in the stack slots that the next pass reads as I[5]..I[0].
149         vpclmulqdq      \$0x01,$Hkey,$Ii,$T2
150         lea             ($in0,%r12),$in0        # advance the hash-input cursor by the stride computed above
151           vaesenc       $rndkey,$inout0,$inout0
152          vpxor          16+8(%rsp),$Xi,$Xi      # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
153         vpclmulqdq      \$0x11,$Hkey,$Ii,$Hkey
154          vmovdqu        0x40+8(%rsp),$Ii        # I[3]
155           vaesenc       $rndkey,$inout1,$inout1
156         movbe           0x58($in0),%r13
157           vaesenc       $rndkey,$inout2,$inout2
158         movbe           0x50($in0),%r12
159           vaesenc       $rndkey,$inout3,$inout3
160         mov             %r13,0x20+8(%rsp)
161           vaesenc       $rndkey,$inout4,$inout4
162         mov             %r12,0x28+8(%rsp)
163         vmovdqu         0x30-0x20($Xip),$Z1     # borrow $Z1 for $Hkey^3
164           vaesenc       $rndkey,$inout5,$inout5
165
166           vmovups       0x30-0x80($key),$rndkey
167          vpxor          $T1,$Z2,$Z2
168         vpclmulqdq      \$0x00,$Z1,$Ii,$T1
169           vaesenc       $rndkey,$inout0,$inout0
170          vpxor          $T2,$Z2,$Z2
171         vpclmulqdq      \$0x10,$Z1,$Ii,$T2
172           vaesenc       $rndkey,$inout1,$inout1
173          vpxor          $Hkey,$Z3,$Z3
174         vpclmulqdq      \$0x01,$Z1,$Ii,$Hkey
175           vaesenc       $rndkey,$inout2,$inout2
176         vpclmulqdq      \$0x11,$Z1,$Ii,$Z1
177          vmovdqu        0x50+8(%rsp),$Ii        # I[2]
178           vaesenc       $rndkey,$inout3,$inout3
179           vaesenc       $rndkey,$inout4,$inout4
180          vpxor          $T1,$Z0,$Z0
181         vmovdqu         0x40-0x20($Xip),$T1     # borrow $T1 for $Hkey^4
182           vaesenc       $rndkey,$inout5,$inout5
183
184           vmovups       0x40-0x80($key),$rndkey
185          vpxor          $T2,$Z2,$Z2
186         vpclmulqdq      \$0x00,$T1,$Ii,$T2
187           vaesenc       $rndkey,$inout0,$inout0
188          vpxor          $Hkey,$Z2,$Z2
189         vpclmulqdq      \$0x10,$T1,$Ii,$Hkey
190           vaesenc       $rndkey,$inout1,$inout1
191         movbe           0x48($in0),%r13
192          vpxor          $Z1,$Z3,$Z3
193         vpclmulqdq      \$0x01,$T1,$Ii,$Z1
194           vaesenc       $rndkey,$inout2,$inout2
195         movbe           0x40($in0),%r12
196         vpclmulqdq      \$0x11,$T1,$Ii,$T1
197          vmovdqu        0x60+8(%rsp),$Ii        # I[1]
198           vaesenc       $rndkey,$inout3,$inout3
199         mov             %r13,0x30+8(%rsp)
200           vaesenc       $rndkey,$inout4,$inout4
201         mov             %r12,0x38+8(%rsp)
202          vpxor          $T2,$Z0,$Z0
203         vmovdqu         0x60-0x20($Xip),$T2     # borrow $T2 for $Hkey^5
204           vaesenc       $rndkey,$inout5,$inout5
205
206           vmovups       0x50-0x80($key),$rndkey
207          vpxor          $Hkey,$Z2,$Z2
208         vpclmulqdq      \$0x00,$T2,$Ii,$Hkey
209           vaesenc       $rndkey,$inout0,$inout0
210          vpxor          $Z1,$Z2,$Z2
211         vpclmulqdq      \$0x10,$T2,$Ii,$Z1
212           vaesenc       $rndkey,$inout1,$inout1
213         movbe           0x38($in0),%r13
214          vpxor          $T1,$Z3,$Z3
215         vpclmulqdq      \$0x01,$T2,$Ii,$T1
216          vpxor          0x70+8(%rsp),$Xi,$Xi    # accumulate I[0]
217           vaesenc       $rndkey,$inout2,$inout2
218         movbe           0x30($in0),%r12
219         vpclmulqdq      \$0x11,$T2,$Ii,$T2
220           vaesenc       $rndkey,$inout3,$inout3
221         mov             %r13,0x40+8(%rsp)
222           vaesenc       $rndkey,$inout4,$inout4
223         mov             %r12,0x48+8(%rsp)
224          vpxor          $Hkey,$Z0,$Z0
225          vmovdqu        0x70-0x20($Xip),$Hkey   # $Hkey^6
226           vaesenc       $rndkey,$inout5,$inout5
227
228           vmovups       0x60-0x80($key),$rndkey
229          vpxor          $Z1,$Z2,$Z2
230         vpclmulqdq      \$0x10,$Hkey,$Xi,$Z1
231           vaesenc       $rndkey,$inout0,$inout0
232          vpxor          $T1,$Z2,$Z2
233         vpclmulqdq      \$0x01,$Hkey,$Xi,$T1
234           vaesenc       $rndkey,$inout1,$inout1
235         movbe           0x28($in0),%r13
236          vpxor          $T2,$Z3,$Z3
237         vpclmulqdq      \$0x00,$Hkey,$Xi,$T2
238           vaesenc       $rndkey,$inout2,$inout2
239         movbe           0x20($in0),%r12
240         vpclmulqdq      \$0x11,$Hkey,$Xi,$Xi
241           vaesenc       $rndkey,$inout3,$inout3
242         mov             %r13,0x50+8(%rsp)
243           vaesenc       $rndkey,$inout4,$inout4
244         mov             %r12,0x58+8(%rsp)
245         vpxor           $Z1,$Z2,$Z2
246           vaesenc       $rndkey,$inout5,$inout5
247         vpxor           $T1,$Z2,$Z2
248 # $Z2 is split across $Z0 (shifted left 8 bytes) and $Z3 (shifted right 8 bytes), then the two-phase reduction by .Lpoly begins.
249           vmovups       0x70-0x80($key),$rndkey
250         vpslldq         \$8,$Z2,$Z1
251         vpxor           $T2,$Z0,$Z0
252         vmovdqu         0x10($const),$Hkey      # .Lpoly
253
254           vaesenc       $rndkey,$inout0,$inout0
255         vpxor           $Xi,$Z3,$Z3
256           vaesenc       $rndkey,$inout1,$inout1
257         vpxor           $Z1,$Z0,$Z0
258         movbe           0x18($in0),%r13
259           vaesenc       $rndkey,$inout2,$inout2
260         movbe           0x10($in0),%r12
261         vpalignr        \$8,$Z0,$Z0,$Ii         # 1st phase
262         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
263         mov             %r13,0x60+8(%rsp)
264           vaesenc       $rndkey,$inout3,$inout3
265         mov             %r12,0x68+8(%rsp)
266           vaesenc       $rndkey,$inout4,$inout4
267           vmovups       0x80-0x80($key),$T1     # borrow $T1 for $rndkey
268           vaesenc       $rndkey,$inout5,$inout5
269
270           vaesenc       $T1,$inout0,$inout0
271           vmovups       0x90-0x80($key),$rndkey
272           vaesenc       $T1,$inout1,$inout1
273         vpsrldq         \$8,$Z2,$Z2
274           vaesenc       $T1,$inout2,$inout2
275         vpxor           $Z2,$Z3,$Z3
276           vaesenc       $T1,$inout3,$inout3
277         vpxor           $Ii,$Z0,$Z0
278         movbe           0x08($in0),%r13
279           vaesenc       $T1,$inout4,$inout4
280         movbe           0x00($in0),%r12
281           vaesenc       $T1,$inout5,$inout5
282           vmovups       0xa0-0x80($key),$T1
283           cmp           \$11,$rounds
284           jb            .Lenc_tail              # 128-bit key
285 # Two more rounds for longer keys; the flags of the cmp above stay live for the je below because no instruction in between writes them.
286           vaesenc       $rndkey,$inout0,$inout0
287           vaesenc       $rndkey,$inout1,$inout1
288           vaesenc       $rndkey,$inout2,$inout2
289           vaesenc       $rndkey,$inout3,$inout3
290           vaesenc       $rndkey,$inout4,$inout4
291           vaesenc       $rndkey,$inout5,$inout5
292
293           vaesenc       $T1,$inout0,$inout0
294           vaesenc       $T1,$inout1,$inout1
295           vaesenc       $T1,$inout2,$inout2
296           vaesenc       $T1,$inout3,$inout3
297           vaesenc       $T1,$inout4,$inout4
298           vmovups       0xb0-0x80($key),$rndkey
299           vaesenc       $T1,$inout5,$inout5
300           vmovups       0xc0-0x80($key),$T1
301           je            .Lenc_tail              # 192-bit key
302
303           vaesenc       $rndkey,$inout0,$inout0
304           vaesenc       $rndkey,$inout1,$inout1
305           vaesenc       $rndkey,$inout2,$inout2
306           vaesenc       $rndkey,$inout3,$inout3
307           vaesenc       $rndkey,$inout4,$inout4
308           vaesenc       $rndkey,$inout5,$inout5
309
310           vaesenc       $T1,$inout0,$inout0
311           vaesenc       $T1,$inout1,$inout1
312           vaesenc       $T1,$inout2,$inout2
313           vaesenc       $T1,$inout3,$inout3
314           vaesenc       $T1,$inout4,$inout4
315           vmovups       0xd0-0x80($key),$rndkey
316           vaesenc       $T1,$inout5,$inout5
317           vmovups       0xe0-0x80($key),$T1
318           jmp           .Lenc_tail              # 256-bit key
319 # Slow path for counter wrap: byte-swap the counter, add with 32-bit vpaddd, swap back, then rejoin at .Lresume_ctr32.
320 .align  32
321 .Lhandle_ctr32:
322         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
323           vpshufb       $Ii,$T1,$Z2             # byte-swap counter
324           vmovdqu       0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
325           vpaddd        0x40($const),$Z2,$inout1        # .Lone_lsb
326           vpaddd        $Z1,$Z2,$inout2
327         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
328           vpaddd        $Z1,$inout1,$inout3
329           vpshufb       $Ii,$inout1,$inout1
330           vpaddd        $Z1,$inout2,$inout4
331           vpshufb       $Ii,$inout2,$inout2
332           vpxor         $rndkey,$inout1,$inout1
333           vpaddd        $Z1,$inout3,$inout5
334           vpshufb       $Ii,$inout3,$inout3
335           vpxor         $rndkey,$inout2,$inout2
336           vpaddd        $Z1,$inout4,$T1         # byte-swapped next counter value
337           vpshufb       $Ii,$inout4,$inout4
338           vpshufb       $Ii,$inout5,$inout5
339           vpshufb       $Ii,$T1,$T1             # next counter value
340         jmp             .Lresume_ctr32
341 # .Lenc_tail: $T1 holds the last round key; it is XORed with each input block first so that vaesenclast yields the output block directly.
342 .align  32
343 .Lenc_tail:
344           vaesenc       $rndkey,$inout0,$inout0
345         vmovdqu         $Z3,16+8(%rsp)          # postpone vpxor $Z3,$Xi,$Xi
346         vpalignr        \$8,$Z0,$Z0,$Xi         # 2nd phase
347           vaesenc       $rndkey,$inout1,$inout1
348         vpclmulqdq      \$0x10,$Hkey,$Z0,$Z0
349           vpxor         0x00($inp),$T1,$T2
350           vaesenc       $rndkey,$inout2,$inout2
351           vpxor         0x10($inp),$T1,$Ii
352           vaesenc       $rndkey,$inout3,$inout3
353           vpxor         0x20($inp),$T1,$Z1
354           vaesenc       $rndkey,$inout4,$inout4
355           vpxor         0x30($inp),$T1,$Z2
356           vaesenc       $rndkey,$inout5,$inout5
357           vpxor         0x40($inp),$T1,$Z3
358           vpxor         0x50($inp),$T1,$Hkey
359           vmovdqu       ($ivp),$T1              # load next counter value
360
361           vaesenclast   $T2,$inout0,$inout0
362           vmovdqu       0x20($const),$T2        # borrow $T2, .Lone_msb
363           vaesenclast   $Ii,$inout1,$inout1
364          vpaddb         $T2,$T1,$Ii
365         mov             %r13,0x70+8(%rsp)
366         lea             0x60($inp),$inp
367           vaesenclast   $Z1,$inout2,$inout2
368          vpaddb         $T2,$Ii,$Z1
369         mov             %r12,0x78+8(%rsp)
370         lea             0x60($out),$out
371           vmovdqu       0x00-0x80($key),$rndkey
372           vaesenclast   $Z2,$inout3,$inout3
373          vpaddb         $T2,$Z1,$Z2
374           vaesenclast   $Z3, $inout4,$inout4
375          vpaddb         $T2,$Z2,$Z3
376           vaesenclast   $Hkey,$inout5,$inout5
377          vpaddb         $T2,$Z3,$Hkey
378 # $ret accumulates the number of bytes processed; the loop ends once $len (counted in blocks) underflows.
379         add             \$0x60,$ret
380         sub             \$0x6,$len
381         jc              .L6x_done
382
383           vmovups       $inout0,-0x60($out)     # save output
384          vpxor          $rndkey,$T1,$inout0
385           vmovups       $inout1,-0x50($out)
386          vmovdqa        $Ii,$inout1             # 0 latency
387           vmovups       $inout2,-0x40($out)
388          vmovdqa        $Z1,$inout2             # 0 latency
389           vmovups       $inout3,-0x30($out)
390          vmovdqa        $Z2,$inout3             # 0 latency
391           vmovups       $inout4,-0x20($out)
392          vmovdqa        $Z3,$inout4             # 0 latency
393           vmovups       $inout5,-0x10($out)
394          vmovdqa        $Hkey,$inout5           # 0 latency
395         vmovdqu         0x20+8(%rsp),$Z3        # I[5]
396         jmp             .Loop6x
397 # On exit the final six output blocks are still in %xmm9-%xmm14 (the stores above are skipped); the callers write them out.
398 .L6x_done:
399         vpxor           16+8(%rsp),$Xi,$Xi      # modulo-scheduled
400         vpxor           $Z0,$Xi,$Xi             # modulo-scheduled
401
402         ret
403 .cfi_endproc
404 .size   _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
405 ___
406 ######################################################################
407 # aesni_gcm_decrypt below returns the number of bytes it processed (a multiple of 0x60), or 0 if len is under 0x60.
408 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
409 #               const AES_KEY *key, unsigned char iv[16],
410 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
411 $code.=<<___;
412 .globl  aesni_gcm_decrypt
413 .type   aesni_gcm_decrypt,\@function,6
414 .align  32
415 aesni_gcm_decrypt:
416 .cfi_startproc
417         xor     $ret,$ret
418         cmp     \$0x60,$len                     # minimal accepted length
419         jb      .Lgcm_dec_abort                 # too short: return 0 without touching anything
420 # %rax keeps the entry stack pointer; the saved registers are reloaded relative to it on exit.
421         lea     (%rsp),%rax                     # save stack pointer
422 .cfi_def_cfa_register   %rax
423         push    %rbx
424 .cfi_push       %rbx
425         push    %rbp
426 .cfi_push       %rbp
427         push    %r12
428 .cfi_push       %r12
429         push    %r13
430 .cfi_push       %r13
431         push    %r14
432 .cfi_push       %r14
433         push    %r15
434 .cfi_push       %r15
435 ___
436 $code.=<<___ if ($win64);
437         lea     -0xa8(%rsp),%rsp                # save area for %xmm6-%xmm15
438         movaps  %xmm6,-0xd8(%rax)
439         movaps  %xmm7,-0xc8(%rax)
440         movaps  %xmm8,-0xb8(%rax)
441         movaps  %xmm9,-0xa8(%rax)
442         movaps  %xmm10,-0x98(%rax)
443         movaps  %xmm11,-0x88(%rax)
444         movaps  %xmm12,-0x78(%rax)
445         movaps  %xmm13,-0x68(%rax)
446         movaps  %xmm14,-0x58(%rax)
447         movaps  %xmm15,-0x48(%rax)
448 .Lgcm_dec_body:
449 ___
450 $code.=<<___;
451         vzeroupper
452 # Reserve a 128-byte-aligned scratch frame while loading the counter, hash state and constant-table pointer.
453         vmovdqu         ($ivp),$T1              # input counter value
454         add             \$-128,%rsp
455         mov             12($ivp),$counter
456         lea             .Lbswap_mask(%rip),$const
457         lea             -0x80($key),$in0        # borrow $in0
458         mov             \$0xf80,$end0           # borrow $end0
459         vmovdqu         ($Xip),$Xi              # load Xi
460         and             \$-128,%rsp             # ensure stack alignment
461         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
462         lea             0x80($key),$key         # size optimization
463         lea             0x20+0x20($Xip),$Xip    # size optimization
464         mov             0xf0-0x80($key),$rounds
465         vpshufb         $Ii,$Xi,$Xi
466 # Compare address bits 7..11 of the frame and of the key schedule; if the frame sits less than 768 bytes above the key in that window, lower %rsp by the difference.
467         and             $end0,$in0
468         and             %rsp,$end0
469         sub             $in0,$end0
470         jc              .Ldec_no_key_aliasing
471         cmp             \$768,$end0
472         jnc             .Ldec_no_key_aliasing
473         sub             $end0,%rsp              # avoid aliasing with key
474 .Ldec_no_key_aliasing:
475 # Byte-swap the first six input blocks for GHASH: I[5] stays in $Z3, the other five go to 0x30..0x70(%rsp).
476         vmovdqu         0x50($inp),$Z3          # I[5]
477         lea             ($inp),$in0             # the hash-input cursor walks the input (ciphertext) itself
478         vmovdqu         0x40($inp),$Z0
479         lea             -0xc0($inp,$len),$end0  # limit beyond which the helper stops advancing the cursor
480         vmovdqu         0x30($inp),$Z1
481         shr             \$4,$len                # bytes -> 16-byte blocks
482         xor             $ret,$ret
483         vmovdqu         0x20($inp),$Z2
484          vpshufb        $Ii,$Z3,$Z3             # passed to _aesni_ctr32_ghash_6x
485         vmovdqu         0x10($inp),$T2
486          vpshufb        $Ii,$Z0,$Z0
487         vmovdqu         ($inp),$Hkey
488          vpshufb        $Ii,$Z1,$Z1
489         vmovdqu         $Z0,0x30(%rsp)
490          vpshufb        $Ii,$Z2,$Z2
491         vmovdqu         $Z1,0x40(%rsp)
492          vpshufb        $Ii,$T2,$T2
493         vmovdqu         $Z2,0x50(%rsp)
494          vpshufb        $Ii,$Hkey,$Hkey
495         vmovdqu         $T2,0x60(%rsp)
496         vmovdqu         $Hkey,0x70(%rsp)
497
498         call            _aesni_ctr32_ghash_6x
499 # The helper returns with the last six output blocks still in registers, and with $out already advanced past them.
500         vmovups         $inout0,-0x60($out)     # save output
501         vmovups         $inout1,-0x50($out)
502         vmovups         $inout2,-0x40($out)
503         vmovups         $inout3,-0x30($out)
504         vmovups         $inout4,-0x20($out)
505         vmovups         $inout5,-0x10($out)
506 # $Xip was advanced by 0x40 above, so -0x40 addresses the original Xi field.
507         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
508         vmovdqu         $Xi,-0x40($Xip)         # output Xi
509
510         vzeroupper
511 ___
512 $code.=<<___ if ($win64);
513         movaps  -0xd8(%rax),%xmm6
514         movaps  -0xc8(%rax),%xmm7
515         movaps  -0xb8(%rax),%xmm8
516         movaps  -0xa8(%rax),%xmm9
517         movaps  -0x98(%rax),%xmm10
518         movaps  -0x88(%rax),%xmm11
519         movaps  -0x78(%rax),%xmm12
520         movaps  -0x68(%rax),%xmm13
521         movaps  -0x58(%rax),%xmm14
522         movaps  -0x48(%rax),%xmm15
523 ___
524 $code.=<<___;
525         mov     -48(%rax),%r15
526 .cfi_restore    %r15
527         mov     -40(%rax),%r14
528 .cfi_restore    %r14
529         mov     -32(%rax),%r13
530 .cfi_restore    %r13
531         mov     -24(%rax),%r12
532 .cfi_restore    %r12
533         mov     -16(%rax),%rbp
534 .cfi_restore    %rbp
535         mov     -8(%rax),%rbx
536 .cfi_restore    %rbx
537         lea     (%rax),%rsp             # restore %rsp
538 .cfi_def_cfa_register   %rsp
539 .Lgcm_dec_abort:
540         mov     $ret,%rax               # return value
541         ret
542 .cfi_endproc
543 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
544 ___
545
546 $code.=<<___;
547 .type   _aesni_ctr32_6x,\@abi-omnipotent
548 .align  32
549 _aesni_ctr32_6x:                                # CTR-encrypt six blocks from $inp to $out, no hashing; advances both pointers by 0x60
550 .cfi_startproc
551         vmovdqu         0x00-0x80($key),$Z0     # borrow $Z0 for $rndkey
552         vmovdqu         0x20($const),$T2        # borrow $T2, .Lone_msb
553         lea             -1($rounds),%r13        # pass count for .Loop_ctr32
554         vmovups         0x10-0x80($key),$rndkey
555         lea             0x20-0x80($key),%r12    # %r12 walks the remaining round keys
556         vpxor           $Z0,$T1,$inout0
557         add             \$`6<<24`,$counter
558         jc              .Lhandle_ctr32_2        # last counter byte would wrap: take the 32-bit add path
559         vpaddb          $T2,$T1,$inout1
560         vpaddb          $T2,$inout1,$inout2
561         vpxor           $Z0,$inout1,$inout1
562         vpaddb          $T2,$inout2,$inout3
563         vpxor           $Z0,$inout2,$inout2
564         vpaddb          $T2,$inout3,$inout4
565         vpxor           $Z0,$inout3,$inout3
566         vpaddb          $T2,$inout4,$inout5
567         vpxor           $Z0,$inout4,$inout4
568         vpaddb          $T2,$inout5,$T1
569         vpxor           $Z0,$inout5,$inout5
570         jmp             .Loop_ctr32
571 # Each pass applies one AES round to all six blocks and fetches the next round key.
572 .align  16
573 .Loop_ctr32:
574         vaesenc         $rndkey,$inout0,$inout0
575         vaesenc         $rndkey,$inout1,$inout1
576         vaesenc         $rndkey,$inout2,$inout2
577         vaesenc         $rndkey,$inout3,$inout3
578         vaesenc         $rndkey,$inout4,$inout4
579         vaesenc         $rndkey,$inout5,$inout5
580         vmovups         (%r12),$rndkey
581         lea             0x10(%r12),%r12
582         dec             %r13d
583         jnz             .Loop_ctr32
584 # Final rounds: the last round key is XORed with each input block first so that vaesenclast yields the output block directly.
585         vmovdqu         (%r12),$Hkey            # last round key
586         vaesenc         $rndkey,$inout0,$inout0
587         vpxor           0x00($inp),$Hkey,$Z0
588         vaesenc         $rndkey,$inout1,$inout1
589         vpxor           0x10($inp),$Hkey,$Z1
590         vaesenc         $rndkey,$inout2,$inout2
591         vpxor           0x20($inp),$Hkey,$Z2
592         vaesenc         $rndkey,$inout3,$inout3
593         vpxor           0x30($inp),$Hkey,$Xi
594         vaesenc         $rndkey,$inout4,$inout4
595         vpxor           0x40($inp),$Hkey,$T2
596         vaesenc         $rndkey,$inout5,$inout5
597         vpxor           0x50($inp),$Hkey,$Hkey
598         lea             0x60($inp),$inp
599
600         vaesenclast     $Z0,$inout0,$inout0
601         vaesenclast     $Z1,$inout1,$inout1
602         vaesenclast     $Z2,$inout2,$inout2
603         vaesenclast     $Xi,$inout3,$inout3
604         vaesenclast     $T2,$inout4,$inout4
605         vaesenclast     $Hkey,$inout5,$inout5
606         vmovups         $inout0,0x00($out)
607         vmovups         $inout1,0x10($out)
608         vmovups         $inout2,0x20($out)
609         vmovups         $inout3,0x30($out)
610         vmovups         $inout4,0x40($out)
611         vmovups         $inout5,0x50($out)
612         lea             0x60($out),$out
613 # On return the six output blocks also remain in %xmm9-%xmm14, and $T1 holds the next counter value.
614         ret
615 .align  32
616 .Lhandle_ctr32_2:
617         vpshufb         $Ii,$T1,$Z2             # byte-swap counter; $Ii must already hold .Lbswap_mask (it is not loaded here)
618         vmovdqu         0x30($const),$Z1        # borrow $Z1, .Ltwo_lsb
619         vpaddd          0x40($const),$Z2,$inout1        # .Lone_lsb
620         vpaddd          $Z1,$Z2,$inout2
621         vpaddd          $Z1,$inout1,$inout3
622         vpshufb         $Ii,$inout1,$inout1
623         vpaddd          $Z1,$inout2,$inout4
624         vpshufb         $Ii,$inout2,$inout2
625         vpxor           $Z0,$inout1,$inout1
626         vpaddd          $Z1,$inout3,$inout5
627         vpshufb         $Ii,$inout3,$inout3
628         vpxor           $Z0,$inout2,$inout2
629         vpaddd          $Z1,$inout4,$T1         # byte-swapped next counter value
630         vpshufb         $Ii,$inout4,$inout4
631         vpxor           $Z0,$inout3,$inout3
632         vpshufb         $Ii,$inout5,$inout5
633         vpxor           $Z0,$inout4,$inout4
634         vpshufb         $Ii,$T1,$T1             # next counter value
635         vpxor           $Z0,$inout5,$inout5
636         jmp     .Loop_ctr32
637 .cfi_endproc
638 .size   _aesni_ctr32_6x,.-_aesni_ctr32_6x
639
640 .globl  aesni_gcm_encrypt
641 .type   aesni_gcm_encrypt,\@function,6
642 .align  32
643 aesni_gcm_encrypt:
644 .cfi_startproc
645         xor     $ret,$ret
646         cmp     \$0x60*3,$len                   # minimal accepted length
647         jb      .Lgcm_enc_abort
648
649         lea     (%rsp),%rax                     # save stack pointer
650 .cfi_def_cfa_register   %rax
651         push    %rbx
652 .cfi_push       %rbx
653         push    %rbp
654 .cfi_push       %rbp
655         push    %r12
656 .cfi_push       %r12
657         push    %r13
658 .cfi_push       %r13
659         push    %r14
660 .cfi_push       %r14
661         push    %r15
662 .cfi_push       %r15
663 ___
664 $code.=<<___ if ($win64);
665         lea     -0xa8(%rsp),%rsp
666         movaps  %xmm6,-0xd8(%rax)
667         movaps  %xmm7,-0xc8(%rax)
668         movaps  %xmm8,-0xb8(%rax)
669         movaps  %xmm9,-0xa8(%rax)
670         movaps  %xmm10,-0x98(%rax)
671         movaps  %xmm11,-0x88(%rax)
672         movaps  %xmm12,-0x78(%rax)
673         movaps  %xmm13,-0x68(%rax)
674         movaps  %xmm14,-0x58(%rax)
675         movaps  %xmm15,-0x48(%rax)
676 .Lgcm_enc_body:
677 ___
678 $code.=<<___;
679         vzeroupper
680
681         vmovdqu         ($ivp),$T1              # input counter value
682         add             \$-128,%rsp
683         mov             12($ivp),$counter
684         lea             .Lbswap_mask(%rip),$const
685         lea             -0x80($key),$in0        # borrow $in0
686         mov             \$0xf80,$end0           # borrow $end0
687         lea             0x80($key),$key         # size optimization
688         vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
689         and             \$-128,%rsp             # ensure stack alignment
690         mov             0xf0-0x80($key),$rounds
691
692         and             $end0,$in0
693         and             %rsp,$end0
694         sub             $in0,$end0
695         jc              .Lenc_no_key_aliasing
696         cmp             \$768,$end0
697         jnc             .Lenc_no_key_aliasing
698         sub             $end0,%rsp              # avoid aliasing with key
699 .Lenc_no_key_aliasing:
700
701         lea             ($out),$in0
702         lea             -0xc0($out,$len),$end0
703         shr             \$4,$len
704
705         call            _aesni_ctr32_6x
706         vpshufb         $Ii,$inout0,$Xi         # save bswapped output on stack
707         vpshufb         $Ii,$inout1,$T2
708         vmovdqu         $Xi,0x70(%rsp)
709         vpshufb         $Ii,$inout2,$Z0
710         vmovdqu         $T2,0x60(%rsp)
711         vpshufb         $Ii,$inout3,$Z1
712         vmovdqu         $Z0,0x50(%rsp)
713         vpshufb         $Ii,$inout4,$Z2
714         vmovdqu         $Z1,0x40(%rsp)
715         vpshufb         $Ii,$inout5,$Z3         # passed to _aesni_ctr32_ghash_6x
716         vmovdqu         $Z2,0x30(%rsp)
717
718         call            _aesni_ctr32_6x
719
720         vmovdqu         ($Xip),$Xi              # load Xi
721         lea             0x20+0x20($Xip),$Xip    # size optimization
722         sub             \$12,$len
723         mov             \$0x60*2,$ret
724         vpshufb         $Ii,$Xi,$Xi
725
726         call            _aesni_ctr32_ghash_6x
727         vmovdqu         0x20(%rsp),$Z3          # I[5]
728          vmovdqu        ($const),$Ii            # borrow $Ii for .Lbswap_mask
729         vmovdqu         0x00-0x20($Xip),$Hkey   # $Hkey^1
730         vpunpckhqdq     $Z3,$Z3,$T1
731         vmovdqu         0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
732          vmovups        $inout0,-0x60($out)     # save output
733          vpshufb        $Ii,$inout0,$inout0     # but keep bswapped copy
734         vpxor           $Z3,$T1,$T1
735          vmovups        $inout1,-0x50($out)
736          vpshufb        $Ii,$inout1,$inout1
737          vmovups        $inout2,-0x40($out)
738          vpshufb        $Ii,$inout2,$inout2
739          vmovups        $inout3,-0x30($out)
740          vpshufb        $Ii,$inout3,$inout3
741          vmovups        $inout4,-0x20($out)
742          vpshufb        $Ii,$inout4,$inout4
743          vmovups        $inout5,-0x10($out)
744          vpshufb        $Ii,$inout5,$inout5
745          vmovdqu        $inout0,0x10(%rsp)      # free $inout0
746 ___
747 { my ($HK,$T3)=($rndkey,$inout0);
748
749 $code.=<<___;
750          vmovdqu        0x30(%rsp),$Z2          # I[4]
751          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
752          vpunpckhqdq    $Z2,$Z2,$T2
753         vpclmulqdq      \$0x00,$Hkey,$Z3,$Z1
754          vpxor          $Z2,$T2,$T2
755         vpclmulqdq      \$0x11,$Hkey,$Z3,$Z3
756         vpclmulqdq      \$0x00,$HK,$T1,$T1
757
758          vmovdqu        0x40(%rsp),$T3          # I[3]
759         vpclmulqdq      \$0x00,$Ii,$Z2,$Z0
760          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
761         vpxor           $Z1,$Z0,$Z0
762          vpunpckhqdq    $T3,$T3,$Z1
763         vpclmulqdq      \$0x11,$Ii,$Z2,$Z2
764          vpxor          $T3,$Z1,$Z1
765         vpxor           $Z3,$Z2,$Z2
766         vpclmulqdq      \$0x10,$HK,$T2,$T2
767          vmovdqu        0x50-0x20($Xip),$HK
768         vpxor           $T1,$T2,$T2
769
770          vmovdqu        0x50(%rsp),$T1          # I[2]
771         vpclmulqdq      \$0x00,$Hkey,$T3,$Z3
772          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
773         vpxor           $Z0,$Z3,$Z3
774          vpunpckhqdq    $T1,$T1,$Z0
775         vpclmulqdq      \$0x11,$Hkey,$T3,$T3
776          vpxor          $T1,$Z0,$Z0
777         vpxor           $Z2,$T3,$T3
778         vpclmulqdq      \$0x00,$HK,$Z1,$Z1
779         vpxor           $T2,$Z1,$Z1
780
781          vmovdqu        0x60(%rsp),$T2          # I[1]
782         vpclmulqdq      \$0x00,$Ii,$T1,$Z2
783          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
784         vpxor           $Z3,$Z2,$Z2
785          vpunpckhqdq    $T2,$T2,$Z3
786         vpclmulqdq      \$0x11,$Ii,$T1,$T1
787          vpxor          $T2,$Z3,$Z3
788         vpxor           $T3,$T1,$T1
789         vpclmulqdq      \$0x10,$HK,$Z0,$Z0
790          vmovdqu        0x80-0x20($Xip),$HK
791         vpxor           $Z1,$Z0,$Z0
792
793          vpxor          0x70(%rsp),$Xi,$Xi      # accumulate I[0]
794         vpclmulqdq      \$0x00,$Hkey,$T2,$Z1
795          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
796          vpunpckhqdq    $Xi,$Xi,$T3
797         vpxor           $Z2,$Z1,$Z1
798         vpclmulqdq      \$0x11,$Hkey,$T2,$T2
799          vpxor          $Xi,$T3,$T3
800         vpxor           $T1,$T2,$T2
801         vpclmulqdq      \$0x00,$HK,$Z3,$Z3
802         vpxor           $Z0,$Z3,$Z0
803
804         vpclmulqdq      \$0x00,$Ii,$Xi,$Z2
805          vmovdqu        0x00-0x20($Xip),$Hkey   # $Hkey^1
806          vpunpckhqdq    $inout5,$inout5,$T1
807         vpclmulqdq      \$0x11,$Ii,$Xi,$Xi
808          vpxor          $inout5,$T1,$T1
809         vpxor           $Z1,$Z2,$Z1
810         vpclmulqdq      \$0x10,$HK,$T3,$T3
811          vmovdqu        0x20-0x20($Xip),$HK
812         vpxor           $T2,$Xi,$Z3
813         vpxor           $Z0,$T3,$Z2
814
815          vmovdqu        0x10-0x20($Xip),$Ii     # borrow $Ii for $Hkey^2
816           vpxor         $Z1,$Z3,$T3             # aggregated Karatsuba post-processing
817         vpclmulqdq      \$0x00,$Hkey,$inout5,$Z0
818           vpxor         $T3,$Z2,$Z2
819          vpunpckhqdq    $inout4,$inout4,$T2
820         vpclmulqdq      \$0x11,$Hkey,$inout5,$inout5
821          vpxor          $inout4,$T2,$T2
822           vpslldq       \$8,$Z2,$T3
823         vpclmulqdq      \$0x00,$HK,$T1,$T1
824           vpxor         $T3,$Z1,$Xi
825           vpsrldq       \$8,$Z2,$Z2
826           vpxor         $Z2,$Z3,$Z3
827
828         vpclmulqdq      \$0x00,$Ii,$inout4,$Z1
829          vmovdqu        0x30-0x20($Xip),$Hkey   # $Hkey^3
830         vpxor           $Z0,$Z1,$Z1
831          vpunpckhqdq    $inout3,$inout3,$T3
832         vpclmulqdq      \$0x11,$Ii,$inout4,$inout4
833          vpxor          $inout3,$T3,$T3
834         vpxor           $inout5,$inout4,$inout4
835           vpalignr      \$8,$Xi,$Xi,$inout5     # 1st phase
836         vpclmulqdq      \$0x10,$HK,$T2,$T2
837          vmovdqu        0x50-0x20($Xip),$HK
838         vpxor           $T1,$T2,$T2
839
840         vpclmulqdq      \$0x00,$Hkey,$inout3,$Z0
841          vmovdqu        0x40-0x20($Xip),$Ii     # borrow $Ii for $Hkey^4
842         vpxor           $Z1,$Z0,$Z0
843          vpunpckhqdq    $inout2,$inout2,$T1
844         vpclmulqdq      \$0x11,$Hkey,$inout3,$inout3
845          vpxor          $inout2,$T1,$T1
846         vpxor           $inout4,$inout3,$inout3
847           vxorps        0x10(%rsp),$Z3,$Z3      # accumulate $inout0
848         vpclmulqdq      \$0x00,$HK,$T3,$T3
849         vpxor           $T2,$T3,$T3
850
851           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
852           vxorps        $inout5,$Xi,$Xi
853
854         vpclmulqdq      \$0x00,$Ii,$inout2,$Z1
855          vmovdqu        0x60-0x20($Xip),$Hkey   # $Hkey^5
856         vpxor           $Z0,$Z1,$Z1
857          vpunpckhqdq    $inout1,$inout1,$T2
858         vpclmulqdq      \$0x11,$Ii,$inout2,$inout2
859          vpxor          $inout1,$T2,$T2
860           vpalignr      \$8,$Xi,$Xi,$inout5     # 2nd phase
861         vpxor           $inout3,$inout2,$inout2
862         vpclmulqdq      \$0x10,$HK,$T1,$T1
863          vmovdqu        0x80-0x20($Xip),$HK
864         vpxor           $T3,$T1,$T1
865
866           vxorps        $Z3,$inout5,$inout5
867           vpclmulqdq    \$0x10,0x10($const),$Xi,$Xi
868           vxorps        $inout5,$Xi,$Xi
869
870         vpclmulqdq      \$0x00,$Hkey,$inout1,$Z0
871          vmovdqu        0x70-0x20($Xip),$Ii     # borrow $Ii for $Hkey^6
872         vpxor           $Z1,$Z0,$Z0
873          vpunpckhqdq    $Xi,$Xi,$T3
874         vpclmulqdq      \$0x11,$Hkey,$inout1,$inout1
875          vpxor          $Xi,$T3,$T3
876         vpxor           $inout2,$inout1,$inout1
877         vpclmulqdq      \$0x00,$HK,$T2,$T2
878         vpxor           $T1,$T2,$T2
879
880         vpclmulqdq      \$0x00,$Ii,$Xi,$Z1
881         vpclmulqdq      \$0x11,$Ii,$Xi,$Z3
882         vpxor           $Z0,$Z1,$Z1
883         vpclmulqdq      \$0x10,$HK,$T3,$Z2
884         vpxor           $inout1,$Z3,$Z3
885         vpxor           $T2,$Z2,$Z2
886
887         vpxor           $Z1,$Z3,$Z0             # aggregated Karatsuba post-processing
888         vpxor           $Z0,$Z2,$Z2
889         vpslldq         \$8,$Z2,$T1
890         vmovdqu         0x10($const),$Hkey      # .Lpoly
891         vpsrldq         \$8,$Z2,$Z2
892         vpxor           $T1,$Z1,$Xi
893         vpxor           $Z2,$Z3,$Z3
894
895         vpalignr        \$8,$Xi,$Xi,$T2         # 1st phase
896         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
897         vpxor           $T2,$Xi,$Xi
898
899         vpalignr        \$8,$Xi,$Xi,$T2         # 2nd phase
900         vpclmulqdq      \$0x10,$Hkey,$Xi,$Xi
901         vpxor           $Z3,$T2,$T2
902         vpxor           $T2,$Xi,$Xi
903 ___
904 }
905 $code.=<<___;
906         vpshufb         ($const),$Xi,$Xi        # .Lbswap_mask
907         vmovdqu         $Xi,-0x40($Xip)         # output Xi
908
909         vzeroupper
910 ___
911 $code.=<<___ if ($win64);
912         movaps  -0xd8(%rax),%xmm6
913         movaps  -0xc8(%rax),%xmm7
914         movaps  -0xb8(%rax),%xmm8
915         movaps  -0xa8(%rax),%xmm9
916         movaps  -0x98(%rax),%xmm10
917         movaps  -0x88(%rax),%xmm11
918         movaps  -0x78(%rax),%xmm12
919         movaps  -0x68(%rax),%xmm13
920         movaps  -0x58(%rax),%xmm14
921         movaps  -0x48(%rax),%xmm15
922 ___
923 $code.=<<___;
924         mov     -48(%rax),%r15
925 .cfi_restore    %r15
926         mov     -40(%rax),%r14
927 .cfi_restore    %r14
928         mov     -32(%rax),%r13
929 .cfi_restore    %r13
930         mov     -24(%rax),%r12
931 .cfi_restore    %r12
932         mov     -16(%rax),%rbp
933 .cfi_restore    %rbp
934         mov     -8(%rax),%rbx
935 .cfi_restore    %rbx
936         lea     (%rax),%rsp             # restore %rsp
937 .cfi_def_cfa_register   %rsp
938 .Lgcm_enc_abort:
939         mov     $ret,%rax               # return value
940         ret
941 .cfi_endproc
942 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
943 ___
944
945 $code.=<<___;                       # constant pool (five 16-byte values) followed by the module banner
946 .align  64
947 .Lbswap_mask:                           # read at offset 0 of the constant pointer as a vpshufb control
948         .byte   15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0   # indices 15..0: reverses all 16 bytes
949 .Lpoly:                                 # read at offset 0x10 of the constant pointer by the vpclmulqdq reduction phases
950         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2      # only the last byte is non-zero
951 .Lone_msb:                              # NOTE(review): no reader in this part of the file; presumably counter arithmetic, confirm
952         .byte   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1         # value 1 in the last byte
953 .Ltwo_lsb:                              # NOTE(review): no reader in this part of the file; confirm use
954         .byte   2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0         # value 2 in the first byte
955 .Lone_lsb:                              # NOTE(review): no reader in this part of the file; confirm use
956         .byte   1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0         # value 1 in the first byte
957 .asciz  "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
958 .align  64
959 ___
960 if ($win64) {                       # Win64 only: emit the exception handler and its .pdata/.xdata records
961 $rec="%rcx";                        # handler argument 1 (Microsoft x64: rcx); not referenced in the code below
962 $frame="%rdx";                      # handler argument 2 (rdx); not referenced in the code below
963 $context="%r8";                     # handler argument 3 (r8): the CONTEXT record the handler rewrites
964 $disp="%r9";                        # handler argument 4 (r9): the dispatcher context
965
966 $code.=<<___
967 .extern __imp_RtlVirtualUnwind          # import slot, called indirectly at the end of the handler
968 .type   gcm_se_handler,\@abi-omnipotent
969 .align  16
970 gcm_se_handler:                         # handler named by both .xdata records at the bottom of this block
971         push    %rsi                    # preserve the Win64 callee-saved GPRs used as scratch below
972         push    %rdi
973         push    %rbx
974         push    %rbp
975         push    %r12
976         push    %r13
977         push    %r14
978         push    %r15
979         pushfq                          # flags as well: the rep movsq blobs below begin with cld
980         sub     \$64,%rsp               # scratch area; offsets 32..56 carry arg5..arg8 of the call below
981
982         mov     120($context),%rax      # pull context->Rax
983         mov     248($context),%rbx      # pull context->Rip
984
985         mov     8($disp),%rsi           # disp->ImageBase
986         mov     56($disp),%r11          # disp->HandlerData
987
988         mov     0(%r11),%r10d           # HandlerData[0]
989         lea     (%rsi,%r10),%r10        # prologue label
990         cmp     %r10,%rbx               # context->Rip<prologue label
991         jb      .Lcommon_seh_tail       # yes: skip register recovery, rax still holds context->Rax
992
993         mov     152($context),%rax      # pull context->Rsp
994
995         mov     4(%r11),%r10d           # HandlerData[1]
996         lea     (%rsi,%r10),%r10        # epilogue label
997         cmp     %r10,%rbx               # context->Rip>=epilogue label
998         jae     .Lcommon_seh_tail       # yes: skip register recovery, rax holds context->Rsp
999
1000         mov     120($context),%rax      # pull context->Rax
1001
1002         mov     -48(%rax),%r15          # reload the six saved GPRs from below rax, using the same
1003         mov     -40(%rax),%r14          # offsets the aesni_gcm_encrypt epilogue restores them from
1004         mov     -32(%rax),%r13
1005         mov     -24(%rax),%r12
1006         mov     -16(%rax),%rbp
1007         mov     -8(%rax),%rbx
1008         mov     %r15,240($context)      # restore context->R15
1009         mov     %r14,232($context)      # restore context->R14
1010         mov     %r13,224($context)      # restore context->R13
1011         mov     %r12,216($context)      # restore context->R12
1012         mov     %rbp,160($context)      # restore context->Rbp
1013         mov     %rbx,144($context)      # restore context->Rbx
1014
1015         lea     -0xd8(%rax),%rsi        # %xmm save area
1016         lea     512($context),%rdi      # & context.Xmm6
1017         mov     \$20,%ecx               # 10*sizeof(%xmm0)/sizeof(%rax)
1018         .long   0xa548f3fc              # cld; rep movsq
1019
1020 .Lcommon_seh_tail:                      # on every path rax is the value stored as context->Rsp
1021         mov     8(%rax),%rdi            # quadword at rax+8, stored as context->Rdi below
1022         mov     16(%rax),%rsi           # quadword at rax+16, stored as context->Rsi below
1023         mov     %rax,152($context)      # restore context->Rsp
1024         mov     %rsi,168($context)      # restore context->Rsi
1025         mov     %rdi,176($context)      # restore context->Rdi
1026
1027         mov     40($disp),%rdi          # disp->ContextRecord
1028         mov     $context,%rsi           # context
1029         mov     \$154,%ecx              # sizeof(CONTEXT) in quadwords
1030         .long   0xa548f3fc              # cld; rep movsq
1031
1032         mov     $disp,%rsi              # keep the dispatcher pointer in rsi, r9 is reloaded as arg4
1033         xor     %rcx,%rcx               # arg1, UNW_FLAG_NHANDLER
1034         mov     8(%rsi),%rdx            # arg2, disp->ImageBase
1035         mov     0(%rsi),%r8             # arg3, disp->ControlPc
1036         mov     16(%rsi),%r9            # arg4, disp->FunctionEntry
1037         mov     40(%rsi),%r10           # disp->ContextRecord
1038         lea     56(%rsi),%r11           # &disp->HandlerData
1039         lea     24(%rsi),%r12           # &disp->EstablisherFrame
1040         mov     %r10,32(%rsp)           # arg5
1041         mov     %r11,40(%rsp)           # arg6
1042         mov     %r12,48(%rsp)           # arg7
1043         mov     %rcx,56(%rsp)           # arg8, (NULL)
1044         call    *__imp_RtlVirtualUnwind(%rip)   # indirect call through the import slot
1045
1046         mov     \$1,%eax                # ExceptionContinueSearch
1047         add     \$64,%rsp               # drop the scratch area
1048         popfq                           # restore flags and registers in reverse push order
1049         pop     %r15
1050         pop     %r14
1051         pop     %r13
1052         pop     %r12
1053         pop     %rbp
1054         pop     %rbx
1055         pop     %rdi
1056         pop     %rsi
1057         ret
1058 .size   gcm_se_handler,.-gcm_se_handler
1059
1060 .section        .pdata                  # function table: one (begin, end, unwind info) RVA triple per routine
1061 .align  4
1062         .rva    .LSEH_begin_aesni_gcm_decrypt
1063         .rva    .LSEH_end_aesni_gcm_decrypt
1064         .rva    .LSEH_gcm_dec_info
1065
1066         .rva    .LSEH_begin_aesni_gcm_encrypt
1067         .rva    .LSEH_end_aesni_gcm_encrypt
1068         .rva    .LSEH_gcm_enc_info
1069 .section        .xdata                  # unwind info records referenced from .pdata above
1070 .align  8
1071 .LSEH_gcm_dec_info:
1072         .byte   9,0,0,0                 # version 1 with UNW_FLAG_EHANDLER, no prolog bytes, no unwind codes
1073         .rva    gcm_se_handler          # handler to invoke
1074         .rva    .Lgcm_dec_body,.Lgcm_dec_abort  # HandlerData[0], HandlerData[1] as read by the handler
1075 .LSEH_gcm_enc_info:
1076         .byte   9,0,0,0                 # version 1 with UNW_FLAG_EHANDLER, no prolog bytes, no unwind codes
1077         .rva    gcm_se_handler          # handler to invoke
1078         .rva    .Lgcm_enc_body,.Lgcm_enc_abort  # HandlerData[0], HandlerData[1] as read by the handler
1079 ___
1080 }                                   # end of the Win64-only section
1081 }}} else {{{
1082 $code=<<___;    # assembler is too old: assign (not append) code holding only two stub entry points
1083 .text
1084
1085 .globl  aesni_gcm_encrypt
1086 .type   aesni_gcm_encrypt,\@abi-omnipotent
1087 aesni_gcm_encrypt:                      # stub: does no work
1088 .cfi_startproc
1089         xor     %eax,%eax               # return 0; NOTE(review): presumably read by the caller as nothing processed, confirm
1090         ret
1091 .cfi_endproc
1092 .size   aesni_gcm_encrypt,.-aesni_gcm_encrypt
1093
1094 .globl  aesni_gcm_decrypt
1095 .type   aesni_gcm_decrypt,\@abi-omnipotent
1096 aesni_gcm_decrypt:                      # stub: does no work
1097 .cfi_startproc
1098         xor     %eax,%eax               # return 0, same convention as the encrypt stub
1099         ret
1100 .cfi_endproc
1101 .size   aesni_gcm_decrypt,.-aesni_gcm_decrypt
1102 ___
1103 }}}
1104
1105 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace every backtick-quoted span in the generated text with its Perl eval result
1106
1107 print $code;                            # write the finished assembly text to STDOUT
1108
1109 close STDOUT or die "error closing STDOUT: $!";  # a failed flush or downstream failure must stop the build, not pass silently