# Add Broadwell performance results.
# [openssl.git] / crypto / modes / asm / aesni-gcm-x86_64.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 #
11 # AES-NI-CTR+GHASH stitch.
12 #
13 # February 2013
14 #
15 # OpenSSL GCM implementation is organized in such way that its
16 # performance is rather close to the sum of its streamed components,
17 # in the context parallelized AES-NI CTR and modulo-scheduled
18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19 # was observed to perform significantly better than the sum of the
20 # components on contemporary CPUs, the effort was deemed impossible to
21 # justify. This module is based on combination of Intel submissions,
22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
24 # pressure with notable relative improvement, achieving 1.0 cycle per
25 # byte processed with 128-bit key on Haswell processor, and 0.74 -
26 # on Broadwell. [Mentioned results are raw profiled measurements for
27 # favourable packet size, one divisible by 96. Applications using the
28 # EVP interface will observe a few percent worse performance.]
29 #
30 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
31 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
32
# Parse command line: first argument is the perlasm "flavour" (elf,
# macosx, mingw64, nasm, masm, ...), second is the output file.  A lone
# argument containing a dot is taken as the output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 targets follow a different calling convention and need
# SEH-friendly prologues/epilogues (emitted conditionally below).
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator either next to this script or in
# the sibling perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Probe the assembler toolchain for AVX support: $avx ends up 0 (none),
# 1 (AVX) or 2 (AVX2).  The stitched code below is emitted only when
# $avx>1; the fallback path is outside this chunk.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
		=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);		# gas 2.19 for AVX, 2.22 for AVX2
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	    `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);		# nasm 2.09 for AVX, 2.10 for AVX2
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	    `ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);		# MSVC 2010 for AVX, 2012 for AVX2
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);		# clang 3.0 for AVX, >3.0 for AVX2
}

# Pipe everything printed to STDOUT through the translator, which turns
# perlasm into the requested assembler dialect and writes $output.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
65
if ($avx>1) {{{

# Function arguments in unix-64 order (x86_64-xlate.pl re-maps these for
# Win64): input ptr, output ptr, byte length, AES key schedule, IV /
# counter block, and pointer to struct { u128 Xi, H, Htbl[9] }.
($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# GHASH working set in %xmm0-%xmm8: scratch/borrowed temporaries ($Ii,
# $T1, $T2), the current power of H ($Hkey), four Karatsuba accumulators
# ($Z0-$Z3) and the running hash value ($Xi).
($Ii,$T1,$T2,$Hkey,
 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));

# Six parallel AES states plus the broadcast round key, %xmm9-%xmm15.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));

# GP registers: 32-bit counter word, AES round count, return value
# (bytes processed), constants base pointer, input pointer copy, and
# end-of-input sentinel.
($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
# _aesni_ctr32_ghash_6x: the stitched inner loop.  Each iteration AES-CTR
# encrypts the next 6 counter blocks while GHASH-ing (PCLMULQDQ, 6-way
# aggregated reduction) the previous 6 bswapped input blocks kept at
# 16+8(%rsp)..0x70+8(%rsp).  Clobbers %r12/%r13; $len is in 16-byte
# blocks; $counter holds the byte-swapped low counter word.
$code=<<___;
.text

.type	_aesni_ctr32_ghash_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_ghash_6x:
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	sub		\$6,$len
	vpxor		$Z0,$Z0,$Z0		# $Z0   = 0
	vmovdqu		0x00-0x80($key),$rndkey
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpaddb		$T2,$inout2,$inout3
	vpaddb		$T2,$inout3,$inout4
	vpaddb		$T2,$inout4,$inout5
	vpxor		$rndkey,$T1,$inout0
	vmovdqu		$Z0,16+8(%rsp)		# "$Z3" = 0
	jmp		.Loop6x

.align	32
.Loop6x:
	# Counter is incremented as 6<<24 on the byte-swapped word; carry
	# means the low byte wrapped and .Lhandle_ctr32 must redo the six
	# counters with full 32-bit arithmetic.
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32		# discard $inout[1-5]?
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddb	$T2,$inout5,$T1		# next counter value
	  vpxor		$rndkey,$inout1,$inout1
	  vpxor		$rndkey,$inout2,$inout2

.Lresume_ctr32:
	vmovdqu		$T1,($ivp)		# save next counter value
	vpclmulqdq	\$0x10,$Hkey,$Z3,$Z1
	  vpxor		$rndkey,$inout3,$inout3
	  vmovups	0x10-0x80($key),$T2	# borrow $T2 for $rndkey
	vpclmulqdq	\$0x01,$Hkey,$Z3,$Z2
	# %r12 becomes 0x60 when $in0 would run past $end0, freezing the
	# movbe prefetch pointer on the last full iteration.
	xor		%r12,%r12
	cmp		$in0,$end0

	  vaesenc	$T2,$inout0,$inout0
	vmovdqu		0x30+8(%rsp),$Ii	# I[4]
	  vpxor		$rndkey,$inout4,$inout4
	vpclmulqdq	\$0x00,$Hkey,$Z3,$T1
	  vaesenc	$T2,$inout1,$inout1
	  vpxor		$rndkey,$inout5,$inout5
	setnc		%r12b
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	  vaesenc	$T2,$inout2,$inout2
	vmovdqu		0x10-0x20($Xip),$Hkey	# $Hkey^2
	neg		%r12
	  vaesenc	$T2,$inout3,$inout3
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Hkey,$Ii,$Z1
	 vpxor		$Z0,$Xi,$Xi		# modulo-scheduled
	  vaesenc	$T2,$inout4,$inout4
	 vpxor		$Z1,$T1,$Z0
	and		\$0x60,%r12
	  vmovups	0x20-0x80($key),$rndkey
	vpclmulqdq	\$0x10,$Hkey,$Ii,$T1
	  vaesenc	$T2,$inout5,$inout5

	vpclmulqdq	\$0x01,$Hkey,$Ii,$T2
	lea		($in0,%r12),$in0
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled [vpxor $Z3,$Xi,$Xi]
	vpclmulqdq	\$0x11,$Hkey,$Ii,$Hkey
	 vmovdqu	0x40+8(%rsp),$Ii	# I[3]
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x58($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x50($in0),%r12
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x20+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x28+8(%rsp)
	vmovdqu		0x30-0x20($Xip),$Z1	# borrow $Z1 for $Hkey^3
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x30-0x80($key),$rndkey
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x00,$Z1,$Ii,$T1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x10,$Z1,$Ii,$T2
	  vaesenc	$rndkey,$inout1,$inout1
	 vpxor		$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x01,$Z1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout2,$inout2
	vpclmulqdq	\$0x11,$Z1,$Ii,$Z1
	 vmovdqu	0x50+8(%rsp),$Ii	# I[2]
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	 vpxor		$T1,$Z0,$Z0
	vmovdqu		0x40-0x20($Xip),$T1	# borrow $T1 for $Hkey^4
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x40-0x80($key),$rndkey
	 vpxor		$T2,$Z2,$Z2
	vpclmulqdq	\$0x00,$T1,$Ii,$T2
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x10,$T1,$Ii,$Hkey
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x48($in0),%r13
	 vpxor		$Z1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T1,$Ii,$Z1
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x40($in0),%r12
	vpclmulqdq	\$0x11,$T1,$Ii,$T1
	 vmovdqu	0x60+8(%rsp),$Ii	# I[1]
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x30+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x38+8(%rsp)
	 vpxor		$T2,$Z0,$Z0
	vmovdqu		0x60-0x20($Xip),$T2	# borrow $T2 for $Hkey^5
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x50-0x80($key),$rndkey
	 vpxor		$Hkey,$Z2,$Z2
	vpclmulqdq	\$0x00,$T2,$Ii,$Hkey
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$T2,$Ii,$Z1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x38($in0),%r13
	 vpxor		$T1,$Z3,$Z3
	vpclmulqdq	\$0x01,$T2,$Ii,$T1
	 vpxor		0x70+8(%rsp),$Xi,$Xi	# accumulate I[0]
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x30($in0),%r12
	vpclmulqdq	\$0x11,$T2,$Ii,$T2
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x40+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x48+8(%rsp)
	 vpxor		$Hkey,$Z0,$Z0
	 vmovdqu	0x70-0x20($Xip),$Hkey	# $Hkey^6
	  vaesenc	$rndkey,$inout5,$inout5

	  vmovups	0x60-0x80($key),$rndkey
	 vpxor		$Z1,$Z2,$Z2
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Z1
	  vaesenc	$rndkey,$inout0,$inout0
	 vpxor		$T1,$Z2,$Z2
	vpclmulqdq	\$0x01,$Hkey,$Xi,$T1
	  vaesenc	$rndkey,$inout1,$inout1
	movbe		0x28($in0),%r13
	 vpxor		$T2,$Z3,$Z3
	vpclmulqdq	\$0x00,$Hkey,$Xi,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x20($in0),%r12
	vpclmulqdq	\$0x11,$Hkey,$Xi,$Xi
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r13,0x50+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	mov		%r12,0x58+8(%rsp)
	vpxor		$Z1,$Z2,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	vpxor		$T1,$Z2,$Z2

	  vmovups	0x70-0x80($key),$rndkey
	vpslldq		\$8,$Z2,$Z1
	vpxor		$T2,$Z0,$Z0
	vmovdqu		0x10($const),$Hkey	# .Lpoly

	  vaesenc	$rndkey,$inout0,$inout0
	vpxor		$Xi,$Z3,$Z3
	  vaesenc	$rndkey,$inout1,$inout1
	vpxor		$Z1,$Z0,$Z0
	movbe		0x18($in0),%r13
	  vaesenc	$rndkey,$inout2,$inout2
	movbe		0x10($in0),%r12
	vpalignr	\$8,$Z0,$Z0,$Ii		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	mov		%r13,0x60+8(%rsp)
	  vaesenc	$rndkey,$inout3,$inout3
	mov		%r12,0x68+8(%rsp)
	  vaesenc	$rndkey,$inout4,$inout4
	  vmovups	0x80-0x80($key),$T1	# borrow $T1 for $rndkey
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vmovups	0x90-0x80($key),$rndkey
	  vaesenc	$T1,$inout1,$inout1
	vpsrldq		\$8,$Z2,$Z2
	  vaesenc	$T1,$inout2,$inout2
	vpxor		$Z2,$Z3,$Z3
	  vaesenc	$T1,$inout3,$inout3
	vpxor		$Ii,$Z0,$Z0
	movbe		0x08($in0),%r13
	  vaesenc	$T1,$inout4,$inout4
	movbe		0x00($in0),%r12
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xa0-0x80($key),$T1
	  cmp		\$11,$rounds
	  jb		.Lenc_tail		# 128-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xb0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xc0-0x80($key),$T1
	  je		.Lenc_tail		# 192-bit key

	  vaesenc	$rndkey,$inout0,$inout0
	  vaesenc	$rndkey,$inout1,$inout1
	  vaesenc	$rndkey,$inout2,$inout2
	  vaesenc	$rndkey,$inout3,$inout3
	  vaesenc	$rndkey,$inout4,$inout4
	  vaesenc	$rndkey,$inout5,$inout5

	  vaesenc	$T1,$inout0,$inout0
	  vaesenc	$T1,$inout1,$inout1
	  vaesenc	$T1,$inout2,$inout2
	  vaesenc	$T1,$inout3,$inout3
	  vaesenc	$T1,$inout4,$inout4
	  vmovups	0xd0-0x80($key),$rndkey
	  vaesenc	$T1,$inout5,$inout5
	  vmovups	0xe0-0x80($key),$T1
	  jmp		.Lenc_tail		# 256-bit key

.align	32
.Lhandle_ctr32:
	# Low counter byte wrapped: rebuild all six counters with 32-bit
	# vpaddd on the byte-swapped value, then bswap back.
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	  vpshufb	$Ii,$T1,$Z2		# byte-swap counter
	  vmovdqu	0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	  vpaddd	0x40($const),$Z2,$inout1	# .Lone_lsb
	  vpaddd	$Z1,$Z2,$inout2
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	  vpaddd	$Z1,$inout1,$inout3
	  vpshufb	$Ii,$inout1,$inout1
	  vpaddd	$Z1,$inout2,$inout4
	  vpshufb	$Ii,$inout2,$inout2
	  vpxor		$rndkey,$inout1,$inout1
	  vpaddd	$Z1,$inout3,$inout5
	  vpshufb	$Ii,$inout3,$inout3
	  vpxor		$rndkey,$inout2,$inout2
	  vpaddd	$Z1,$inout4,$T1		# byte-swapped next counter value
	  vpshufb	$Ii,$inout4,$inout4
	  vpshufb	$Ii,$inout5,$inout5
	  vpshufb	$Ii,$T1,$T1		# next counter value
	jmp		.Lresume_ctr32

.align	32
.Lenc_tail:
	# Final AES rounds: XOR the key stream with 6 input blocks, write
	# 6 output blocks, and set up the next 6 counter values.
	  vaesenc	$rndkey,$inout0,$inout0
	vmovdqu		$Z3,16+8(%rsp)		# postpone vpxor $Z3,$Xi,$Xi
	vpalignr	\$8,$Z0,$Z0,$Xi		# 2nd phase
	  vaesenc	$rndkey,$inout1,$inout1
	vpclmulqdq	\$0x10,$Hkey,$Z0,$Z0
	  vpxor		0x00($inp),$T1,$T2
	  vaesenc	$rndkey,$inout2,$inout2
	  vpxor		0x10($inp),$T1,$Ii
	  vaesenc	$rndkey,$inout3,$inout3
	  vpxor		0x20($inp),$T1,$Z1
	  vaesenc	$rndkey,$inout4,$inout4
	  vpxor		0x30($inp),$T1,$Z2
	  vaesenc	$rndkey,$inout5,$inout5
	  vpxor		0x40($inp),$T1,$Z3
	  vpxor		0x50($inp),$T1,$Hkey
	  vmovdqu	($ivp),$T1		# load next counter value

	  vaesenclast	$T2,$inout0,$inout0
	  vmovdqu	0x20($const),$T2	# borrow $T2, .Lone_msb
	  vaesenclast	$Ii,$inout1,$inout1
	 vpaddb		$T2,$T1,$Ii
	mov		%r13,0x70+8(%rsp)
	lea		0x60($inp),$inp
	  vaesenclast	$Z1,$inout2,$inout2
	 vpaddb		$T2,$Ii,$Z1
	mov		%r12,0x78+8(%rsp)
	lea		0x60($out),$out
	  vmovdqu	0x00-0x80($key),$rndkey
	  vaesenclast	$Z2,$inout3,$inout3
	 vpaddb		$T2,$Z1,$Z2
	  vaesenclast	$Z3, $inout4,$inout4
	 vpaddb		$T2,$Z2,$Z3
	  vaesenclast	$Hkey,$inout5,$inout5
	 vpaddb		$T2,$Z3,$Hkey

	add		\$0x60,$ret
	sub		\$0x6,$len
	jc		.L6x_done

	  vmovups	$inout0,-0x60($out)	# save output
	 vpxor		$rndkey,$T1,$inout0
	  vmovups	$inout1,-0x50($out)
	 vmovdqa	$Ii,$inout1		# 0 latency
	  vmovups	$inout2,-0x40($out)
	 vmovdqa	$Z1,$inout2		# 0 latency
	  vmovups	$inout3,-0x30($out)
	 vmovdqa	$Z2,$inout3		# 0 latency
	  vmovups	$inout4,-0x20($out)
	 vmovdqa	$Z3,$inout4		# 0 latency
	  vmovups	$inout5,-0x10($out)
	 vmovdqa	$Hkey,$inout5		# 0 latency
	vmovdqu		0x20+8(%rsp),$Z3	# I[5]
	jmp		.Loop6x

.L6x_done:
	vpxor		16+8(%rsp),$Xi,$Xi	# modulo-scheduled
	vpxor		$Z0,$Xi,$Xi		# modulo-scheduled

	ret
.size	_aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
392 ######################################################################
393 #
394 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
395 #               const AES_KEY *key, unsigned char iv[16],
396 #               struct { u128 Xi,H,Htbl[9]; } *Xip);
# aesni_gcm_decrypt(inp, out, len, key, ivp, Xip): decrypt at least 0x60
# bytes, returning the number of bytes processed in %rax (0 if len was
# too short).  GHASH is computed over the ciphertext, so the 6 first
# blocks are pre-loaded and bswapped onto the stack before the stitched
# loop runs.
$code.=<<___;
.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@function,6
.align	32
aesni_gcm_decrypt:
	xor	$ret,$ret
	cmp	\$0x60,$len			# minimal accepted length
	jb	.Lgcm_dec_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64: %xmm6-%xmm15 are non-volatile and must be preserved; save them
# below the red-zone-free stack frame.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	vmovdqu		($Xip),$Xi		# load Xi
	and		\$-128,%rsp		# ensure stack alignment
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	lea		0x80($key),$key		# size optimization
	lea		0x20+0x20($Xip),$Xip	# size optimization
	mov		0xf0-0x80($key),$rounds
	vpshufb		$Ii,$Xi,$Xi

	# If the key schedule would land inside the 768-byte stack window
	# (same 4KB page fragment), lower %rsp so prefetches of the frame
	# cannot alias the key material.
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Ldec_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Ldec_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Ldec_no_key_aliasing:

	vmovdqu		0x50($inp),$Z3		# I[5]
	lea		($inp),$in0
	vmovdqu		0x40($inp),$Z0
	lea		-0xc0($inp,$len),$end0
	vmovdqu		0x30($inp),$Z1
	shr		\$4,$len
	xor		$ret,$ret
	vmovdqu		0x20($inp),$Z2
	 vpshufb	$Ii,$Z3,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		0x10($inp),$T2
	 vpshufb	$Ii,$Z0,$Z0
	vmovdqu		($inp),$Hkey
	 vpshufb	$Ii,$Z1,$Z1
	vmovdqu		$Z0,0x30(%rsp)
	 vpshufb	$Ii,$Z2,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	 vpshufb	$Ii,$T2,$T2
	vmovdqu		$Z2,0x50(%rsp)
	 vpshufb	$Ii,$Hkey,$Hkey
	vmovdqu		$T2,0x60(%rsp)
	vmovdqu		$Hkey,0x70(%rsp)

	call		_aesni_ctr32_ghash_6x

	vmovups		$inout0,-0x60($out)	# save output
	vmovups		$inout1,-0x50($out)
	vmovups		$inout2,-0x40($out)
	vmovups		$inout3,-0x30($out)
	vmovups		$inout4,-0x20($out)
	vmovups		$inout5,-0x10($out)

	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 epilogue: restore the non-volatile %xmm6-%xmm15 registers from
# the slots written in the prologue (.Lgcm_dec_body).  Offsets mirror
# the save sequence exactly: %xmm6 at -0xd8(%rax) ... %xmm15 at
# -0x48(%rax).
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7	# was -0xd8: restored %xmm6's slot into %xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common epilogue: restore callee-saved GP registers (pushed relative to
# the saved %rax frame pointer), reinstate %rsp, and return the number
# of processed bytes in %rax.
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_dec_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
515
# _aesni_ctr32_6x: plain (un-stitched) 6-block AES-CTR pass, used twice
# by aesni_gcm_encrypt to "prime the pump" — encryption GHASHes the
# ciphertext, so 12 blocks of key stream must be produced before the
# stitched loop can start hashing.  Clobbers %r12/%r13.
$code.=<<___;
.type	_aesni_ctr32_6x,\@abi-omnipotent
.align	32
_aesni_ctr32_6x:
	vmovdqu		0x00-0x80($key),$Z0	# borrow $Z0 for $rndkey
	vmovdqu		0x20($const),$T2	# borrow $T2, .Lone_msb
	lea		-1($rounds),%r13
	vmovups		0x10-0x80($key),$rndkey
	lea		0x20-0x80($key),%r12
	vpxor		$Z0,$T1,$inout0
	add		\$`6<<24`,$counter
	jc		.Lhandle_ctr32_2
	vpaddb		$T2,$T1,$inout1
	vpaddb		$T2,$inout1,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddb		$T2,$inout2,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddb		$T2,$inout3,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpaddb		$T2,$inout4,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpaddb		$T2,$inout5,$T1
	vpxor		$Z0,$inout5,$inout5
	jmp		.Loop_ctr32

.align	16
.Loop_ctr32:
	vaesenc		$rndkey,$inout0,$inout0
	vaesenc		$rndkey,$inout1,$inout1
	vaesenc		$rndkey,$inout2,$inout2
	vaesenc		$rndkey,$inout3,$inout3
	vaesenc		$rndkey,$inout4,$inout4
	vaesenc		$rndkey,$inout5,$inout5
	vmovups		(%r12),$rndkey
	lea		0x10(%r12),%r12
	dec		%r13d
	jnz		.Loop_ctr32

	vmovdqu		(%r12),$Hkey		# last round key
	vaesenc		$rndkey,$inout0,$inout0
	vpxor		0x00($inp),$Hkey,$Z0
	vaesenc		$rndkey,$inout1,$inout1
	vpxor		0x10($inp),$Hkey,$Z1
	vaesenc		$rndkey,$inout2,$inout2
	vpxor		0x20($inp),$Hkey,$Z2
	vaesenc		$rndkey,$inout3,$inout3
	vpxor		0x30($inp),$Hkey,$Xi
	vaesenc		$rndkey,$inout4,$inout4
	vpxor		0x40($inp),$Hkey,$T2
	vaesenc		$rndkey,$inout5,$inout5
	vpxor		0x50($inp),$Hkey,$Hkey
	lea		0x60($inp),$inp

	vaesenclast	$Z0,$inout0,$inout0
	vaesenclast	$Z1,$inout1,$inout1
	vaesenclast	$Z2,$inout2,$inout2
	vaesenclast	$Xi,$inout3,$inout3
	vaesenclast	$T2,$inout4,$inout4
	vaesenclast	$Hkey,$inout5,$inout5
	vmovups		$inout0,0x00($out)
	vmovups		$inout1,0x10($out)
	vmovups		$inout2,0x20($out)
	vmovups		$inout3,0x30($out)
	vmovups		$inout4,0x40($out)
	vmovups		$inout5,0x50($out)
	lea		0x60($out),$out

	ret
.align	32
.Lhandle_ctr32_2:
	# Counter低-byte wrap path: same 32-bit counter rebuild as
	# .Lhandle_ctr32 in the stitched loop, with round-0 XOR folded in.
	vpshufb		$Ii,$T1,$Z2		# byte-swap counter
	vmovdqu		0x30($const),$Z1	# borrow $Z1, .Ltwo_lsb
	vpaddd		0x40($const),$Z2,$inout1	# .Lone_lsb
	vpaddd		$Z1,$Z2,$inout2
	vpaddd		$Z1,$inout1,$inout3
	vpshufb		$Ii,$inout1,$inout1
	vpaddd		$Z1,$inout2,$inout4
	vpshufb		$Ii,$inout2,$inout2
	vpxor		$Z0,$inout1,$inout1
	vpaddd		$Z1,$inout3,$inout5
	vpshufb		$Ii,$inout3,$inout3
	vpxor		$Z0,$inout2,$inout2
	vpaddd		$Z1,$inout4,$T1		# byte-swapped next counter value
	vpshufb		$Ii,$inout4,$inout4
	vpxor		$Z0,$inout3,$inout3
	vpshufb		$Ii,$inout5,$inout5
	vpxor		$Z0,$inout4,$inout4
	vpshufb		$Ii,$T1,$T1		# next counter value
	vpxor		$Z0,$inout5,$inout5
	jmp	.Loop_ctr32
.size	_aesni_ctr32_6x,.-_aesni_ctr32_6x

# aesni_gcm_encrypt(inp, out, len, key, ivp, Xip): encrypt at least
# 3*0x60 bytes (two priming CTR passes + one stitched pass), returning
# bytes processed in %rax.
.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@function,6
.align	32
aesni_gcm_encrypt:
	xor	$ret,$ret
	cmp	\$0x60*3,$len			# minimal accepted length
	jb	.Lgcm_enc_abort

	lea	(%rsp),%rax			# save stack pointer
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
___
# Win64: preserve non-volatile %xmm6-%xmm15, mirroring the decrypt path.
$code.=<<___ if ($win64);
	lea	-0xa8(%rsp),%rsp
	movaps	%xmm6,-0xd8(%rax)
	movaps	%xmm7,-0xc8(%rax)
	movaps	%xmm8,-0xb8(%rax)
	movaps	%xmm9,-0xa8(%rax)
	movaps	%xmm10,-0x98(%rax)
	movaps	%xmm11,-0x88(%rax)
	movaps	%xmm12,-0x78(%rax)
	movaps	%xmm13,-0x68(%rax)
	movaps	%xmm14,-0x58(%rax)
	movaps	%xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
$code.=<<___;
	vzeroupper

	vmovdqu		($ivp),$T1		# input counter value
	add		\$-128,%rsp
	mov		12($ivp),$counter
	lea		.Lbswap_mask(%rip),$const
	lea		-0x80($key),$in0	# borrow $in0
	mov		\$0xf80,$end0		# borrow $end0
	lea		0x80($key),$key		# size optimization
	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
	and		\$-128,%rsp		# ensure stack alignment
	mov		0xf0-0x80($key),$rounds

	# Same key/stack anti-aliasing adjustment as the decrypt path.
	and		$end0,$in0
	and		%rsp,$end0
	sub		$in0,$end0
	jc		.Lenc_no_key_aliasing
	cmp		\$768,$end0
	jnc		.Lenc_no_key_aliasing
	sub		$end0,%rsp		# avoid aliasing with key
.Lenc_no_key_aliasing:

	lea		($out),$in0
	lea		-0xc0($out,$len),$end0
	shr		\$4,$len

	call		_aesni_ctr32_6x
	vpshufb		$Ii,$inout0,$Xi		# save bswapped output on stack
	vpshufb		$Ii,$inout1,$T2
	vmovdqu		$Xi,0x70(%rsp)
	vpshufb		$Ii,$inout2,$Z0
	vmovdqu		$T2,0x60(%rsp)
	vpshufb		$Ii,$inout3,$Z1
	vmovdqu		$Z0,0x50(%rsp)
	vpshufb		$Ii,$inout4,$Z2
	vmovdqu		$Z1,0x40(%rsp)
	vpshufb		$Ii,$inout5,$Z3		# passed to _aesni_ctr32_ghash_6x
	vmovdqu		$Z2,0x30(%rsp)

	call		_aesni_ctr32_6x

	vmovdqu		($Xip),$Xi		# load Xi
	lea		0x20+0x20($Xip),$Xip	# size optimization
	sub		\$12,$len
	mov		\$0x60*2,$ret
	vpshufb		$Ii,$Xi,$Xi

	call		_aesni_ctr32_ghash_6x
	vmovdqu		0x20(%rsp),$Z3		# I[5]
	 vmovdqu	($const),$Ii		# borrow $Ii for .Lbswap_mask
	vmovdqu		0x00-0x20($Xip),$Hkey	# $Hkey^1
	vpunpckhqdq	$Z3,$Z3,$T1
	vmovdqu		0x20-0x20($Xip),$rndkey	# borrow $rndkey for $HK
	 vmovups	$inout0,-0x60($out)	# save output
	 vpshufb	$Ii,$inout0,$inout0	# but keep bswapped copy
	vpxor		$Z3,$T1,$T1
	 vmovups	$inout1,-0x50($out)
	 vpshufb	$Ii,$inout1,$inout1
	 vmovups	$inout2,-0x40($out)
	 vpshufb	$Ii,$inout2,$inout2
	 vmovups	$inout3,-0x30($out)
	 vpshufb	$Ii,$inout3,$inout3
	 vmovups	$inout4,-0x20($out)
	 vpshufb	$Ii,$inout4,$inout4
	 vmovups	$inout5,-0x10($out)
	 vpshufb	$Ii,$inout5,$inout5
	 vmovdqu	$inout0,0x10(%rsp)	# free $inout0
___
# Final-8-blocks tail of the encrypt path.  The AES round state is no
# longer needed here, so $rndkey and $inout0 are re-aliased as $HK (the
# Karatsuba "key xor" halves loaded from 0x20/0x50/0x80-0x20($Xip)) and
# $T3 (scratch) for the remaining GHASH work.
{ my ($HK,$T3)=($rndkey,$inout0);

# Two back-to-back 8-block GHASH passes, both using aggregated Karatsuba
# post-processing: first over the buffered input blocks I[7]..I[0] saved
# at 0x10..0x70(%rsp), then over the just-computed bswapped ciphertext
# copies ($inout5..$inout1 plus the accumulator), each followed by the
# two-phase reduction with the .Lpoly constant at 0x10($const).
# NOTE(review): instruction streams are interleaved by hand (the one- and
# two-space offsets mark the separate dependency chains) — do not reorder.
$code.=<<___;
	 vmovdqu	0x30(%rsp),$Z2		# I[4]
	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	 vpunpckhqdq	$Z2,$Z2,$T2
	vpclmulqdq	\$0x00,$Hkey,$Z3,$Z1
	 vpxor		$Z2,$T2,$T2
	vpclmulqdq	\$0x11,$Hkey,$Z3,$Z3
	vpclmulqdq	\$0x00,$HK,$T1,$T1

	 vmovdqu	0x40(%rsp),$T3		# I[3]
	vpclmulqdq	\$0x00,$Ii,$Z2,$Z0
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$T3,$T3,$Z1
	vpclmulqdq	\$0x11,$Ii,$Z2,$Z2
	 vpxor		$T3,$Z1,$Z1
	vpxor		$Z3,$Z2,$Z2
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	 vmovdqu	0x50(%rsp),$T1		# I[2]
	vpclmulqdq	\$0x00,$Hkey,$T3,$Z3
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z0,$Z3,$Z3
	 vpunpckhqdq	$T1,$T1,$Z0
	vpclmulqdq	\$0x11,$Hkey,$T3,$T3
	 vpxor		$T1,$Z0,$Z0
	vpxor		$Z2,$T3,$T3
	vpclmulqdq	\$0x00,$HK,$Z1,$Z1
	vpxor		$T2,$Z1,$Z1

	 vmovdqu	0x60(%rsp),$T2		# I[1]
	vpclmulqdq	\$0x00,$Ii,$T1,$Z2
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z3,$Z2,$Z2
	 vpunpckhqdq	$T2,$T2,$Z3
	vpclmulqdq	\$0x11,$Ii,$T1,$T1
	 vpxor		$T2,$Z3,$Z3
	vpxor		$T3,$T1,$T1
	vpclmulqdq	\$0x10,$HK,$Z0,$Z0
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$Z1,$Z0,$Z0

	 vpxor		0x70(%rsp),$Xi,$Xi	# accumulate I[0]
	vpclmulqdq	\$0x00,$Hkey,$T2,$Z1
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpxor		$Z2,$Z1,$Z1
	vpclmulqdq	\$0x11,$Hkey,$T2,$T2
	 vpxor		$Xi,$T3,$T3
	vpxor		$T1,$T2,$T2
	vpclmulqdq	\$0x00,$HK,$Z3,$Z3
	vpxor		$Z0,$Z3,$Z0

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z2
	 vmovdqu	0x00-0x20($Xip),$Hkey	# $Hkey^1
	 vpunpckhqdq	$inout5,$inout5,$T1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Xi
	 vpxor		$inout5,$T1,$T1
	vpxor		$Z1,$Z2,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$T3
	 vmovdqu	0x20-0x20($Xip),$HK
	vpxor		$T2,$Xi,$Z3
	vpxor		$Z0,$T3,$Z2

	 vmovdqu	0x10-0x20($Xip),$Ii	# borrow $Ii for $Hkey^2
	  vpxor		$Z1,$Z3,$T3		# aggregated Karatsuba post-processing
	vpclmulqdq	\$0x00,$Hkey,$inout5,$Z0
	  vpxor		$T3,$Z2,$Z2
	 vpunpckhqdq	$inout4,$inout4,$T2
	vpclmulqdq	\$0x11,$Hkey,$inout5,$inout5
	 vpxor		$inout4,$T2,$T2
	  vpslldq	\$8,$Z2,$T3
	vpclmulqdq	\$0x00,$HK,$T1,$T1
	  vpxor		$T3,$Z1,$Xi
	  vpsrldq	\$8,$Z2,$Z2
	  vpxor		$Z2,$Z3,$Z3

	vpclmulqdq	\$0x00,$Ii,$inout4,$Z1
	 vmovdqu	0x30-0x20($Xip),$Hkey	# $Hkey^3
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout3,$inout3,$T3
	vpclmulqdq	\$0x11,$Ii,$inout4,$inout4
	 vpxor		$inout3,$T3,$T3
	vpxor		$inout5,$inout4,$inout4
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 1st phase
	vpclmulqdq	\$0x10,$HK,$T2,$T2
	 vmovdqu	0x50-0x20($Xip),$HK
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Hkey,$inout3,$Z0
	 vmovdqu	0x40-0x20($Xip),$Ii	# borrow $Ii for $Hkey^4
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$inout2,$inout2,$T1
	vpclmulqdq	\$0x11,$Hkey,$inout3,$inout3
	 vpxor		$inout2,$T1,$T1
	vpxor		$inout4,$inout3,$inout3
	  vxorps	0x10(%rsp),$Z3,$Z3	# accumulate $inout0
	vpclmulqdq	\$0x00,$HK,$T3,$T3
	vpxor		$T2,$T3,$T3

	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Ii,$inout2,$Z1
	 vmovdqu	0x60-0x20($Xip),$Hkey	# $Hkey^5
	vpxor		$Z0,$Z1,$Z1
	 vpunpckhqdq	$inout1,$inout1,$T2
	vpclmulqdq	\$0x11,$Ii,$inout2,$inout2
	 vpxor		$inout1,$T2,$T2
	  vpalignr	\$8,$Xi,$Xi,$inout5	# 2nd phase
	vpxor		$inout3,$inout2,$inout2
	vpclmulqdq	\$0x10,$HK,$T1,$T1
	 vmovdqu	0x80-0x20($Xip),$HK
	vpxor		$T3,$T1,$T1

	  vxorps	$Z3,$inout5,$inout5
	  vpclmulqdq	\$0x10,0x10($const),$Xi,$Xi
	  vxorps	$inout5,$Xi,$Xi

	vpclmulqdq	\$0x00,$Hkey,$inout1,$Z0
	 vmovdqu	0x70-0x20($Xip),$Ii	# borrow $Ii for $Hkey^6
	vpxor		$Z1,$Z0,$Z0
	 vpunpckhqdq	$Xi,$Xi,$T3
	vpclmulqdq	\$0x11,$Hkey,$inout1,$inout1
	 vpxor		$Xi,$T3,$T3
	vpxor		$inout2,$inout1,$inout1
	vpclmulqdq	\$0x00,$HK,$T2,$T2
	vpxor		$T1,$T2,$T2

	vpclmulqdq	\$0x00,$Ii,$Xi,$Z1
	vpclmulqdq	\$0x11,$Ii,$Xi,$Z3
	vpxor		$Z0,$Z1,$Z1
	vpclmulqdq	\$0x10,$HK,$T3,$Z2
	vpxor		$inout1,$Z3,$Z3
	vpxor		$T2,$Z2,$Z2

	vpxor		$Z1,$Z3,$Z0		# aggregated Karatsuba post-processing
	vpxor		$Z0,$Z2,$Z2
	vpslldq		\$8,$Z2,$T1
	vmovdqu		0x10($const),$Hkey	# .Lpoly
	vpsrldq		\$8,$Z2,$Z2
	vpxor		$T1,$Z1,$Xi
	vpxor		$Z2,$Z3,$Z3

	vpalignr	\$8,$Xi,$Xi,$T2		# 1st phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$T2,$Xi,$Xi

	vpalignr	\$8,$Xi,$Xi,$T2		# 2nd phase
	vpclmulqdq	\$0x10,$Hkey,$Xi,$Xi
	vpxor		$Z3,$T2,$T2
	vpxor		$T2,$Xi,$Xi
___
}
# Convert the accumulated hash back to GCM byte order, store it at
# -0x40($Xip), and clear the upper YMM halves before returning to
# possibly-SSE caller code.
$code.=<<___;
	vpshufb		($const),$Xi,$Xi	# .Lbswap_mask
	vmovdqu		$Xi,-0x40($Xip)		# output Xi

	vzeroupper
___
# Win64 only: restore the callee-saved %xmm6..%xmm15 that the prologue
# spilled below the frame anchor in %rax.
$code.=<<___ if ($win64);
	movaps	-0xd8(%rax),%xmm6
	movaps	-0xc8(%rax),%xmm7
	movaps	-0xb8(%rax),%xmm8
	movaps	-0xa8(%rax),%xmm9
	movaps	-0x98(%rax),%xmm10
	movaps	-0x88(%rax),%xmm11
	movaps	-0x78(%rax),%xmm12
	movaps	-0x68(%rax),%xmm13
	movaps	-0x58(%rax),%xmm14
	movaps	-0x48(%rax),%xmm15
___
# Common epilogue: pop the six callee-saved GPRs from the frame, restore
# the caller's %rsp from %rax, and place $ret in %rax as the return value.
# .Lgcm_enc_abort is also the early-exit target (see SEH info below).
$code.=<<___;
	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	lea	(%rax),%rsp		# restore %rsp
.Lgcm_enc_abort:
	mov	$ret,%rax		# return value
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt
___
896
# Constant pool shared by both entry points: the byte-swap mask, the GHASH
# reduction constant (0xc2 in the top byte of .Lpoly), and three counter
# increment patterns (.Lone_msb/.Ltwo_lsb/.Lone_lsb — referenced by code
# outside this view).  64-byte alignment keeps them on one cache line.
$code.=<<___;
.align	64
.Lbswap_mask:
	.byte	15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
	.byte	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
	.byte	2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
	.byte	1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz	"AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___
# Win64 builds additionally get a structured-exception handler that unwinds
# the custom stack frame (GPRs and %xmm6-15 saved relative to the frame
# anchor), plus the .pdata/.xdata records tying it to both entry points.
if ($win64) {
$rec="%rcx";		# 1st arg: ExceptionRecord
$frame="%rdx";		# 2nd arg: EstablisherFrame
$context="%r8";		# 3rd arg: ContextRecord
$disp="%r9";		# 4th arg: DispatcherContext

# NOTE: no semicolon after <<___ — this heredoc is the last statement in
# the enclosing block, terminated by the closing brace below.
$code.=<<___
.extern	__imp_RtlVirtualUnwind
.type	gcm_se_handler,\@abi-omnipotent
.align	16
gcm_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	120($context),%rax	# pull context->Rax

	mov	-48(%rax),%r15
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	gcm_se_handler,.-gcm_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_aesni_gcm_decrypt
	.rva	.LSEH_end_aesni_gcm_decrypt
	.rva	.LSEH_gcm_dec_info

	.rva	.LSEH_begin_aesni_gcm_encrypt
	.rva	.LSEH_end_aesni_gcm_encrypt
	.rva	.LSEH_gcm_enc_info
.section	.xdata
.align	8
.LSEH_gcm_dec_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
	.byte	9,0,0,0
	.rva	gcm_se_handler
	.rva	.Lgcm_enc_body,.Lgcm_enc_abort
___
}
1033 }}} else {{{
# Fallback for assemblers too old for the AVX/PCLMULQDQ code path: emit
# stub aesni_gcm_{en,de}crypt that just zero %eax and return.  Presumably
# callers interpret the 0 return as "nothing processed" and fall back to
# the generic implementation — confirm against the C-side callers.
$code=<<___;	# assembler is too old
.text

.globl	aesni_gcm_encrypt
.type	aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_encrypt,.-aesni_gcm_encrypt

.globl	aesni_gcm_decrypt
.type	aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
	xor	%eax,%eax
	ret
.size	aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
1051 }}}
1052
# Post-process the accumulated text: evaluate every `...` snippet as Perl
# and splice in its result, then emit the finished assembly on STDOUT.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;

print $code;

# STDOUT is a write handle: buffered write errors (full disk, closed pipe)
# only surface at close, so the result must be checked or they are lost.
close STDOUT or die "error closing STDOUT: $!";