3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
11 # AES-NI-CTR+GHASH stitch.
15 # OpenSSL GCM implementation is organized in such way that its
16 # performance is rather close to the sum of its streamed components,
17 # in the context parallelized AES-NI CTR and modulo-scheduled
18 # PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
19 # was observed to perform significantly better than the sum of the
20 # components on contemporary CPUs, the effort was deemed impossible to
21 # justify. This module is based on combination of Intel submissions,
22 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
23 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
24 # pressure with notable relative improvement, achieving 1.0 cycle per
25 # byte processed with 128-bit key on Haswell processor.
27 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
28 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
# --- build-time setup (standard OpenSSL perlasm preamble) -------------
# $flavour selects output dialect (elf/macosx/mingw64/nasm/masm);
# $output is the target file.  NOTE(review): sampled view — some lines
# of the original preamble are not visible here.
32 if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
# Win64 calling convention / object format is implied by flavour or extension.
34 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the x86_64-xlate.pl translator next to this script or in ../../perlasm.
36 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
37 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
38 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
39 die "can't locate x86_64-xlate.pl";
# Probe the assembler version to decide how much AVX to emit:
# $avx = 0 (none), 1 (AVX), 2 (AVX2-capable toolchain).
41 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
42 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
43 	$avx = ($1>=2.19) + ($1>=2.22);
# Same probe for nasm (Windows) ...
46 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
47 	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
48 	$avx = ($1>=2.09) + ($1>=2.10);
# ... and for MASM's ml64.
51 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
52 	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
53 	$avx = ($1>=10) + ($1>=11);
# Pipe generated code through the translator into $output.
56 open OUT,"| \"$^X\" $xlate $flavour $output";
# --- register assignments (SysV AMD64 argument order) -----------------
# aesni_gcm_*(inp=rdi, out=rsi, len=rdx, key=rcx, ivp=r8, Xip=r9)
61 ($inp,$out,$len,$key,$ivp,$Xip)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
# xmm0..8: GHASH working set.  NOTE(review): the opening of this list
# ($T1,$T2,$Hkey,$Ii,...) is on lines not visible in this view.
64 $Z0,$Z1,$Z2,$Z3,$Xi) = map("%xmm$_",(0..8));
# xmm9..15: six AES pipelines plus the current round key.
66 ($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map("%xmm$_",(9..15));
# GP scratch: counter LSW, round count, return value, const-table base,
# input pointer and end-of-input sentinel.
68 ($counter,$rounds,$ret,$const,$in0,$end0)=("%ebx","%ebp","%r10","%r11","%r14","%r15");
# ----------------------------------------------------------------------
# _aesni_ctr32_ghash_6x
# Core of the stitched AES-NI-CTR + PCLMULQDQ-GHASH loop: encrypts six
# counter blocks while folding six staged ciphertext blocks
# (at 0x20+8(%rsp)..0x70+8(%rsp)) into the GHASH accumulator $Xi,
# interleaving vaesenc with vpclmulqdq to hide latencies.
# NOTE(review): sampled view — loop labels (.Loop6x etc.), the heredoc
# markers and several connective lines are not visible here; comments
# annotate only what is shown.
73 .type _aesni_ctr32_ghash_6x,\@abi-omnipotent
75 _aesni_ctr32_ghash_6x:
# Derive six consecutive counter blocks by byte-adds of .Lone_msb,
# and pre-whiten block 0 with round key 0.
76 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
78 vpxor $Z0,$Z0,$Z0 # $Z0 = 0
79 vmovdqu 0x00-0x80($key),$rndkey
80 vpaddb $T2,$T1,$inout1
81 vpaddb $T2,$inout1,$inout2
82 vpaddb $T2,$inout2,$inout3
83 vpaddb $T2,$inout3,$inout4
84 vpaddb $T2,$inout4,$inout5
85 vpxor $rndkey,$T1,$inout0
86 vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
# If the 32-bit counter wrapped, fall to .Lhandle_ctr32 (slow path with
# proper 32-bit big-endian increments); otherwise keep byte-add results.
92 jc .Lhandle_ctr32 # discard $inout[1-5]?
93 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
94 vpaddb $T2,$inout5,$T1 # next counter value
95 vpxor $rndkey,$inout1,$inout1
96 vpxor $rndkey,$inout2,$inout2
99 vmovdqu $T1,($ivp) # save next counter value
# GHASH block 5 (I[5] in $Z3) against H^1, Karatsuba-style partials.
100 vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
101 vpxor $rndkey,$inout3,$inout3
102 vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
103 vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
# AES round 1 interleaved with GHASH of I[5]/I[4].
107 vaesenc $T2,$inout0,$inout0
108 vmovdqu 0x30+8(%rsp),$Ii # I[4]
109 vpxor $rndkey,$inout4,$inout4
110 vpclmulqdq \$0x00,$Hkey,$Z3,$T1
111 vaesenc $T2,$inout1,$inout1
112 vpxor $rndkey,$inout5,$inout5
114 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
115 vaesenc $T2,$inout2,$inout2
116 vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
118 vaesenc $T2,$inout3,$inout3
120 vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
121 vpxor $Z0,$Xi,$Xi # modulo-scheduled
122 vaesenc $T2,$inout4,$inout4
125 vmovups 0x20-0x80($key),$rndkey
126 vpclmulqdq \$0x10,$Hkey,$Ii,$T1
127 vaesenc $T2,$inout5,$inout5
129 vpclmulqdq \$0x01,$Hkey,$Ii,$T2
# AES round 2; also stage next iteration's input words via MOVBE
# (byte-swapped loads of $in0) into the stack slots GHASH will consume.
131 vaesenc $rndkey,$inout0,$inout0
132 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
133 vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
134 vmovdqu 0x40+8(%rsp),$Ii # I[3]
135 vaesenc $rndkey,$inout1,$inout1
136 movbe 0x58($in0),%r13
137 vaesenc $rndkey,$inout2,$inout2
138 movbe 0x50($in0),%r12
139 vaesenc $rndkey,$inout3,$inout3
140 mov %r13,0x20+8(%rsp)
141 vaesenc $rndkey,$inout4,$inout4
142 mov %r12,0x28+8(%rsp)
143 vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
144 vaesenc $rndkey,$inout5,$inout5
# AES round 3 / GHASH I[3] x H^3.
146 vmovups 0x30-0x80($key),$rndkey
148 vpclmulqdq \$0x00,$Z1,$Ii,$T1
149 vaesenc $rndkey,$inout0,$inout0
151 vpclmulqdq \$0x10,$Z1,$Ii,$T2
152 vaesenc $rndkey,$inout1,$inout1
154 vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
155 vaesenc $rndkey,$inout2,$inout2
156 vpclmulqdq \$0x11,$Z1,$Ii,$Z1
157 vmovdqu 0x50+8(%rsp),$Ii # I[2]
158 vaesenc $rndkey,$inout3,$inout3
159 vaesenc $rndkey,$inout4,$inout4
161 vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
162 vaesenc $rndkey,$inout5,$inout5
# AES round 4 / GHASH I[2] x H^4.
164 vmovups 0x40-0x80($key),$rndkey
166 vpclmulqdq \$0x00,$T1,$Ii,$T2
167 vaesenc $rndkey,$inout0,$inout0
169 vpclmulqdq \$0x10,$T1,$Ii,$Hkey
170 vaesenc $rndkey,$inout1,$inout1
171 movbe 0x48($in0),%r13
173 vpclmulqdq \$0x01,$T1,$Ii,$Z1
174 vaesenc $rndkey,$inout2,$inout2
175 movbe 0x40($in0),%r12
176 vpclmulqdq \$0x11,$T1,$Ii,$T1
177 vmovdqu 0x60+8(%rsp),$Ii # I[1]
178 vaesenc $rndkey,$inout3,$inout3
179 mov %r13,0x30+8(%rsp)
180 vaesenc $rndkey,$inout4,$inout4
181 mov %r12,0x38+8(%rsp)
183 vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
184 vaesenc $rndkey,$inout5,$inout5
# AES round 5 / GHASH I[1] x H^5; I[0] is folded straight into $Xi.
186 vmovups 0x50-0x80($key),$rndkey
188 vpclmulqdq \$0x00,$T2,$Ii,$Hkey
189 vaesenc $rndkey,$inout0,$inout0
191 vpclmulqdq \$0x10,$T2,$Ii,$Z1
192 vaesenc $rndkey,$inout1,$inout1
193 movbe 0x38($in0),%r13
195 vpclmulqdq \$0x01,$T2,$Ii,$T1
196 vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
197 vaesenc $rndkey,$inout2,$inout2
198 movbe 0x30($in0),%r12
199 vpclmulqdq \$0x11,$T2,$Ii,$T2
200 vaesenc $rndkey,$inout3,$inout3
201 mov %r13,0x40+8(%rsp)
202 vaesenc $rndkey,$inout4,$inout4
203 mov %r12,0x48+8(%rsp)
205 vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
206 vaesenc $rndkey,$inout5,$inout5
# AES round 6 / GHASH (Xi + I[0]) x H^6 completes the 6-block fold.
208 vmovups 0x60-0x80($key),$rndkey
210 vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
211 vaesenc $rndkey,$inout0,$inout0
213 vpclmulqdq \$0x01,$Hkey,$Xi,$T1
214 vaesenc $rndkey,$inout1,$inout1
215 movbe 0x28($in0),%r13
217 vpclmulqdq \$0x00,$Hkey,$Xi,$T2
218 vaesenc $rndkey,$inout2,$inout2
219 movbe 0x20($in0),%r12
220 vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
221 vaesenc $rndkey,$inout3,$inout3
222 mov %r13,0x50+8(%rsp)
223 vaesenc $rndkey,$inout4,$inout4
224 mov %r12,0x58+8(%rsp)
226 vaesenc $rndkey,$inout5,$inout5
# AES rounds 7..9 interleaved with the first phase of the GHASH
# reduction modulo .Lpoly.
229 vmovups 0x70-0x80($key),$rndkey
232 vmovdqu 0x10($const),$Hkey # .Lpoly
234 vaesenc $rndkey,$inout0,$inout0
236 vaesenc $rndkey,$inout1,$inout1
238 movbe 0x18($in0),%r13
239 vaesenc $rndkey,$inout2,$inout2
240 movbe 0x10($in0),%r12
241 vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
242 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
243 mov %r13,0x60+8(%rsp)
244 vaesenc $rndkey,$inout3,$inout3
245 mov %r12,0x68+8(%rsp)
246 vaesenc $rndkey,$inout4,$inout4
247 vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
248 vaesenc $rndkey,$inout5,$inout5
250 vaesenc $T1,$inout0,$inout0
251 vmovups 0x90-0x80($key),$rndkey
252 vaesenc $T1,$inout1,$inout1
254 vaesenc $T1,$inout2,$inout2
256 vaesenc $T1,$inout3,$inout3
258 movbe 0x08($in0),%r13
259 vaesenc $T1,$inout4,$inout4
260 movbe 0x00($in0),%r12
261 vaesenc $T1,$inout5,$inout5
262 vmovups 0xa0-0x80($key),$T1
# Key-length dispatch: 10 rounds done for AES-128, two more pairs of
# rounds for AES-192 / AES-256 before the common .Lenc_tail.
264 jb .Lenc_tail # 128-bit key
266 vaesenc $rndkey,$inout0,$inout0
267 vaesenc $rndkey,$inout1,$inout1
268 vaesenc $rndkey,$inout2,$inout2
269 vaesenc $rndkey,$inout3,$inout3
270 vaesenc $rndkey,$inout4,$inout4
271 vaesenc $rndkey,$inout5,$inout5
273 vaesenc $T1,$inout0,$inout0
274 vaesenc $T1,$inout1,$inout1
275 vaesenc $T1,$inout2,$inout2
276 vaesenc $T1,$inout3,$inout3
277 vaesenc $T1,$inout4,$inout4
278 vmovups 0xb0-0x80($key),$rndkey
279 vaesenc $T1,$inout5,$inout5
280 vmovups 0xc0-0x80($key),$T1
281 je .Lenc_tail # 192-bit key
283 vaesenc $rndkey,$inout0,$inout0
284 vaesenc $rndkey,$inout1,$inout1
285 vaesenc $rndkey,$inout2,$inout2
286 vaesenc $rndkey,$inout3,$inout3
287 vaesenc $rndkey,$inout4,$inout4
288 vaesenc $rndkey,$inout5,$inout5
290 vaesenc $T1,$inout0,$inout0
291 vaesenc $T1,$inout1,$inout1
292 vaesenc $T1,$inout2,$inout2
293 vaesenc $T1,$inout3,$inout3
294 vaesenc $T1,$inout4,$inout4
295 vmovups 0xd0-0x80($key),$rndkey
296 vaesenc $T1,$inout5,$inout5
297 vmovups 0xe0-0x80($key),$T1
298 jmp .Lenc_tail # 256-bit key
# .Lhandle_ctr32 slow path: counter LSW is about to wrap, so rebuild
# the six counters with proper 32-bit little-endian adds + byte swaps.
302 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
303 vpshufb $Ii,$T1,$Z2 # byte-swap counter
304 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
305 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
306 vpaddd $Z1,$Z2,$inout2
307 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
308 vpaddd $Z1,$inout1,$inout3
309 vpshufb $Ii,$inout1,$inout1
310 vpaddd $Z1,$inout2,$inout4
311 vpshufb $Ii,$inout2,$inout2
312 vpxor $rndkey,$inout1,$inout1
313 vpaddd $Z1,$inout3,$inout5
314 vpshufb $Ii,$inout3,$inout3
315 vpxor $rndkey,$inout2,$inout2
316 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
317 vpshufb $Ii,$inout4,$inout4
318 vpshufb $Ii,$inout5,$inout5
319 vpshufb $Ii,$T1,$T1 # next counter value
# .Lenc_tail: last round.  XOR the input with the pre-last-round-key
# masked plaintext, finish the 2nd phase of the reduction, and emit the
# six output blocks while seeding the next iteration.
324 vaesenc $rndkey,$inout0,$inout0
325 vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
326 vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
327 vaesenc $rndkey,$inout1,$inout1
328 vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
329 vpxor 0x00($inp),$T1,$T2
330 vaesenc $rndkey,$inout2,$inout2
331 vpxor 0x10($inp),$T1,$Ii
332 vaesenc $rndkey,$inout3,$inout3
333 vpxor 0x20($inp),$T1,$Z1
334 vaesenc $rndkey,$inout4,$inout4
335 vpxor 0x30($inp),$T1,$Z2
336 vaesenc $rndkey,$inout5,$inout5
337 vpxor 0x40($inp),$T1,$Z3
338 vpxor 0x50($inp),$T1,$Hkey
339 vmovdqu ($ivp),$T1 # load next counter value
341 vaesenclast $T2,$inout0,$inout0
342 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
343 vaesenclast $Ii,$inout1,$inout1
345 mov %r13,0x70+8(%rsp)
347 vaesenclast $Z1,$inout2,$inout2
349 mov %r12,0x78+8(%rsp)
351 vmovdqu 0x00-0x80($key),$rndkey
352 vaesenclast $Z2,$inout3,$inout3
354 vaesenclast $Z3, $inout4,$inout4
356 vaesenclast $Hkey,$inout5,$inout5
# Store the six output blocks and roll the registers for the next
# six-block iteration (0-latency register moves).
363 vmovups $inout0,-0x60($out) # save output
364 vpxor $rndkey,$T1,$inout0
365 vmovups $inout1,-0x50($out)
366 vmovdqa $Ii,$inout1 # 0 latency
367 vmovups $inout2,-0x40($out)
368 vmovdqa $Z1,$inout2 # 0 latency
369 vmovups $inout3,-0x30($out)
370 vmovdqa $Z2,$inout3 # 0 latency
371 vmovups $inout4,-0x20($out)
372 vmovdqa $Z3,$inout4 # 0 latency
373 vmovups $inout5,-0x10($out)
374 vmovdqa $Hkey,$inout5 # 0 latency
375 vmovdqu 0x20+8(%rsp),$Z3 # I[5]
# Final fold of the deferred $Z3 and reduced $Z0 into $Xi.
379 vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
380 vpxor $Z0,$Xi,$Xi # modulo-scheduled
383 .size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
385 ######################################################################
387 # size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
388 # const AES_KEY *key, unsigned char iv[16],
389 # struct { u128 Xi,H,Htbl[9]; } *Xip);
# ----------------------------------------------------------------------
# size_t aesni_gcm_decrypt(inp, out, len, key, iv[16], Xip)
# Decrypt path: stages the first six ciphertext blocks on the stack
# (byte-swapped, as GHASH input) and runs _aesni_ctr32_ghash_6x.
# NOTE(review): sampled view — the prologue pushes of callee-saved GP
# registers and several connective lines are not visible here.
391 .globl aesni_gcm_decrypt
392 .type aesni_gcm_decrypt,\@function,6
396 cmp \$0x60,$len # minimal accepted length
399 lea (%rsp),%rax # save stack pointer
# Win64 only: xmm6-xmm15 are callee-saved and must be preserved.
407 $code.=<<___ if ($win64);
409 movaps %xmm6,-0xd8(%rax)
410 movaps %xmm7,-0xc8(%rax)
411 movaps %xmm8,-0xb8(%rax)
412 movaps %xmm9,-0xa8(%rax)
413 movaps %xmm10,-0x98(%rax)
414 movaps %xmm11,-0x88(%rax)
415 movaps %xmm12,-0x78(%rax)
416 movaps %xmm13,-0x68(%rax)
417 movaps %xmm14,-0x58(%rax)
418 movaps %xmm15,-0x48(%rax)
# Load IV/counter, Xi and constants; rebase $key/$Xip for shorter
# displacement encodings ("size optimization").
424 vmovdqu ($ivp),$T1 # input counter value
426 mov 12($ivp),$counter
427 lea .Lbswap_mask(%rip),$const
428 lea -0x80($key),$in0 # borrow $in0
429 mov \$0xf80,$end0 # borrow $end0
430 vmovdqu ($Xip),$Xi # load Xi
431 and \$-128,%rsp # ensure stack alignment
432 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
433 lea 0x80($key),$key # size optimization
434 lea 0x20+0x20($Xip),$Xip # size optimization
435 mov 0xf0-0x80($key),$rounds
# Avoid cache-set aliasing between the stack frame and the key schedule
# (comparison instructions between the two jumps are not visible here).
441 jc .Ldec_no_key_aliasing
443 jnc .Ldec_no_key_aliasing
444 sub $end0,%rsp # avoid aliasing with key
445 .Ldec_no_key_aliasing:
# Stage the first six ciphertext blocks (byte-swapped) at
# 0x20..0x70(%rsp) as GHASH input for the stitched loop.
447 vmovdqu 0x50($inp),$Z3 # I[5]
449 vmovdqu 0x40($inp),$Z0
450 lea -0xc0($inp,$len),$end0
451 vmovdqu 0x30($inp),$Z1
454 vmovdqu 0x20($inp),$Z2
455 vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
456 vmovdqu 0x10($inp),$T2
460 vmovdqu $Z0,0x30(%rsp)
462 vmovdqu $Z1,0x40(%rsp)
464 vmovdqu $Z2,0x50(%rsp)
465 vpshufb $Ii,$Hkey,$Hkey
466 vmovdqu $T2,0x60(%rsp)
467 vmovdqu $Hkey,0x70(%rsp)
469 call _aesni_ctr32_ghash_6x
# Flush the last six plaintext blocks and write back the byte-swapped Xi.
471 vmovups $inout0,-0x60($out) # save output
472 vmovups $inout1,-0x50($out)
473 vmovups $inout2,-0x40($out)
474 vmovups $inout3,-0x30($out)
475 vmovups $inout4,-0x20($out)
476 vmovups $inout5,-0x10($out)
478 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
479 vmovdqu $Xi,-0x40($Xip) # output Xi
# Win64 epilogue: restore callee-saved xmm6-xmm15 from the save area
# below the original stack pointer (%rax).  Offsets must mirror the
# prologue saves: xmm6@-0xd8, xmm7@-0xc8, ... xmm15@-0x48.
483 $code.=<<___ if ($win64);
484 movaps -0xd8(%rax),%xmm6
485 movaps -0xc8(%rax),%xmm7
486 movaps -0xb8(%rax),%xmm8
487 movaps -0xa8(%rax),%xmm9
488 movaps -0x98(%rax),%xmm10
489 movaps -0x88(%rax),%xmm11
490 movaps -0x78(%rax),%xmm12
491 movaps -0x68(%rax),%xmm13
492 movaps -0x58(%rax),%xmm14
493 movaps -0x48(%rax),%xmm15
# Common epilogue: unwind the frame and return the number of bytes
# processed in %rax (accumulated in $ret; GP pops not visible here).
502 lea (%rax),%rsp # restore %rsp
504 mov $ret,%rax # return value
506 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# ----------------------------------------------------------------------
# _aesni_ctr32_6x
# Plain (non-stitched) encryption of six counter blocks; used by the
# encrypt path to prime the pipeline before the stitched loop takes
# over.  NOTE(review): sampled view — the round-loop label and key-
# length handling between round dispatch lines are not visible here.
510 .type _aesni_ctr32_6x,\@abi-omnipotent
# Build six counters via byte-adds of .Lone_msb and whiten with key 0.
513 vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
514 vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
516 vmovups 0x10-0x80($key),$rndkey
517 lea 0x20-0x80($key),%r12
518 vpxor $Z0,$T1,$inout0
521 vpaddb $T2,$T1,$inout1
522 vpaddb $T2,$inout1,$inout2
523 vpxor $Z0,$inout1,$inout1
524 vpaddb $T2,$inout2,$inout3
525 vpxor $Z0,$inout2,$inout2
526 vpaddb $T2,$inout3,$inout4
527 vpxor $Z0,$inout3,$inout3
528 vpaddb $T2,$inout4,$inout5
529 vpxor $Z0,$inout4,$inout4
530 vpaddb $T2,$inout5,$T1
531 vpxor $Z0,$inout5,$inout5
# One AES round across all six pipelines; %r12 walks the key schedule.
536 vaesenc $rndkey,$inout0,$inout0
537 vaesenc $rndkey,$inout1,$inout1
538 vaesenc $rndkey,$inout2,$inout2
539 vaesenc $rndkey,$inout3,$inout3
540 vaesenc $rndkey,$inout4,$inout4
541 vaesenc $rndkey,$inout5,$inout5
542 vmovups (%r12),$rndkey
# Last round: pre-XOR the input blocks with the final round key, then
# vaesenclast combines the two XORs in one instruction.
547 vmovdqu (%r12),$Hkey # last round key
548 vaesenc $rndkey,$inout0,$inout0
549 vpxor 0x00($inp),$Hkey,$Z0
550 vaesenc $rndkey,$inout1,$inout1
551 vpxor 0x10($inp),$Hkey,$Z1
552 vaesenc $rndkey,$inout2,$inout2
553 vpxor 0x20($inp),$Hkey,$Z2
554 vaesenc $rndkey,$inout3,$inout3
555 vpxor 0x30($inp),$Hkey,$Xi
556 vaesenc $rndkey,$inout4,$inout4
557 vpxor 0x40($inp),$Hkey,$T2
558 vaesenc $rndkey,$inout5,$inout5
559 vpxor 0x50($inp),$Hkey,$Hkey
562 vaesenclast $Z0,$inout0,$inout0
563 vaesenclast $Z1,$inout1,$inout1
564 vaesenclast $Z2,$inout2,$inout2
565 vaesenclast $Xi,$inout3,$inout3
566 vaesenclast $T2,$inout4,$inout4
567 vaesenclast $Hkey,$inout5,$inout5
# Emit the six ciphertext blocks.
568 vmovups $inout0,0x00($out)
569 vmovups $inout1,0x10($out)
570 vmovups $inout2,0x20($out)
571 vmovups $inout3,0x30($out)
572 vmovups $inout4,0x40($out)
573 vmovups $inout5,0x50($out)
# Counter-wrap slow path: rebuild counters with 32-bit adds + byte swap
# (mirrors .Lhandle_ctr32 in the stitched routine).
579 vpshufb $Ii,$T1,$Z2 # byte-swap counter
580 vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
581 vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
582 vpaddd $Z1,$Z2,$inout2
583 vpaddd $Z1,$inout1,$inout3
584 vpshufb $Ii,$inout1,$inout1
585 vpaddd $Z1,$inout2,$inout4
586 vpshufb $Ii,$inout2,$inout2
587 vpxor $Z0,$inout1,$inout1
588 vpaddd $Z1,$inout3,$inout5
589 vpshufb $Ii,$inout3,$inout3
590 vpxor $Z0,$inout2,$inout2
591 vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
592 vpshufb $Ii,$inout4,$inout4
593 vpxor $Z0,$inout3,$inout3
594 vpshufb $Ii,$inout5,$inout5
595 vpxor $Z0,$inout4,$inout4
596 vpshufb $Ii,$T1,$T1 # next counter value
597 vpxor $Z0,$inout5,$inout5
599 .size _aesni_ctr32_6x,.-_aesni_ctr32_6x
# ----------------------------------------------------------------------
# size_t aesni_gcm_encrypt(inp, out, len, key, iv[16], Xip)
# Encrypt path: needs 3x6 blocks minimum because GHASH runs one
# six-block batch behind encryption; the last 12 blocks' GHASH is
# finished after the stitched loop.  NOTE(review): sampled view — GP
# register pushes, the _aesni_ctr32_6x priming calls and loop labels
# are not visible here.
601 .globl aesni_gcm_encrypt
602 .type aesni_gcm_encrypt,\@function,6
606 cmp \$0x60*3,$len # minimal accepted length
609 lea (%rsp),%rax # save stack pointer
# Win64 only: preserve callee-saved xmm6-xmm15.
617 $code.=<<___ if ($win64);
619 movaps %xmm6,-0xd8(%rax)
620 movaps %xmm7,-0xc8(%rax)
621 movaps %xmm8,-0xb8(%rax)
622 movaps %xmm9,-0xa8(%rax)
623 movaps %xmm10,-0x98(%rax)
624 movaps %xmm11,-0x88(%rax)
625 movaps %xmm12,-0x78(%rax)
626 movaps %xmm13,-0x68(%rax)
627 movaps %xmm14,-0x58(%rax)
628 movaps %xmm15,-0x48(%rax)
# Load IV/counter and constants; rebase $key for shorter encodings.
634 vmovdqu ($ivp),$T1 # input counter value
636 mov 12($ivp),$counter
637 lea .Lbswap_mask(%rip),$const
638 lea -0x80($key),$in0 # borrow $in0
639 mov \$0xf80,$end0 # borrow $end0
640 lea 0x80($key),$key # size optimization
641 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
642 and \$-128,%rsp # ensure stack alignment
643 mov 0xf0-0x80($key),$rounds
# Avoid cache-set aliasing between stack and key schedule (comparisons
# between the two jumps are not visible in this view).
648 jc .Lenc_no_key_aliasing
650 jnc .Lenc_no_key_aliasing
651 sub $end0,%rsp # avoid aliasing with key
652 .Lenc_no_key_aliasing:
655 lea -0xc0($out,$len),$end0
# Stage byte-swapped copies of the just-produced ciphertext on the
# stack so the stitched loop can GHASH them one batch later.
659 vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
660 vpshufb $Ii,$inout1,$T2
661 vmovdqu $Xi,0x70(%rsp)
662 vpshufb $Ii,$inout2,$Z0
663 vmovdqu $T2,0x60(%rsp)
664 vpshufb $Ii,$inout3,$Z1
665 vmovdqu $Z0,0x50(%rsp)
666 vpshufb $Ii,$inout4,$Z2
667 vmovdqu $Z1,0x40(%rsp)
668 vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
669 vmovdqu $Z2,0x30(%rsp)
673 vmovdqu ($Xip),$Xi # load Xi
674 lea 0x20+0x20($Xip),$Xip # size optimization
679 call _aesni_ctr32_ghash_6x
# After the loop: store the last six ciphertext blocks and keep their
# byte-swapped copies in registers for the trailing GHASH below.
680 vmovdqu 0x20(%rsp),$Z3 # I[5]
681 vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
682 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
683 vpunpckhqdq $Z3,$Z3,$T1
684 vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
685 vmovups $inout0,-0x60($out) # save output
686 vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
688 vmovups $inout1,-0x50($out)
689 vpshufb $Ii,$inout1,$inout1
690 vmovups $inout2,-0x40($out)
691 vpshufb $Ii,$inout2,$inout2
692 vmovups $inout3,-0x30($out)
693 vpshufb $Ii,$inout3,$inout3
694 vmovups $inout4,-0x20($out)
695 vpshufb $Ii,$inout4,$inout4
696 vmovups $inout5,-0x10($out)
697 vpshufb $Ii,$inout5,$inout5
698 vmovdqu $inout0,0x10(%rsp) # free $inout0
# Trailing GHASH of the last 12 staged blocks using precomputed H^1..H^6
# with aggregated Karatsuba (one reduction per six blocks).
700 { my ($HK,$T3)=($rndkey,$inout0);
703 vmovdqu 0x30(%rsp),$Z2 # I[4]
704 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
705 vpunpckhqdq $Z2,$Z2,$T2
706 vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
708 vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
709 vpclmulqdq \$0x00,$HK,$T1,$T1
711 vmovdqu 0x40(%rsp),$T3 # I[3]
712 vpclmulqdq \$0x00,$Ii,$Z2,$Z0
713 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
715 vpunpckhqdq $T3,$T3,$Z1
716 vpclmulqdq \$0x11,$Ii,$Z2,$Z2
719 vpclmulqdq \$0x10,$HK,$T2,$T2
720 vmovdqu 0x50-0x20($Xip),$HK
723 vmovdqu 0x50(%rsp),$T1 # I[2]
724 vpclmulqdq \$0x00,$Hkey,$T3,$Z3
725 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
727 vpunpckhqdq $T1,$T1,$Z0
728 vpclmulqdq \$0x11,$Hkey,$T3,$T3
731 vpclmulqdq \$0x00,$HK,$Z1,$Z1
734 vmovdqu 0x60(%rsp),$T2 # I[1]
735 vpclmulqdq \$0x00,$Ii,$T1,$Z2
736 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
738 vpunpckhqdq $T2,$T2,$Z3
739 vpclmulqdq \$0x11,$Ii,$T1,$T1
742 vpclmulqdq \$0x10,$HK,$Z0,$Z0
743 vmovdqu 0x80-0x20($Xip),$HK
746 vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
747 vpclmulqdq \$0x00,$Hkey,$T2,$Z1
748 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
749 vpunpckhqdq $Xi,$Xi,$T3
751 vpclmulqdq \$0x11,$Hkey,$T2,$T2
754 vpclmulqdq \$0x00,$HK,$Z3,$Z3
757 vpclmulqdq \$0x00,$Ii,$Xi,$Z2
758 vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
759 vpunpckhqdq $inout5,$inout5,$T1
760 vpclmulqdq \$0x11,$Ii,$Xi,$Xi
761 vpxor $inout5,$T1,$T1
763 vpclmulqdq \$0x10,$HK,$T3,$T3
764 vmovdqu 0x20-0x20($Xip),$HK
# Second batch: the six register-resident (bswapped) output blocks,
# interleaved with the two-phase reduction of the first batch.
768 vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
769 vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
770 vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
772 vpunpckhqdq $inout4,$inout4,$T2
773 vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
774 vpxor $inout4,$T2,$T2
776 vpclmulqdq \$0x00,$HK,$T1,$T1
781 vpclmulqdq \$0x00,$Ii,$inout4,$Z1
782 vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
784 vpunpckhqdq $inout3,$inout3,$T3
785 vpclmulqdq \$0x11,$Ii,$inout4,$inout4
786 vpxor $inout3,$T3,$T3
787 vpxor $inout5,$inout4,$inout4
788 vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
789 vpclmulqdq \$0x10,$HK,$T2,$T2
790 vmovdqu 0x50-0x20($Xip),$HK
793 vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
794 vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
796 vpunpckhqdq $inout2,$inout2,$T1
797 vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
798 vpxor $inout2,$T1,$T1
799 vpxor $inout4,$inout3,$inout3
800 vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
801 vpclmulqdq \$0x00,$HK,$T3,$T3
# Reduction phase 1: multiply low half by .Lpoly (at 0x10($const)).
804 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
805 vxorps $inout5,$Xi,$Xi
807 vpclmulqdq \$0x00,$Ii,$inout2,$Z1
808 vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
810 vpunpckhqdq $inout1,$inout1,$T2
811 vpclmulqdq \$0x11,$Ii,$inout2,$inout2
812 vpxor $inout1,$T2,$T2
813 vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
814 vpxor $inout3,$inout2,$inout2
815 vpclmulqdq \$0x10,$HK,$T1,$T1
816 vmovdqu 0x80-0x20($Xip),$HK
819 vxorps $Z3,$inout5,$inout5
# Reduction phase 2: second .Lpoly multiply finishes mod-p fold.
820 vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
821 vxorps $inout5,$Xi,$Xi
823 vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
824 vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
826 vpunpckhqdq $Xi,$Xi,$T3
827 vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
829 vpxor $inout2,$inout1,$inout1
830 vpclmulqdq \$0x00,$HK,$T2,$T2
833 vpclmulqdq \$0x00,$Ii,$Xi,$Z1
834 vpclmulqdq \$0x11,$Ii,$Xi,$Z3
836 vpclmulqdq \$0x10,$HK,$T3,$Z2
837 vpxor $inout1,$Z3,$Z3
840 vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
# Final two-phase reduction of the second batch into Xi.
843 vmovdqu 0x10($const),$Hkey # .Lpoly
848 vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
849 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
852 vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
853 vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
# Write back Xi in big-endian byte order.
859 vpshufb ($const),$Xi,$Xi # .Lbswap_mask
860 vmovdqu $Xi,-0x40($Xip) # output Xi
# Win64 epilogue: restore callee-saved xmm6-xmm15 (offsets mirror the
# prologue saves above).
864 $code.=<<___ if ($win64);
865 movaps -0xd8(%rax),%xmm6
866 movaps -0xc8(%rax),%xmm7
867 movaps -0xb8(%rax),%xmm8
868 movaps -0xa8(%rax),%xmm9
869 movaps -0x98(%rax),%xmm10
870 movaps -0x88(%rax),%xmm11
871 movaps -0x78(%rax),%xmm12
872 movaps -0x68(%rax),%xmm13
873 movaps -0x58(%rax),%xmm14
874 movaps -0x48(%rax),%xmm15
# Unwind and return processed byte count in %rax.
883 lea (%rax),%rsp # restore %rsp
885 mov $ret,%rax # return value
887 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
# Constant tables referenced via $const (.Lbswap_mask base):
# +0x00 byte-swap mask, +0x10 GHASH reduction polynomial (.Lpoly),
# +0x20 .Lone_msb, +0x30 .Ltwo_lsb, +0x40 .Lone_lsb counter increments.
# NOTE(review): the labels themselves are on lines not visible here.
893 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
895 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
897 .byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
899 .byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
901 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
902 .asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
# ----------------------------------------------------------------------
# gcm_se_handler — Win64 structured-exception handler.  If the fault
# lies between the recorded prologue and epilogue labels it restores
# the callee-saved GP and xmm registers from the frame into CONTEXT,
# then chains to RtlVirtualUnwind.  NOTE(review): sampled view — the
# handler's own prologue/epilogue and branch targets are partly missing.
912 .extern __imp_RtlVirtualUnwind
913 .type gcm_se_handler,\@abi-omnipotent
927 mov 120($context),%rax # pull context->Rax
928 mov 248($context),%rbx # pull context->Rip
930 mov 8($disp),%rsi # disp->ImageBase
931 mov 56($disp),%r11 # disp->HandlerData
# HandlerData[0..1] hold image-relative prologue/epilogue labels.
933 mov 0(%r11),%r10d # HandlerData[0]
934 lea (%rsi,%r10),%r10 # prologue label
935 cmp %r10,%rbx # context->Rip<prologue label
938 mov 152($context),%rax # pull context->Rsp
940 mov 4(%r11),%r10d # HandlerData[1]
941 lea (%rsi,%r10),%r10 # epilogue label
942 cmp %r10,%rbx # context->Rip>=epilogue label
943 jae .Lcommon_seh_tail
945 mov 120($context),%rax # pull context->Rax
# In-body fault: report saved callee-saved GP registers to the unwinder.
953 mov %r15,240($context)
954 mov %r14,232($context)
955 mov %r13,224($context)
956 mov %r12,216($context)
957 mov %rbp,160($context)
958 mov %rbx,144($context)
# Copy the 10 saved xmm registers (xmm6..xmm15) back into CONTEXT.Xmm6.
960 lea -0xd8(%rax),%rsi # %xmm save area
961 lea 512($context),%rdi # & context.Xmm6
962 mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
963 .long 0xa548f3fc # cld; rep movsq
968 mov %rax,152($context) # restore context->Rsp
969 mov %rsi,168($context) # restore context->Rsi
970 mov %rdi,176($context) # restore context->Rdi
# Duplicate CONTEXT and invoke RtlVirtualUnwind to continue unwinding.
972 mov 40($disp),%rdi # disp->ContextRecord
973 mov $context,%rsi # context
974 mov \$154,%ecx # sizeof(CONTEXT)
975 .long 0xa548f3fc # cld; rep movsq
978 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
979 mov 8(%rsi),%rdx # arg2, disp->ImageBase
980 mov 0(%rsi),%r8 # arg3, disp->ControlPc
981 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
982 mov 40(%rsi),%r10 # disp->ContextRecord
983 lea 56(%rsi),%r11 # &disp->HandlerData
984 lea 24(%rsi),%r12 # &disp->EstablisherFrame
985 mov %r10,32(%rsp) # arg5
986 mov %r11,40(%rsp) # arg6
987 mov %r12,48(%rsp) # arg7
988 mov %rcx,56(%rsp) # arg8, (NULL)
989 call *__imp_RtlVirtualUnwind(%rip)
991 mov \$1,%eax # ExceptionContinueSearch
1003 .size gcm_se_handler,.-gcm_se_handler
# Win64 SEH pdata/xdata tables: map each function's begin/end range to
# its unwind-info record, and record prologue/epilogue labels as
# HandlerData for gcm_se_handler.
1007 .rva .LSEH_begin_aesni_gcm_decrypt
1008 .rva .LSEH_end_aesni_gcm_decrypt
1009 .rva .LSEH_gcm_dec_info
1011 .rva .LSEH_begin_aesni_gcm_encrypt
1012 .rva .LSEH_end_aesni_gcm_encrypt
1013 .rva .LSEH_gcm_enc_info
1019 .rva .Lgcm_dec_body,.Lgcm_dec_abort
1023 .rva .Lgcm_enc_body,.Lgcm_enc_abort
# Fallback when the assembler cannot emit AVX ($avx==0): replace the
# whole module with stub definitions of the two public entry points so
# callers can still link.  NOTE(review): stub bodies (e.g. the return
# sequence) are on lines not visible in this view.
1027 $code=<<___; # assembler is too old
1030 .globl aesni_gcm_encrypt
1031 .type aesni_gcm_encrypt,\@abi-omnipotent
1035 .size aesni_gcm_encrypt,.-aesni_gcm_encrypt
1037 .globl aesni_gcm_decrypt
1038 .type aesni_gcm_decrypt,\@abi-omnipotent
1042 .size aesni_gcm_decrypt,.-aesni_gcm_decrypt
# Post-process: evaluate `...` expressions embedded in $code (perlasm
# convention) before the result is printed to OUT.
1046 $code =~ s/\`([^\`]*)\`/eval($1)/gem;