2 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that
21 # it uses 256 bytes per-key table [+128 bytes shared table]. GHASH
22 # function features so called "528B" variant utilizing additional
23 # 256+16 bytes of per-key storage [+512 bytes shared table].
24 # Performance results are for this streamed GHASH subroutine and are
25 # expressed in cycles per processed byte, less is better:
27 # gcc 3.4.x(*) assembler
30 # Opteron 19.3 7.7 +150%
31 # Core2 17.8 8.1(**) +120%
33 # VIA Nano 21.8 10.1 +115%
35 # (*) comparison is not completely fair, because C results are
36 # for vanilla "256B" implementation, while assembler results
38 # (**) it's a mystery [to me] why Core2 result is not the same as for
43 # Add PCLMULQDQ version performing at 2.02 cycles per processed byte.
44 # See ghash-x86.pl for background information and details about coding
47 # Special thanks to David Woodhouse for providing access to a
48 # Westmere-based system on behalf of Intel Open Source Technology Centre.
52 # Overhaul: aggregate Karatsuba post-processing, improve ILP in
53 # reduction_alg9, increase reduction aggregate factor to 4x. As for
54 # the latter, ghash-x86.pl discusses that it makes lesser sense to
55 # increase aggregate factor. Then why increase here? Critical path
56 # consists of 3 independent pclmulqdq instructions, Karatsuba post-
57 # processing and reduction. "On top" of this we lay down aggregated
58 # multiplication operations, triplets of independent pclmulqdq's. As
59 # issue rate for pclmulqdq is limited, it makes lesser sense to
60 # aggregate more multiplications than it takes to perform remaining
61 # non-multiplication operations. 2x is near-optimal coefficient for
62 # contemporary Intel CPUs (therefore modest improvement coefficient),
63 # but not for Bulldozer. Latter is because logical SIMD operations
64 # are twice as slow in comparison to Intel, so that critical path is
65 # longer. A CPU with higher pclmulqdq issue rate would also benefit
66 # from higher aggregate factor...
69 # Sandy Bridge 1.80(+8%)
70 # Ivy Bridge 1.80(+7%)
71 # Haswell 0.55(+93%) (if system doesn't support AVX)
72 # Broadwell 0.45(+110%)(if system doesn't support AVX)
73 # Skylake 0.44(+110%)(if system doesn't support AVX)
74 # Bulldozer 1.49(+27%)
75 # Silvermont 2.88(+13%)
76 # Knights L 2.12(-) (if system doesn't support AVX)
81 # ... 8x aggregate factor AVX code path is using reduction algorithm
82 # suggested by Shay Gueron[1]. Even though contemporary AVX-capable
83 # CPUs such as Sandy and Ivy Bridge can execute it, the code performs
84 # sub-optimally in comparison to above mentioned version. But thanks
85 # to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
86 # it performs in 0.41 cycles per byte on Haswell processor, in
87 # 0.29 on Broadwell, and in 0.36 on Skylake.
89 # Knights Landing achieves 1.09 cpb.
91 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
93 # $output is the last argument if it looks like a file (it has an extension)
94 # $flavour is the first argument if it doesn't look like a file
95 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
96 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
98 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
100 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
101 ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
102 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
103 die "can't locate x86_64-xlate.pl";
105 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
106 =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
107 $avx = ($1>=2.20) + ($1>=2.22);
110 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
111 `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
112 $avx = ($1>=2.09) + ($1>=2.10);
115 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
116 `ml64 2>&1` =~ /Version ([0-9]+)\./) {
117 $avx = ($1>=10) + ($1>=11);
120 if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([0-9]+\.[0-9]+)/) {
121 $avx = ($2>=3.0) + ($2>3.0);
124 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
125 or die "can't call $xlate: $!";
130 # common register layout
141 # per-function register layout
# LB(): map a general-purpose register name to its low-byte (8-bit) alias,
# e.g. %eax/%rax -> %al, %esi/%rsi -> %sil, %ebp/%rbp -> %bpl,
# %r12 or %r12d -> %r12b.  Returns the input unchanged if no pattern matches.
# Callers invoke this as &LB(...), which bypasses the () prototype.
sub LB() { my $r=shift;
	   # ${1} is used in the replacement instead of the deprecated \1
	   # form, which would warn "\1 better written as $1" under warnings.
	   $r =~ s/%[er]([a-d])x/%${1}l/	or
	   $r =~ s/%[er]([sd]i)/%${1}l/	or
	   $r =~ s/%[er](bp)/%${1}l/	or
	   $r =~ s/%(r[0-9]+)[d]?/%${1}b/; $r; }
150 sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
151 { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
153 $arg = "\$$arg" if ($arg*1 eq $arg);
154 $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
165 mov `&LB("$Zlo")`,`&LB("$nlo")`
166 mov `&LB("$Zlo")`,`&LB("$nhi")`
167 shl \$4,`&LB("$nlo")`
169 mov 8($Htbl,$nlo),$Zlo
170 mov ($Htbl,$nlo),$Zhi
171 and \$0xf0,`&LB("$nhi")`
180 mov ($inp,$cnt),`&LB("$nlo")`
182 xor 8($Htbl,$nhi),$Zlo
184 xor ($Htbl,$nhi),$Zhi
185 mov `&LB("$nlo")`,`&LB("$nhi")`
186 xor ($rem_4bit,$rem,8),$Zhi
188 shl \$4,`&LB("$nlo")`
197 xor 8($Htbl,$nlo),$Zlo
199 xor ($Htbl,$nlo),$Zhi
200 and \$0xf0,`&LB("$nhi")`
201 xor ($rem_4bit,$rem,8),$Zhi
212 xor 8($Htbl,$nlo),$Zlo
214 xor ($Htbl,$nlo),$Zhi
215 and \$0xf0,`&LB("$nhi")`
216 xor ($rem_4bit,$rem,8),$Zhi
224 xor 8($Htbl,$nhi),$Zlo
226 xor ($Htbl,$nhi),$Zhi
228 xor ($rem_4bit,$rem,8),$Zhi
237 .extern OPENSSL_ia32cap_P
239 .globl gcm_gmult_4bit
240 .type gcm_gmult_4bit,\@function,2
247 push %rbp # %rbp and others are pushed exclusively in
249 push %r12 # order to reuse Win64 exception handler...
258 .cfi_adjust_cfa_offset 280
262 lea .Lrem_4bit(%rip),$rem_4bit
269 lea 280+48(%rsp),%rsi
274 .cfi_def_cfa_register %rsp
278 .size gcm_gmult_4bit,.-gcm_gmult_4bit
281 # per-function register layout
287 .globl gcm_ghash_4bit
288 .type gcm_ghash_4bit,\@function,4
306 .cfi_adjust_cfa_offset 280
308 mov $inp,%r14 # reassign couple of args
314 my @nhi=("%ebx","%ecx");
315 my @rem=("%r12","%r13");
318 &sub ($Htbl,-128); # size optimization
319 &lea ($Hshr4,"16+128(%rsp)");
320 { my @lo =($nlo,$nhi);
324 for ($i=0,$j=-2;$i<18;$i++,$j++) {
325 &mov ("$j(%rsp)",&LB($dat)) if ($i>1);
326 &or ($lo[0],$tmp) if ($i>1);
327 &mov (&LB($dat),&LB($lo[1])) if ($i>0 && $i<17);
328 &shr ($lo[1],4) if ($i>0 && $i<17);
329 &mov ($tmp,$hi[1]) if ($i>0 && $i<17);
330 &shr ($hi[1],4) if ($i>0 && $i<17);
331 &mov ("8*$j($Hshr4)",$hi[0]) if ($i>1);
332 &mov ($hi[0],"16*$i+0-128($Htbl)") if ($i<16);
333 &shl (&LB($dat),4) if ($i>0 && $i<17);
334 &mov ("8*$j-128($Hshr4)",$lo[0]) if ($i>1);
335 &mov ($lo[0],"16*$i+8-128($Htbl)") if ($i<16);
336 &shl ($tmp,60) if ($i>0 && $i<17);
338 push (@lo,shift(@lo));
339 push (@hi,shift(@hi));
343 &mov ($Zlo,"8($Xi)");
344 &mov ($Zhi,"0($Xi)");
345 &add ($len,$inp); # pointer to the end of data
346 &lea ($rem_8bit,".Lrem_8bit(%rip)");
347 &jmp (".Louter_loop");
349 $code.=".align 16\n.Louter_loop:\n";
350 &xor ($Zhi,"($inp)");
351 &mov ("%rdx","8($inp)");
352 &lea ($inp,"16($inp)");
355 &mov ("8($Xi)","%rdx");
360 &mov (&LB($nlo),&LB($dat));
361 &movz ($nhi[0],&LB($dat));
365 for ($j=11,$i=0;$i<15;$i++) {
367 &xor ($Zlo,"8($Htbl,$nlo)") if ($i>0);
368 &xor ($Zhi,"($Htbl,$nlo)") if ($i>0);
369 &mov ($Zlo,"8($Htbl,$nlo)") if ($i==0);
370 &mov ($Zhi,"($Htbl,$nlo)") if ($i==0);
372 &mov (&LB($nlo),&LB($dat));
373 &xor ($Zlo,$tmp) if ($i>0);
374 &movzw ($rem[1],"($rem_8bit,$rem[1],2)") if ($i>0);
376 &movz ($nhi[1],&LB($dat));
378 &movzb ($rem[0],"(%rsp,$nhi[0])");
380 &shr ($nhi[1],4) if ($i<14);
381 &and ($nhi[1],0xf0) if ($i==14);
382 &shl ($rem[1],48) if ($i>0);
386 &xor ($Zhi,$rem[1]) if ($i>0);
389 &movz ($rem[0],&LB($rem[0]));
390 &mov ($dat,"$j($Xi)") if (--$j%4==0);
393 &xor ($Zlo,"-128($Hshr4,$nhi[0],8)");
395 &xor ($Zhi,"($Hshr4,$nhi[0],8)");
397 unshift (@nhi,pop(@nhi)); # "rotate" registers
398 unshift (@rem,pop(@rem));
400 &movzw ($rem[1],"($rem_8bit,$rem[1],2)");
401 &xor ($Zlo,"8($Htbl,$nlo)");
402 &xor ($Zhi,"($Htbl,$nlo)");
408 &movz ($rem[0],&LB($Zlo));
412 &shl (&LB($rem[0]),4);
415 &xor ($Zlo,"8($Htbl,$nhi[0])");
416 &movzw ($rem[0],"($rem_8bit,$rem[0],2)");
419 &xor ($Zhi,"($Htbl,$nhi[0])");
428 &jb (".Louter_loop");
434 lea 280+48(%rsp),%rsi
449 .cfi_def_cfa_register %rsp
453 .size gcm_ghash_4bit,.-gcm_ghash_4bit
456 ######################################################################
459 @_4args=$win64? ("%rcx","%rdx","%r8", "%r9") : # Win64 order
460 ("%rdi","%rsi","%rdx","%rcx"); # Unix order
462 ($Xi,$Xhi)=("%xmm0","%xmm1"); $Hkey="%xmm2";
463 ($T1,$T2,$T3)=("%xmm3","%xmm4","%xmm5");
465 sub clmul64x64_T2 { # minimal register pressure
466 my ($Xhi,$Xi,$Hkey,$HK)=@_;
468 if (!defined($HK)) { $HK = $T2;
471 pshufd \$0b01001110,$Xi,$T1
472 pshufd \$0b01001110,$Hkey,$T2
479 pshufd \$0b01001110,$Xi,$T1
484 pclmulqdq \$0x00,$Hkey,$Xi #######
485 pclmulqdq \$0x11,$Hkey,$Xhi #######
486 pclmulqdq \$0x00,$HK,$T1 #######
498 sub reduction_alg9 { # 17/11 times faster than Intel version
528 { my ($Htbl,$Xip)=@_4args;
532 .globl gcm_init_clmul
533 .type gcm_init_clmul,\@abi-omnipotent
539 $code.=<<___ if ($win64);
540 .LSEH_begin_gcm_init_clmul:
541 # I can't trust assembler to use specific encoding:-(
542 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
543 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
547 pshufd \$0b01001110,$Hkey,$Hkey # dword swap
550 pshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
555 pcmpgtd $T2,$T3 # broadcast carry bit
557 por $T1,$Hkey # H<<=1
560 pand .L0x1c2_polynomial(%rip),$T3
561 pxor $T3,$Hkey # if(carry) H^=0x1c2_polynomial
564 pshufd \$0b01001110,$Hkey,$HK
568 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK);
569 &reduction_alg9 ($Xhi,$Xi);
571 pshufd \$0b01001110,$Hkey,$T1
572 pshufd \$0b01001110,$Xi,$T2
573 pxor $Hkey,$T1 # Karatsuba pre-processing
574 movdqu $Hkey,0x00($Htbl) # save H
575 pxor $Xi,$T2 # Karatsuba pre-processing
576 movdqu $Xi,0x10($Htbl) # save H^2
577 palignr \$8,$T1,$T2 # low part is H.lo^H.hi...
578 movdqu $T2,0x20($Htbl) # save Karatsuba "salt"
581 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^3
582 &reduction_alg9 ($Xhi,$Xi);
586 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H^4
587 &reduction_alg9 ($Xhi,$Xi);
589 pshufd \$0b01001110,$T3,$T1
590 pshufd \$0b01001110,$Xi,$T2
591 pxor $T3,$T1 # Karatsuba pre-processing
592 movdqu $T3,0x30($Htbl) # save H^3
593 pxor $Xi,$T2 # Karatsuba pre-processing
594 movdqu $Xi,0x40($Htbl) # save H^4
595 palignr \$8,$T1,$T2 # low part is H^3.lo^H^3.hi...
596 movdqu $T2,0x50($Htbl) # save Karatsuba "salt"
599 $code.=<<___ if ($win64);
602 .LSEH_end_gcm_init_clmul:
607 .size gcm_init_clmul,.-gcm_init_clmul
611 { my ($Xip,$Htbl)=@_4args;
614 .globl gcm_gmult_clmul
615 .type gcm_gmult_clmul,\@abi-omnipotent
622 movdqa .Lbswap_mask(%rip),$T3
624 movdqu 0x20($Htbl),$T2
627 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
628 $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
629 # experimental alternative. special thing about is that there
630 # no dependency between the two multiplications...
632 mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
636 movq %r11,$T3 # borrow $T3
638 pshufb $T3,$T2 # ($Xi&7)·0xE0
640 pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
643 paddd $T2,$T2 # <<(64+56+1)
645 pclmulqdq \$0x01,$T3,$Xi
646 movdqa .Lbswap_mask(%rip),$T3 # reload $T3
657 .size gcm_gmult_clmul,.-gcm_gmult_clmul
661 { my ($Xip,$Htbl,$inp,$len)=@_4args;
662 my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
663 my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
666 .globl gcm_ghash_clmul
667 .type gcm_ghash_clmul,\@abi-omnipotent
674 $code.=<<___ if ($win64);
676 .LSEH_begin_gcm_ghash_clmul:
677 # I can't trust assembler to use specific encoding:-(
678 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
679 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
680 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
681 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
682 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
683 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
684 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
685 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
686 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
687 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
688 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
691 movdqa .Lbswap_mask(%rip),$T3
695 movdqu 0x20($Htbl),$HK
701 movdqu 0x10($Htbl),$Hkey2
704 my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
707 mov OPENSSL_ia32cap_P+4(%rip),%eax
711 and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
712 cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
716 mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
717 movdqu 0x30($Htbl),$Hkey3
718 movdqu 0x40($Htbl),$Hkey4
721 # Xi+4 =[(H*Ii+3) + (H^2*Ii+2) + (H^3*Ii+1) + H^4*(Ii+Xi)] mod P
723 movdqu 0x30($inp),$Xln
724 movdqu 0x20($inp),$Xl
728 pshufd \$0b01001110,$Xln,$Xmn
730 pclmulqdq \$0x00,$Hkey,$Xln
731 pclmulqdq \$0x11,$Hkey,$Xhn
732 pclmulqdq \$0x00,$HK,$Xmn
735 pshufd \$0b01001110,$Xl,$Xm
737 pclmulqdq \$0x00,$Hkey2,$Xl
738 pclmulqdq \$0x11,$Hkey2,$Xh
739 pclmulqdq \$0x10,$HK,$Xm
742 movups 0x50($Htbl),$HK
745 movdqu 0x10($inp),$Xl
750 pshufd \$0b01001110,$Xl,$Xm
753 pclmulqdq \$0x00,$Hkey3,$Xl
755 pshufd \$0b01001110,$Xi,$T1
757 pclmulqdq \$0x11,$Hkey3,$Xh
758 pclmulqdq \$0x00,$HK,$Xm
769 pclmulqdq \$0x00,$Hkey4,$Xi
771 movdqu 0x30($inp),$Xl
773 pclmulqdq \$0x11,$Hkey4,$Xhi
775 movdqu 0x20($inp),$Xln
777 pclmulqdq \$0x10,$HK,$T1
778 pshufd \$0b01001110,$Xl,$Xm
782 movups 0x20($Htbl),$HK
784 pclmulqdq \$0x00,$Hkey,$Xl
785 pshufd \$0b01001110,$Xln,$Xmn
787 pxor $Xi,$T1 # aggregated Karatsuba post-processing
792 pclmulqdq \$0x11,$Hkey,$Xh
796 movdqa .L7_mask(%rip),$T1
800 pand $Xi,$T1 # 1st phase
803 pclmulqdq \$0x00,$HK,$Xm
807 pclmulqdq \$0x00,$Hkey2,$Xln
813 movdqa $Xi,$T2 # 2nd phase
815 pclmulqdq \$0x11,$Hkey2,$Xhn
817 movdqu 0x10($inp),$Xl
819 pclmulqdq \$0x10,$HK,$Xmn
821 movups 0x50($Htbl),$HK
829 pshufd \$0b01001110,$Xl,$Xm
833 pclmulqdq \$0x00,$Hkey3,$Xl
837 pclmulqdq \$0x11,$Hkey3,$Xh
839 pshufd \$0b01001110,$Xi,$T1
842 pclmulqdq \$0x00,$HK,$Xm
850 pclmulqdq \$0x00,$Hkey4,$Xi
851 pclmulqdq \$0x11,$Hkey4,$Xhi
852 pclmulqdq \$0x10,$HK,$T1
856 pxor $Xi,$Xhi # aggregated Karatsuba post-processing
868 &reduction_alg9($Xhi,$Xi);
872 movdqu 0x20($Htbl),$HK
880 # Xi+2 =[H*(Ii+1 + Xi+1)] mod P =
881 # [(H*Ii+1) + (H*Xi+1)] mod P =
882 # [(H*Ii+1) + H^2*(Ii+Xi)] mod P
884 movdqu ($inp),$T1 # Ii
885 movdqu 16($inp),$Xln # Ii+1
891 pshufd \$0b01001110,$Xln,$Xmn
893 pclmulqdq \$0x00,$Hkey,$Xln
894 pclmulqdq \$0x11,$Hkey,$Xhn
895 pclmulqdq \$0x00,$HK,$Xmn
897 lea 32($inp),$inp # i+=2
908 pshufd \$0b01001110,$Xi,$Xmn #
911 pclmulqdq \$0x00,$Hkey2,$Xi
912 pclmulqdq \$0x11,$Hkey2,$Xhi
913 pclmulqdq \$0x10,$HK,$Xmn
915 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
917 movdqu ($inp),$T2 # Ii
918 pxor $Xi,$T1 # aggregated Karatsuba post-processing
920 movdqu 16($inp),$Xln # Ii+1
923 pxor $T2,$Xhi # "Ii+Xi", consume early
934 movdqa $Xi,$T2 # 1st phase
938 pclmulqdq \$0x00,$Hkey,$Xln #######
946 pshufd \$0b01001110,$Xhn,$Xmn
950 movdqa $Xi,$T2 # 2nd phase
952 pclmulqdq \$0x11,$Hkey,$Xhn #######
959 pclmulqdq \$0x00,$HK,$Xmn #######
968 pshufd \$0b01001110,$Xi,$Xmn #
971 pclmulqdq \$0x00,$Hkey2,$Xi
972 pclmulqdq \$0x11,$Hkey2,$Xhi
973 pclmulqdq \$0x10,$HK,$Xmn
975 pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
986 &reduction_alg9 ($Xhi,$Xi);
992 movdqu ($inp),$T1 # Ii
996 &clmul64x64_T2 ($Xhi,$Xi,$Hkey,$HK); # H*(Ii+Xi)
997 &reduction_alg9 ($Xhi,$Xi);
1003 $code.=<<___ if ($win64);
1005 movaps 0x10(%rsp),%xmm7
1006 movaps 0x20(%rsp),%xmm8
1007 movaps 0x30(%rsp),%xmm9
1008 movaps 0x40(%rsp),%xmm10
1009 movaps 0x50(%rsp),%xmm11
1010 movaps 0x60(%rsp),%xmm12
1011 movaps 0x70(%rsp),%xmm13
1012 movaps 0x80(%rsp),%xmm14
1013 movaps 0x90(%rsp),%xmm15
1015 .LSEH_end_gcm_ghash_clmul:
1020 .size gcm_ghash_clmul,.-gcm_ghash_clmul
1026 .type gcm_init_avx,\@abi-omnipotent
1032 my ($Htbl,$Xip)=@_4args;
1035 $code.=<<___ if ($win64);
1036 .LSEH_begin_gcm_init_avx:
1037 # I can't trust assembler to use specific encoding:-(
1038 .byte 0x48,0x83,0xec,0x18 #sub $0x18,%rsp
1039 .byte 0x0f,0x29,0x34,0x24 #movaps %xmm6,(%rsp)
1044 vmovdqu ($Xip),$Hkey
1045 vpshufd \$0b01001110,$Hkey,$Hkey # dword swap
1048 vpshufd \$0b11111111,$Hkey,$T2 # broadcast uppermost dword
1049 vpsrlq \$63,$Hkey,$T1
1050 vpsllq \$1,$Hkey,$Hkey
1052 vpcmpgtd $T2,$T3,$T3 # broadcast carry bit
1054 vpor $T1,$Hkey,$Hkey # H<<=1
1057 vpand .L0x1c2_polynomial(%rip),$T3,$T3
1058 vpxor $T3,$Hkey,$Hkey # if(carry) H^=0x1c2_polynomial
1060 vpunpckhqdq $Hkey,$Hkey,$HK
1063 mov \$4,%r10 # up to H^8
1064 jmp .Linit_start_avx
1067 sub clmul64x64_avx {
1068 my ($Xhi,$Xi,$Hkey,$HK)=@_;
1070 if (!defined($HK)) { $HK = $T2;
1072 vpunpckhqdq $Xi,$Xi,$T1
1073 vpunpckhqdq $Hkey,$Hkey,$T2
1079 vpunpckhqdq $Xi,$Xi,$T1
1084 vpclmulqdq \$0x11,$Hkey,$Xi,$Xhi #######
1085 vpclmulqdq \$0x00,$Hkey,$Xi,$Xi #######
1086 vpclmulqdq \$0x00,$HK,$T1,$T1 #######
1087 vpxor $Xi,$Xhi,$T2 #
1090 vpslldq \$8,$T1,$T2 #
1101 vpsllq \$57,$Xi,$T1 # 1st phase
1106 vpslldq \$8,$T2,$T1 #
1111 vpsrlq \$1,$Xi,$T2 # 2nd phase
1116 vpsrlq \$1,$Xi,$Xi #
1117 vpxor $Xhi,$Xi,$Xi #
1124 vpalignr \$8,$T1,$T2,$T3 # low part is H.lo^H.hi...
1125 vmovdqu $T3,-0x10($Htbl) # save Karatsuba "salt"
1127 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^3,5,7
1128 &reduction_avx ($Xhi,$Xi);
1133 &clmul64x64_avx ($Xhi,$Xi,$Hkey,$HK); # calculate H^2,4,6,8
1134 &reduction_avx ($Xhi,$Xi);
1136 vpshufd \$0b01001110,$T3,$T1
1137 vpshufd \$0b01001110,$Xi,$T2
1138 vpxor $T3,$T1,$T1 # Karatsuba pre-processing
1139 vmovdqu $T3,0x00($Htbl) # save H^1,3,5,7
1140 vpxor $Xi,$T2,$T2 # Karatsuba pre-processing
1141 vmovdqu $Xi,0x10($Htbl) # save H^2,4,6,8
1142 lea 0x30($Htbl),$Htbl
1146 vpalignr \$8,$T2,$T1,$T3 # last "salt" is flipped
1147 vmovdqu $T3,-0x10($Htbl)
1151 $code.=<<___ if ($win64);
1154 .LSEH_end_gcm_init_avx:
1159 .size gcm_init_avx,.-gcm_init_avx
1165 .size gcm_init_avx,.-gcm_init_avx
1170 .globl gcm_gmult_avx
1171 .type gcm_gmult_avx,\@abi-omnipotent
1178 .size gcm_gmult_avx,.-gcm_gmult_avx
1182 .globl gcm_ghash_avx
1183 .type gcm_ghash_avx,\@abi-omnipotent
1190 my ($Xip,$Htbl,$inp,$len)=@_4args;
1194 $Xi,$Xo,$Tred,$bswap,$Ii,$Ij) = map("%xmm$_",(0..15));
1196 $code.=<<___ if ($win64);
1197 lea -0x88(%rsp),%rax
1198 .LSEH_begin_gcm_ghash_avx:
1199 # I can't trust assembler to use specific encoding:-(
1200 .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
1201 .byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6,-0x20(%rax)
1202 .byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7,-0x10(%rax)
1203 .byte 0x44,0x0f,0x29,0x00 #movaps %xmm8,0(%rax)
1204 .byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9,0x10(%rax)
1205 .byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10,0x20(%rax)
1206 .byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11,0x30(%rax)
1207 .byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12,0x40(%rax)
1208 .byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13,0x50(%rax)
1209 .byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14,0x60(%rax)
1210 .byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15,0x70(%rax)
1215 vmovdqu ($Xip),$Xi # load $Xi
1216 lea .L0x1c2_polynomial(%rip),%r10
1217 lea 0x40($Htbl),$Htbl # size optimization
1218 vmovdqu .Lbswap_mask(%rip),$bswap
1219 vpshufb $bswap,$Xi,$Xi
1224 vmovdqu 0x70($inp),$Ii # I[7]
1225 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1226 vpshufb $bswap,$Ii,$Ii
1227 vmovdqu 0x20-0x40($Htbl),$HK
1229 vpunpckhqdq $Ii,$Ii,$T2
1230 vmovdqu 0x60($inp),$Ij # I[6]
1231 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1233 vpshufb $bswap,$Ij,$Ij
1234 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1235 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1236 vpunpckhqdq $Ij,$Ij,$T1
1237 vmovdqu 0x50($inp),$Ii # I[5]
1238 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1241 vpshufb $bswap,$Ii,$Ii
1242 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1243 vpunpckhqdq $Ii,$Ii,$T2
1244 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1245 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1247 vmovdqu 0x40($inp),$Ij # I[4]
1248 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1249 vmovdqu 0x50-0x40($Htbl),$HK
1251 vpshufb $bswap,$Ij,$Ij
1252 vpxor $Xlo,$Zlo,$Zlo
1253 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1254 vpxor $Xhi,$Zhi,$Zhi
1255 vpunpckhqdq $Ij,$Ij,$T1
1256 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1257 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1258 vpxor $Xmi,$Zmi,$Zmi
1259 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1262 vmovdqu 0x30($inp),$Ii # I[3]
1263 vpxor $Zlo,$Xlo,$Xlo
1264 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1265 vpxor $Zhi,$Xhi,$Xhi
1266 vpshufb $bswap,$Ii,$Ii
1267 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1268 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1269 vpxor $Zmi,$Xmi,$Xmi
1270 vpunpckhqdq $Ii,$Ii,$T2
1271 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1272 vmovdqu 0x80-0x40($Htbl),$HK
1275 vmovdqu 0x20($inp),$Ij # I[2]
1276 vpxor $Xlo,$Zlo,$Zlo
1277 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1278 vpxor $Xhi,$Zhi,$Zhi
1279 vpshufb $bswap,$Ij,$Ij
1280 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1281 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1282 vpxor $Xmi,$Zmi,$Zmi
1283 vpunpckhqdq $Ij,$Ij,$T1
1284 vpclmulqdq \$0x00,$HK,$T2,$Xmi
1287 vmovdqu 0x10($inp),$Ii # I[1]
1288 vpxor $Zlo,$Xlo,$Xlo
1289 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1290 vpxor $Zhi,$Xhi,$Xhi
1291 vpshufb $bswap,$Ii,$Ii
1292 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1293 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1294 vpxor $Zmi,$Xmi,$Xmi
1295 vpunpckhqdq $Ii,$Ii,$T2
1296 vpclmulqdq \$0x10,$HK,$T1,$Zmi
1297 vmovdqu 0xb0-0x40($Htbl),$HK
1300 vmovdqu ($inp),$Ij # I[0]
1301 vpxor $Xlo,$Zlo,$Zlo
1302 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1303 vpxor $Xhi,$Zhi,$Zhi
1304 vpshufb $bswap,$Ij,$Ij
1305 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1306 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1307 vpxor $Xmi,$Zmi,$Zmi
1308 vpclmulqdq \$0x10,$HK,$T2,$Xmi
1314 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1320 vpunpckhqdq $Ij,$Ij,$T1
1321 vmovdqu 0x70($inp),$Ii # I[7]
1322 vpxor $Xlo,$Zlo,$Zlo
1324 vpclmulqdq \$0x00,$Hkey,$Ij,$Xi
1325 vpshufb $bswap,$Ii,$Ii
1326 vpxor $Xhi,$Zhi,$Zhi
1327 vpclmulqdq \$0x11,$Hkey,$Ij,$Xo
1328 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1329 vpunpckhqdq $Ii,$Ii,$T2
1330 vpxor $Xmi,$Zmi,$Zmi
1331 vpclmulqdq \$0x00,$HK,$T1,$Tred
1332 vmovdqu 0x20-0x40($Htbl),$HK
1335 vmovdqu 0x60($inp),$Ij # I[6]
1336 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1337 vpxor $Zlo,$Xi,$Xi # collect result
1338 vpshufb $bswap,$Ij,$Ij
1339 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1341 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1342 vpunpckhqdq $Ij,$Ij,$T1
1343 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1344 vpxor $Zmi,$Tred,$Tred
1347 vmovdqu 0x50($inp),$Ii # I[5]
1348 vpxor $Xi,$Tred,$Tred # aggregated Karatsuba post-processing
1349 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1350 vpxor $Xo,$Tred,$Tred
1351 vpslldq \$8,$Tred,$T2
1352 vpxor $Xlo,$Zlo,$Zlo
1353 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1354 vpsrldq \$8,$Tred,$Tred
1356 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1357 vpshufb $bswap,$Ii,$Ii
1358 vxorps $Tred,$Xo, $Xo
1359 vpxor $Xhi,$Zhi,$Zhi
1360 vpunpckhqdq $Ii,$Ii,$T2
1361 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1362 vmovdqu 0x50-0x40($Htbl),$HK
1364 vpxor $Xmi,$Zmi,$Zmi
1366 vmovdqu 0x40($inp),$Ij # I[4]
1367 vpalignr \$8,$Xi,$Xi,$Tred # 1st phase
1368 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1369 vpshufb $bswap,$Ij,$Ij
1370 vpxor $Zlo,$Xlo,$Xlo
1371 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1372 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1373 vpunpckhqdq $Ij,$Ij,$T1
1374 vpxor $Zhi,$Xhi,$Xhi
1375 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1377 vpxor $Zmi,$Xmi,$Xmi
1379 vmovdqu 0x30($inp),$Ii # I[3]
1380 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1381 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1382 vpshufb $bswap,$Ii,$Ii
1383 vpxor $Xlo,$Zlo,$Zlo
1384 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1385 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1386 vpunpckhqdq $Ii,$Ii,$T2
1387 vpxor $Xhi,$Zhi,$Zhi
1388 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1389 vmovdqu 0x80-0x40($Htbl),$HK
1391 vpxor $Xmi,$Zmi,$Zmi
1393 vmovdqu 0x20($inp),$Ij # I[2]
1394 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1395 vpshufb $bswap,$Ij,$Ij
1396 vpxor $Zlo,$Xlo,$Xlo
1397 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1398 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1399 vpunpckhqdq $Ij,$Ij,$T1
1400 vpxor $Zhi,$Xhi,$Xhi
1401 vpclmulqdq \$0x00,$HK, $T2,$Xmi
1403 vpxor $Zmi,$Xmi,$Xmi
1404 vxorps $Tred,$Xi,$Xi
1406 vmovdqu 0x10($inp),$Ii # I[1]
1407 vpalignr \$8,$Xi,$Xi,$Tred # 2nd phase
1408 vpclmulqdq \$0x00,$Hkey,$Ij,$Zlo
1409 vpshufb $bswap,$Ii,$Ii
1410 vpxor $Xlo,$Zlo,$Zlo
1411 vpclmulqdq \$0x11,$Hkey,$Ij,$Zhi
1412 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1413 vpclmulqdq \$0x10,(%r10),$Xi,$Xi
1414 vxorps $Xo,$Tred,$Tred
1415 vpunpckhqdq $Ii,$Ii,$T2
1416 vpxor $Xhi,$Zhi,$Zhi
1417 vpclmulqdq \$0x10,$HK, $T1,$Zmi
1418 vmovdqu 0xb0-0x40($Htbl),$HK
1420 vpxor $Xmi,$Zmi,$Zmi
1422 vmovdqu ($inp),$Ij # I[0]
1423 vpclmulqdq \$0x00,$Hkey,$Ii,$Xlo
1424 vpshufb $bswap,$Ij,$Ij
1425 vpclmulqdq \$0x11,$Hkey,$Ii,$Xhi
1426 vmovdqu 0xa0-0x40($Htbl),$Hkey # $Hkey^8
1428 vpclmulqdq \$0x10,$HK, $T2,$Xmi
1429 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1436 jmp .Ltail_no_xor_avx
1440 vmovdqu -0x10($inp,$len),$Ii # very last word
1441 lea ($inp,$len),$inp
1442 vmovdqu 0x00-0x40($Htbl),$Hkey # $Hkey^1
1443 vmovdqu 0x20-0x40($Htbl),$HK
1444 vpshufb $bswap,$Ii,$Ij
1446 vmovdqa $Xlo,$Zlo # subtle way to zero $Zlo,
1447 vmovdqa $Xhi,$Zhi # $Zhi and
1448 vmovdqa $Xmi,$Zmi # $Zmi
1452 vpunpckhqdq $Ij,$Ij,$T1
1453 vpxor $Xlo,$Zlo,$Zlo
1454 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1456 vmovdqu -0x20($inp),$Ii
1457 vpxor $Xhi,$Zhi,$Zhi
1458 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1459 vmovdqu 0x10-0x40($Htbl),$Hkey # $Hkey^2
1460 vpshufb $bswap,$Ii,$Ij
1461 vpxor $Xmi,$Zmi,$Zmi
1462 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1467 vpunpckhqdq $Ij,$Ij,$T1
1468 vpxor $Xlo,$Zlo,$Zlo
1469 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1471 vmovdqu -0x30($inp),$Ii
1472 vpxor $Xhi,$Zhi,$Zhi
1473 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1474 vmovdqu 0x30-0x40($Htbl),$Hkey # $Hkey^3
1475 vpshufb $bswap,$Ii,$Ij
1476 vpxor $Xmi,$Zmi,$Zmi
1477 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1478 vmovdqu 0x50-0x40($Htbl),$HK
1482 vpunpckhqdq $Ij,$Ij,$T1
1483 vpxor $Xlo,$Zlo,$Zlo
1484 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1486 vmovdqu -0x40($inp),$Ii
1487 vpxor $Xhi,$Zhi,$Zhi
1488 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1489 vmovdqu 0x40-0x40($Htbl),$Hkey # $Hkey^4
1490 vpshufb $bswap,$Ii,$Ij
1491 vpxor $Xmi,$Zmi,$Zmi
1492 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1497 vpunpckhqdq $Ij,$Ij,$T1
1498 vpxor $Xlo,$Zlo,$Zlo
1499 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1501 vmovdqu -0x50($inp),$Ii
1502 vpxor $Xhi,$Zhi,$Zhi
1503 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1504 vmovdqu 0x60-0x40($Htbl),$Hkey # $Hkey^5
1505 vpshufb $bswap,$Ii,$Ij
1506 vpxor $Xmi,$Zmi,$Zmi
1507 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1508 vmovdqu 0x80-0x40($Htbl),$HK
1512 vpunpckhqdq $Ij,$Ij,$T1
1513 vpxor $Xlo,$Zlo,$Zlo
1514 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1516 vmovdqu -0x60($inp),$Ii
1517 vpxor $Xhi,$Zhi,$Zhi
1518 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1519 vmovdqu 0x70-0x40($Htbl),$Hkey # $Hkey^6
1520 vpshufb $bswap,$Ii,$Ij
1521 vpxor $Xmi,$Zmi,$Zmi
1522 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1527 vpunpckhqdq $Ij,$Ij,$T1
1528 vpxor $Xlo,$Zlo,$Zlo
1529 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1531 vmovdqu -0x70($inp),$Ii
1532 vpxor $Xhi,$Zhi,$Zhi
1533 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1534 vmovdqu 0x90-0x40($Htbl),$Hkey # $Hkey^7
1535 vpshufb $bswap,$Ii,$Ij
1536 vpxor $Xmi,$Zmi,$Zmi
1537 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1538 vmovq 0xb8-0x40($Htbl),$HK
1544 vpxor $Xi,$Ij,$Ij # accumulate $Xi
1546 vpunpckhqdq $Ij,$Ij,$T1
1547 vpxor $Xlo,$Zlo,$Zlo
1548 vpclmulqdq \$0x00,$Hkey,$Ij,$Xlo
1550 vpxor $Xhi,$Zhi,$Zhi
1551 vpclmulqdq \$0x11,$Hkey,$Ij,$Xhi
1552 vpxor $Xmi,$Zmi,$Zmi
1553 vpclmulqdq \$0x00,$HK,$T1,$Xmi
1555 vmovdqu (%r10),$Tred
1559 vpxor $Xmi,$Zmi,$Zmi
1561 vpxor $Xi, $Zmi,$Zmi # aggregated Karatsuba post-processing
1562 vpxor $Xo, $Zmi,$Zmi
1563 vpslldq \$8, $Zmi,$T2
1564 vpsrldq \$8, $Zmi,$Zmi
1568 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 1st phase
1569 vpalignr \$8,$Xi,$Xi,$Xi
1572 vpclmulqdq \$0x10,$Tred,$Xi,$T2 # 2nd phase
1573 vpalignr \$8,$Xi,$Xi,$Xi
1580 vpshufb $bswap,$Xi,$Xi
1584 $code.=<<___ if ($win64);
1586 movaps 0x10(%rsp),%xmm7
1587 movaps 0x20(%rsp),%xmm8
1588 movaps 0x30(%rsp),%xmm9
1589 movaps 0x40(%rsp),%xmm10
1590 movaps 0x50(%rsp),%xmm11
1591 movaps 0x60(%rsp),%xmm12
1592 movaps 0x70(%rsp),%xmm13
1593 movaps 0x80(%rsp),%xmm14
1594 movaps 0x90(%rsp),%xmm15
1596 .LSEH_end_gcm_ghash_avx:
1601 .size gcm_ghash_avx,.-gcm_ghash_avx
1607 .size gcm_ghash_avx,.-gcm_ghash_avx
1614 .byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
1616 .byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
1620 .long 7,0,`0xE1<<1`,0
1622 .type .Lrem_4bit,\@object
1624 .long 0,`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`
1625 .long 0,`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`
1626 .long 0,`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`
1627 .long 0,`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`
1628 .type .Lrem_8bit,\@object
1630 .value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E
1631 .value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E
1632 .value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E
1633 .value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E
1634 .value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E
1635 .value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E
1636 .value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E
1637 .value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E
1638 .value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE
1639 .value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE
1640 .value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE
1641 .value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE
1642 .value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E
1643 .value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E
1644 .value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE
1645 .value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE
1646 .value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E
1647 .value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E
1648 .value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E
1649 .value 0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E
1650 .value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E
1651 .value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E
1652 .value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E
1653 .value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E
1654 .value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE
1655 .value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE
1656 .value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE
1657 .value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE
1658 .value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E
1659 .value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E
1660 .value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE
1661 .value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE
1663 .asciz "GHASH for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
1667 # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
1668 # CONTEXT *context,DISPATCHER_CONTEXT *disp)
1676 .extern __imp_RtlVirtualUnwind
1677 .type se_handler,\@abi-omnipotent
1691 mov 120($context),%rax # pull context->Rax
1692 mov 248($context),%rbx # pull context->Rip
1694 mov 8($disp),%rsi # disp->ImageBase
1695 mov 56($disp),%r11 # disp->HandlerData
1697 mov 0(%r11),%r10d # HandlerData[0]
1698 lea (%rsi,%r10),%r10 # prologue label
1699 cmp %r10,%rbx # context->Rip<prologue label
1702 mov 152($context),%rax # pull context->Rsp
1704 mov 4(%r11),%r10d # HandlerData[1]
1705 lea (%rsi,%r10),%r10 # epilogue label
1706 cmp %r10,%rbx # context->Rip>=epilogue label
1709 lea 48+280(%rax),%rax # adjust "rsp"
1717 mov %rbx,144($context) # restore context->Rbx
1718 mov %rbp,160($context) # restore context->Rbp
1719 mov %r12,216($context) # restore context->R12
1720 mov %r13,224($context) # restore context->R13
1721 mov %r14,232($context) # restore context->R14
1722 mov %r15,240($context) # restore context->R15
1727 mov %rax,152($context) # restore context->Rsp
1728 mov %rsi,168($context) # restore context->Rsi
1729 mov %rdi,176($context) # restore context->Rdi
1731 mov 40($disp),%rdi # disp->ContextRecord
1732 mov $context,%rsi # context
1733 mov \$`1232/8`,%ecx # sizeof(CONTEXT)
1734 .long 0xa548f3fc # cld; rep movsq
1737 xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
1738 mov 8(%rsi),%rdx # arg2, disp->ImageBase
1739 mov 0(%rsi),%r8 # arg3, disp->ControlPc
1740 mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
1741 mov 40(%rsi),%r10 # disp->ContextRecord
1742 lea 56(%rsi),%r11 # &disp->HandlerData
1743 lea 24(%rsi),%r12 # &disp->EstablisherFrame
1744 mov %r10,32(%rsp) # arg5
1745 mov %r11,40(%rsp) # arg6
1746 mov %r12,48(%rsp) # arg7
1747 mov %rcx,56(%rsp) # arg8, (NULL)
1748 call *__imp_RtlVirtualUnwind(%rip)
1750 mov \$1,%eax # ExceptionContinueSearch
1762 .size se_handler,.-se_handler
1766 .rva .LSEH_begin_gcm_gmult_4bit
1767 .rva .LSEH_end_gcm_gmult_4bit
1768 .rva .LSEH_info_gcm_gmult_4bit
1770 .rva .LSEH_begin_gcm_ghash_4bit
1771 .rva .LSEH_end_gcm_ghash_4bit
1772 .rva .LSEH_info_gcm_ghash_4bit
1774 .rva .LSEH_begin_gcm_init_clmul
1775 .rva .LSEH_end_gcm_init_clmul
1776 .rva .LSEH_info_gcm_init_clmul
1778 .rva .LSEH_begin_gcm_ghash_clmul
1779 .rva .LSEH_end_gcm_ghash_clmul
1780 .rva .LSEH_info_gcm_ghash_clmul
1782 $code.=<<___ if ($avx);
1783 .rva .LSEH_begin_gcm_init_avx
1784 .rva .LSEH_end_gcm_init_avx
1785 .rva .LSEH_info_gcm_init_clmul
1787 .rva .LSEH_begin_gcm_ghash_avx
1788 .rva .LSEH_end_gcm_ghash_avx
1789 .rva .LSEH_info_gcm_ghash_clmul
1794 .LSEH_info_gcm_gmult_4bit:
1797 .rva .Lgmult_prologue,.Lgmult_epilogue # HandlerData
1798 .LSEH_info_gcm_ghash_4bit:
1801 .rva .Lghash_prologue,.Lghash_epilogue # HandlerData
1802 .LSEH_info_gcm_init_clmul:
1803 .byte 0x01,0x08,0x03,0x00
1804 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1805 .byte 0x04,0x22,0x00,0x00 #sub rsp,0x18
1806 .LSEH_info_gcm_ghash_clmul:
1807 .byte 0x01,0x33,0x16,0x00
1808 .byte 0x33,0xf8,0x09,0x00 #movaps 0x90(rsp),xmm15
1809 .byte 0x2e,0xe8,0x08,0x00 #movaps 0x80(rsp),xmm14
1810 .byte 0x29,0xd8,0x07,0x00 #movaps 0x70(rsp),xmm13
1811 .byte 0x24,0xc8,0x06,0x00 #movaps 0x60(rsp),xmm12
1812 .byte 0x1f,0xb8,0x05,0x00 #movaps 0x50(rsp),xmm11
1813 .byte 0x1a,0xa8,0x04,0x00 #movaps 0x40(rsp),xmm10
1814 .byte 0x15,0x98,0x03,0x00 #movaps 0x30(rsp),xmm9
1815 .byte 0x10,0x88,0x02,0x00 #movaps 0x20(rsp),xmm8
1816 .byte 0x0c,0x78,0x01,0x00 #movaps 0x10(rsp),xmm7
1817 .byte 0x08,0x68,0x00,0x00 #movaps 0x00(rsp),xmm6
1818 .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
1822 $code =~ s/\`([^\`]*)\`/eval($1)/gem;
1826 close STDOUT or die "error closing STDOUT: $!";