1 # Copyright 2021-2023 The OpenSSL Project Authors. All Rights Reserved.
2 # Copyright (c) 2021, Intel Corporation. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # This implementation is based on the AES-GCM code (AVX512VAES + VPCLMULQDQ)
11 # from Intel(R) Multi-Buffer Crypto for IPsec Library v1.1
12 # (https://github.com/intel/intel-ipsec-mb).
13 # Original author is Tomasz Kantecki <tomasz.kantecki@intel.com>.
16 # [1] Vinodh Gopal et. al. Optimized Galois-Counter-Mode Implementation on
17 # Intel Architecture Processors. August, 2010.
18 # [2] Erdinc Ozturk et. al. Enabling High-Performance Galois-Counter-Mode on
19 # Intel Architecture Processors. October, 2012.
20 # [3] Shay Gueron et. al. Intel Carry-Less Multiplication Instruction and its
21 # Usage for Computing the GCM Mode. May, 2010.
28 # GCM128_CONTEXT structure has storage for 16 hkeys only, but this
29 # implementation can use up to 48. To avoid extending the context size,
30 # precompute and store in the context first 16 hkeys only, and compute the rest
31 # on demand keeping them in the local frame.
33 #======================================================================
34 # $output is the last argument if it looks like a file (it has an extension)
35 # $flavour is the first argument if it doesn't look like a file
36 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
37 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Windows targets are recognised either by flavour (nasm/masm/mingw64) or by
# an explicit .asm output file name.
40 $win64 = 1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
# Locate the perlasm translator relative to this script's own path; the match
# below captures the directory part of $0 ($dir assignment appears elided in
# this listing -- confirm against upstream).
44 $0 =~ m/(.*[\/\\])[^\/\\]+$/;
46 ($xlate = "${dir}x86_64-xlate.pl" and -f $xlate)
47 or ($xlate = "${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate)
48 or die "can't locate x86_64-xlate.pl";
# Probe the toolchain for VAES/VPCLMULQDQ-capable assemblers: GNU as,
# nasm, or (Apple) clang/LLVM.  $avx512vaes gates emission of the AVX512
# code paths further below.
# NOTE(review): the elsif opener for the nasm branch (between the two
# version checks) is elided from this sampled listing.
50 if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1` =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
51 $avx512vaes = ($1 >= 2.30);
56 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/)
57 && `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)(?:\.([0-9]+))?/)
59 $avx512vaes = ($1 == 2.13 && $2 >= 3) + ($1 >= 2.14);
62 if (!$avx512vaes && `$ENV{CC} -v 2>&1`
63 =~ /(Apple)?\s*((?:clang|LLVM) version|.*based on LLVM) ([0-9]+)\.([0-9]+)\.([0-9]+)?/) {
64 my $ver = $3 + $4/100.0 + $5/10000.0; # 3.1.0->3.01, 3.10.1->3.1001
66 # Apple conditions, they use a different version series, see
67 # https://en.wikipedia.org/wiki/Xcode#Xcode_7.0_-_10.x_(since_Free_On-Device_Development)_2
68 # clang 7.0.0 is Apple clang 10.0.1
69 $avx512vaes = ($ver>=10.0001)
71 $avx512vaes = ($ver>=7.0);
# Pipe the generated perlasm source through the translator into $output.
75 open OUT, "| \"$^X\" \"$xlate\" $flavour \"$output\""
76 or die "can't call $xlate: $!";
79 #======================================================================
80 if ($avx512vaes>0) { #<<<
83 .extern OPENSSL_ia32cap_P
84 .globl ossl_vaes_vpclmulqdq_capable
85 .type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
87 ossl_vaes_vpclmulqdq_capable:
88 mov OPENSSL_ia32cap_P+8(%rip), %rcx
89 # avx512vpclmulqdq + avx512vaes + avx512vl + avx512bw + avx512dq + avx512f
90 mov \$`1<<42|1<<41|1<<31|1<<30|1<<17|1<<16`,%rdx
96 .size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
99 # ; Mapping key length -> AES rounds count
105 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
106 # ;;; Code generation control switches
107 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
109 # ; ABI-aware zeroing of volatile registers in EPILOG().
110 # ; Disabled due to performance reasons.
111 my $CLEAR_SCRATCH_REGISTERS = 0;
113 # ; Zero HKeys storage from the stack if they are stored there
114 my $CLEAR_HKEYS_STORAGE_ON_EXIT = 1;
116 # ; Enable / disable check of function arguments for null pointer
117 # ; Currently disabled, as this check is handled outside.
118 my $CHECK_FUNCTION_ARGUMENTS = 0;
120 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
121 # ;;; Global constants
122 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
124 # AES block size in bytes
125 my $AES_BLOCK_SIZE = 16;
127 # Storage capacity in elements
128 my $HKEYS_STORAGE_CAPACITY = 48;
129 my $LOCAL_STORAGE_CAPACITY = 48;
130 my $HKEYS_CONTEXT_CAPACITY = 16;
132 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
133 # ;;; Stack frame definition
134 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
136 # (1) -> +64(Win)/+48(Lin)-byte space for pushed GPRs
137 # (2) -> +8-byte space for 16-byte alignment of XMM storage
138 # (3) -> Frame pointer (%RBP)
139 # (4) -> +160-byte XMM storage (Windows only, zero on Linux)
140 # (5) -> +48-byte space for 64-byte alignment of %RSP from p.8
141 # (6) -> +768-byte LOCAL storage (optional, can be omitted in some functions)
142 # (7) -> +768-byte HKEYS storage
143 # (8) -> Stack pointer (%RSP) aligned on 64-byte boundary
# Sizes below are derived from the capacities above; Windows additionally
# saves rdi/rsi (hence 8 GPR slots vs 6) and xmm6-xmm15 (10 x 16 bytes).
145 my $GP_STORAGE = $win64 ? 8 * 8 : 8 * 6; # ; space for saved non-volatile GP registers (pushed on stack)
146 my $XMM_STORAGE = $win64 ? (10 * 16) : 0; # ; space for saved XMM registers
147 my $HKEYS_STORAGE = ($HKEYS_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for HKeys^i, i=1..48
148 my $LOCAL_STORAGE = ($LOCAL_STORAGE_CAPACITY * $AES_BLOCK_SIZE); # ; space for up to 48 AES blocks
# HKeys area sits at the bottom of the dynamic allocation; LOCAL follows it.
150 my $STACK_HKEYS_OFFSET = 0;
151 my $STACK_LOCAL_OFFSET = ($STACK_HKEYS_OFFSET + $HKEYS_STORAGE);
153 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
154 # ;;; Function arguments abstraction
155 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
156 my ($arg1, $arg2, $arg3, $arg4, $arg5, $arg6, $arg7, $arg8, $arg9, $arg10, $arg11);
158 # ; Counter used for assembly label generation
161 # ; This implementation follows the convention: for non-leaf functions (they
162 # ; must call PROLOG) %rbp is used as a frame pointer, and has fixed offset from
163 # ; the function entry: $GP_STORAGE + [8 bytes alignment (Windows only)]. This
164 # ; helps to facilitate SEH handlers writing.
166 # ; Leaf functions here do not use more than 4 input arguments.
# NOTE(review): the $win64 if/else that selects between the two groups of
# stack-argument expressions below, and the register assignments for
# $arg1..$arg4, are elided from this sampled listing -- confirm upstream.
# Windows ABI branch: args 5+ are read off the caller stack via %rbp
# (the extra +8 accounts for the alignment slot noted in (2) above).
172 $arg5 = "`$GP_STORAGE + 8 + 8*5`(%rbp)"; # +8 - alignment bytes
173 $arg6 = "`$GP_STORAGE + 8 + 8*6`(%rbp)";
174 $arg7 = "`$GP_STORAGE + 8 + 8*7`(%rbp)";
175 $arg8 = "`$GP_STORAGE + 8 + 8*8`(%rbp)";
176 $arg9 = "`$GP_STORAGE + 8 + 8*9`(%rbp)";
177 $arg10 = "`$GP_STORAGE + 8 + 8*10`(%rbp)";
178 $arg11 = "`$GP_STORAGE + 8 + 8*11`(%rbp)";
# Presumably the non-Windows (SysV) branch: args 7+ on the caller stack,
# no alignment slot -- verify against upstream.
186 $arg7 = "`$GP_STORAGE + 8*1`(%rbp)";
187 $arg8 = "`$GP_STORAGE + 8*2`(%rbp)";
188 $arg9 = "`$GP_STORAGE + 8*3`(%rbp)";
189 $arg10 = "`$GP_STORAGE + 8*4`(%rbp)";
190 $arg11 = "`$GP_STORAGE + 8*5`(%rbp)";
193 # ; Offsets in gcm128_context structure (see include/crypto/modes.h)
194 my $CTX_OFFSET_CurCount = (16 * 0); # ; (Yi) Current counter for generation of encryption key
195 my $CTX_OFFSET_PEncBlock = (16 * 1); # ; (repurposed EKi field) Partial block buffer
196 my $CTX_OFFSET_EK0 = (16 * 2); # ; (EK0) Encrypted Y0 counter (see gcm spec notation)
197 my $CTX_OFFSET_AadLen = (16 * 3); # ; (len.u[0]) Length of Hash which has been input
198 my $CTX_OFFSET_InLen = ((16 * 3) + 8); # ; (len.u[1]) Length of input data which will be encrypted or decrypted
199 my $CTX_OFFSET_AadHash = (16 * 4); # ; (Xi) Current hash
200 my $CTX_OFFSET_HTable = (16 * 6); # ; (Htable) Precomputed table (allows 16 values)
202 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
203 # ;;; Helper functions
204 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# The six helpers below rewrite a 64-bit register name into the requested
# narrower (or GP->SIMD-width) form via regex substitution, die-ing on any
# name they do not recognise.
# NOTE(review): the `sub NAME {`, `my ($reg) = @_;`, closing-brace and
# `return $reg;` lines of each helper are elided from this sampled listing.
# BYTE(): %rax -> %al, %rsi -> %sil, %r10 -> %r10b, etc.
208 if ($reg =~ /%r[abcd]x/i) {
209 $reg =~ s/%r([abcd])x/%${1}l/i;
210 } elsif ($reg =~ /%r[sdb][ip]/i) {
211 $reg =~ s/%r([sdb][ip])/%${1}l/i;
212 } elsif ($reg =~ /%r[0-9]{1,2}/i) {
213 $reg =~ s/%(r[0-9]{1,2})/%${1}b/i;
215 die "BYTE: unknown register: $reg\n";
# WORD(): 16-bit form -- legacy regs keep their base name, %rN -> %rNw.
222 if ($reg =~ /%r[abcdsdb][xip]/i) {
223 $reg =~ s/%r([abcdsdb])([xip])/%${1}${2}/i;
224 } elsif ($reg =~ /%r[0-9]{1,2}/) {
225 $reg =~ s/%(r[0-9]{1,2})/%${1}w/i;
227 die "WORD: unknown register: $reg\n";
# DWORD(): 32-bit form -- %rax -> %eax, %r10 -> %r10d.
234 if ($reg =~ /%r[abcdsdb][xip]/i) {
235 $reg =~ s/%r([abcdsdb])([xip])/%e${1}${2}/i;
236 } elsif ($reg =~ /%r[0-9]{1,2}/i) {
237 $reg =~ s/%(r[0-9]{1,2})/%${1}d/i;
239 die "DWORD: unknown register: $reg\n";
# XWORD(): any SIMD register -> its 128-bit %xmm alias.
246 if ($reg =~ /%[xyz]mm/i) {
247 $reg =~ s/%[xyz]mm/%xmm/i;
249 die "XWORD: unknown register: $reg\n";
# YWORD(): any SIMD register -> its 256-bit %ymm alias.
256 if ($reg =~ /%[xyz]mm/i) {
257 $reg =~ s/%[xyz]mm/%ymm/i;
259 die "YWORD: unknown register: $reg\n";
# ZWORD(): any SIMD register -> its 512-bit %zmm alias.
266 if ($reg =~ /%[xyz]mm/i) {
267 $reg =~ s/%[xyz]mm/%zmm/i;
269 die "ZWORD: unknown register: $reg\n";
274 # ; Helper function to construct effective address based on two kinds of
275 # ; offsets: numerical or located in the register
276 sub EffectiveAddress {
277 my ($base, $offset, $displacement) = @_;
278 $displacement = 0 if (!$displacement);
# A purely numeric offset is folded into the displacement at translation
# time (perlasm backticks); a register offset produces base+index*1
# addressing with the displacement kept separate.
280 if ($offset =~ /^\d+\z/) { # numerical offset
281 return "`$offset + $displacement`($base)";
282 } else { # offset resides in register
283 return "$displacement($base,$offset,1)";
287 # ; Provides memory location of corresponding HashKey power
# Returns an "offset(base)" operand string for HashKey^idx.  Keys live
# either in the local stack frame (%rsp) or in the GCM128_CONTEXT Htable;
# the base register selects which storage layout to use.
# NOTE(review): the `sub HashKeyByIdx {` opener is elided from this listing.
289 my ($idx, $base) = @_;
290 my $base_str = ($base eq "%rsp") ? "frame" : "context";
292 my $offset = &HashKeyOffsetByIdx($idx, $base_str);
293 return "$offset($base)";
296 # ; Provides offset (in bytes) of corresponding HashKey power from the highest key in the storage
297 sub HashKeyOffsetByIdx {
298 my ($idx, $base) = @_;
299 die "HashKeyOffsetByIdx: base should be either 'frame' or 'context'; base = $base"
300 if (($base ne "frame") && ($base ne "context"));
# Keys are stored highest power first: HashKey^idx sits
# (CAPACITY - idx) AES blocks from the start of the selected storage.
# Frame storage holds up to 48 keys, context (Htable) only 16.
304 if ($base eq "frame") { # frame storage
305 die "HashKeyOffsetByIdx: idx out of bounds (1..48)! idx = $idx\n" if ($idx > $HKEYS_STORAGE_CAPACITY || $idx < 1);
306 $offset_base = $STACK_HKEYS_OFFSET;
307 $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_STORAGE_CAPACITY - $idx));
308 } else { # context storage
309 die "HashKeyOffsetByIdx: idx out of bounds (1..16)! idx = $idx\n" if ($idx > $HKEYS_CONTEXT_CAPACITY || $idx < 1);
310 $offset_base = $CTX_OFFSET_HTable;
311 $offset_idx = ($AES_BLOCK_SIZE * ($HKEYS_CONTEXT_CAPACITY - $idx));
313 return $offset_base + $offset_idx;
316 # ; Creates local frame and does back up of non-volatile registers.
317 # ; Holds stack unwinding directives.
# PROLOG($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name):
# emitted at the top of every non-leaf generated function.  Pushes the
# non-volatile GPRs, saves xmm6-xmm15 (Windows only), establishes %rbp as
# the SEH frame pointer and finally performs the "dynamic" allocation for
# the optional HKeys/AES local areas plus 64-byte alignment slack.
# NOTE(review): the `sub PROLOG {` opener, the heredoc markers and several
# emitted push/if spans are elided from this sampled listing.
319 my ($need_hkeys_stack_storage, $need_aes_stack_storage, $func_name) = @_;
321 my $DYNAMIC_STACK_ALLOC_SIZE = 0;
# Extra slack so %rsp can be rounded down to a 64-byte boundary afterwards
# (see item (5) of the stack frame definition above).
322 my $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE = $win64 ? 48 : 52;
324 if ($need_hkeys_stack_storage) {
325 $DYNAMIC_STACK_ALLOC_SIZE += $HKEYS_STORAGE;
# AES local storage is laid out above the HKeys area, so it cannot be
# requested on its own.
328 if ($need_aes_stack_storage) {
329 if (!$need_hkeys_stack_storage) {
330 die "PROLOG: unsupported case - aes storage without hkeys one";
332 $DYNAMIC_STACK_ALLOC_SIZE += $LOCAL_STORAGE;
338 .L${func_name}_seh_push_rbx:
341 .L${func_name}_seh_push_rbp:
344 .L${func_name}_seh_push_r12:
347 .L${func_name}_seh_push_r13:
350 .L${func_name}_seh_push_r14:
353 .L${func_name}_seh_push_r15:
359 .L${func_name}_seh_push_rdi:
361 .L${func_name}_seh_push_rsi:
363 sub \$`$XMM_STORAGE+8`,%rsp # +8 alignment
364 .L${func_name}_seh_allocstack_xmm:
368 # ; %rbp contains stack pointer right after GP regs pushed at stack + [8
369 # ; bytes of alignment (Windows only)]. It serves as a frame pointer in SEH
370 # ; handlers. The requirement for a frame pointer is that its offset from
371 # ; RSP shall be multiple of 16, and not exceed 240 bytes. The frame pointer
372 # ; itself seems to be reasonable to use here, because later we do 64-byte stack
373 # ; alignment which gives us non-determinate offsets and complicates writing
376 # ; It also serves as an anchor for retrieving stack arguments on both Linux
378 lea `$XMM_STORAGE`(%rsp),%rbp
379 .cfi_def_cfa_register %rbp
380 .L${func_name}_seh_setfp:
384 # ; xmm6:xmm15 need to be preserved on Windows
# Spill the ten callee-saved XMM registers into the reserved area, one
# SEH label per save so the unwinder can restore them.
385 foreach my $reg_idx (6 .. 15) {
386 my $xmm_reg_offset = ($reg_idx - 6) * 16;
388 vmovdqu %xmm${reg_idx},$xmm_reg_offset(%rsp)
389 .L${func_name}_seh_save_xmm${reg_idx}:
395 # Prolog ends here. Next stack allocation is treated as "dynamic".
396 .L${func_name}_seh_prolog_end:
399 if ($DYNAMIC_STACK_ALLOC_SIZE) {
401 sub \$`$DYNAMIC_STACK_ALLOC_SIZE + $DYNAMIC_STACK_ALLOC_ALIGNMENT_SPACE`,%rsp
407 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
408 # ;;; Restore register content for the caller.
409 # ;;; And cleanup stack.
410 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# EPILOG($hkeys_storage_on_stack, $payload_len): optionally wipes the
# on-stack HKeys area, restores XMM (Windows) and GP registers, and
# rebuilds %rsp from the %rbp frame pointer so the emitted epilog is
# SEH-conformant.  NOTE(review): the `sub EPILOG {` opener, heredoc
# markers and the GPR-pop span are elided from this sampled listing.
412 my ($hkeys_storage_on_stack, $payload_len) = @_;
414 my $label_suffix = $label_count++;
416 if ($hkeys_storage_on_stack && $CLEAR_HKEYS_STORAGE_ON_EXIT) {
418 # ; There is no need in hkeys cleanup if payload len was small, i.e. no hkeys
419 # ; were stored in the local frame storage
421 cmpq \$`16*16`,$payload_len
422 jbe .Lskip_hkeys_cleanup_${label_suffix}
423 vpxor %xmm0,%xmm0,%xmm0
# Wipe the 768-byte HKeys area 64 bytes (one ZMM) at a time.
425 for (my $i = 0; $i < int($HKEYS_STORAGE / 64); $i++) {
426 $code .= "vmovdqa64 %zmm0,`$STACK_HKEYS_OFFSET + 64*$i`(%rsp)\n";
428 $code .= ".Lskip_hkeys_cleanup_${label_suffix}:\n";
431 if ($CLEAR_SCRATCH_REGISTERS) {
432 &clear_scratch_gps_asm();
433 &clear_scratch_zmms_asm();
# Cheap alternative when full scratch clearing is disabled: just exit
# AVX state to avoid SSE/AVX transition penalties in the caller.
435 $code .= "vzeroupper\n";
440 # ; restore xmm15:xmm6
441 for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
442 my $xmm_reg_offset = -$XMM_STORAGE + ($reg_idx - 6) * 16;
# NOTE(review): stray trailing comma after %xmm${reg_idx} on the emitted
# line below -- upstream has no trailing comma here; verify before use.
444 vmovdqu $xmm_reg_offset(%rbp),%xmm${reg_idx},
451 # Forming valid epilog for SEH with use of frame pointer.
452 # https://docs.microsoft.com/en-us/cpp/build/prolog-and-epilog?view=msvc-160#epilog-code
# Windows: skip the 8-byte alignment slot above %rbp; Linux has no such
# slot, so %rsp is restored to %rbp directly (branch selector elided).
453 $code .= "lea 8(%rbp),%rsp\n";
455 $code .= "lea (%rbp),%rsp\n";
456 $code .= ".cfi_def_cfa_register %rsp\n";
483 # ; Clears all scratch ZMM registers
485 # ; It should be called before restoring the XMM registers
486 # ; for Windows (XMM6-XMM15).
488 sub clear_scratch_zmms_asm {
490 # ; On Linux, all ZMM registers are scratch registers
# Non-Windows path: vzeroall wipes ymm0-ymm15 in one instruction.
# NOTE(review): the $win64 branch structure around the three emit paths
# below is elided from this sampled listing.
492 $code .= "vzeroall\n";
# Windows path: only xmm0-xmm5 are volatile among the low registers...
494 foreach my $i (0 .. 5) {
495 $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
# ...while zmm16-zmm31 have no callee-saved XMM aliases and are always
# scratch (zeroing the xmm alias clears the full zmm under AVX512).
498 foreach my $i (16 .. 31) {
499 $code .= "vpxorq %xmm${i},%xmm${i},%xmm${i}\n";
503 # Clears all scratch GP registers
504 sub clear_scratch_gps_asm {
# Registers volatile on both ABIs are always cleared...
505 foreach my $reg ("%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11") {
506 $code .= "xor $reg,$reg\n";
# ...rsi/rdi are callee-saved on Windows, so presumably this second loop
# runs only on non-Windows (the guarding branch is elided in this listing).
509 foreach my $reg ("%rsi", "%rdi") {
510 $code .= "xor $reg,$reg\n";
515 sub precompute_hkeys_on_stack {
# Copies HashKey powers 1..16 from the context into the stack frame and/or
# extends them up to HashKey^48 via GHASH_MUL, skipping all work when
# $HKEYS_READY is non-zero.  $HKEYS_RANGE selects which slice to prepare.
516 my $GCM128_CTX = $_[0];
517 my $HKEYS_READY = $_[1];
# NOTE(review): parameters $_[2]..$_[8] (the $ZTMP0..$ZTMP6 clobbered
# temporaries used below) are elided from this sampled listing.
525 my $HKEYS_RANGE = $_[9]; # ; "first16", "mid16", "all", "first32", "last32"
527 die "precompute_hkeys_on_stack: Unexpected value of HKEYS_RANGE: $HKEYS_RANGE"
528 if ($HKEYS_RANGE ne "first16"
529 && $HKEYS_RANGE ne "mid16"
530 && $HKEYS_RANGE ne "all"
531 && $HKEYS_RANGE ne "first32"
532 && $HKEYS_RANGE ne "last32");
534 my $label_suffix = $label_count++;
537 test $HKEYS_READY,$HKEYS_READY
538 jnz .L_skip_hkeys_precomputation_${label_suffix}
541 if ($HKEYS_RANGE eq "first16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "all") {
543 # ; Fill the stack with the first 16 hkeys from the context
545 # ; Move 16 hkeys from the context to stack
546 vmovdqu64 @{[HashKeyByIdx(4,$GCM128_CTX)]},$ZTMP0
547 vmovdqu64 $ZTMP0,@{[HashKeyByIdx(4,"%rsp")]}
549 vmovdqu64 @{[HashKeyByIdx(8,$GCM128_CTX)]},$ZTMP1
550 vmovdqu64 $ZTMP1,@{[HashKeyByIdx(8,"%rsp")]}
552 # ; broadcast HashKey^8
553 vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
555 vmovdqu64 @{[HashKeyByIdx(12,$GCM128_CTX)]},$ZTMP2
556 vmovdqu64 $ZTMP2,@{[HashKeyByIdx(12,"%rsp")]}
558 vmovdqu64 @{[HashKeyByIdx(16,$GCM128_CTX)]},$ZTMP3
559 vmovdqu64 $ZTMP3,@{[HashKeyByIdx(16,"%rsp")]}
# Reload path: the first 16 keys are already on the stack, only the
# multiplication inputs (key^8 broadcast, keys 9..16) must be refetched.
563 if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "last32") {
565 vmovdqu64 @{[HashKeyByIdx(8,"%rsp")]},$ZTMP1
567 # ; broadcast HashKey^8
568 vshufi64x2 \$0x00,$ZTMP1,$ZTMP1,$ZTMP1
570 vmovdqu64 @{[HashKeyByIdx(12,"%rsp")]},$ZTMP2
571 vmovdqu64 @{[HashKeyByIdx(16,"%rsp")]},$ZTMP3
576 if ($HKEYS_RANGE eq "mid16" || $HKEYS_RANGE eq "first32" || $HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
578 # ; Precompute hkeys^i, i=17..32
# Each loop iteration multiplies the running key blocks by the broadcast
# HashKey^8, producing 8 further powers; $i bookkeeping lines are elided.
580 foreach (1 .. int((32 - 16) / 8)) {
582 # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
583 &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
584 $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
587 # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
588 &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
589 $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
594 if ($HKEYS_RANGE eq "last32" || $HKEYS_RANGE eq "all") {
596 # ; Precompute hkeys^i, i=33..48 (HKEYS_STORAGE_CAPACITY = 48)
598 foreach (1 .. int((48 - 32) / 8)) {
600 # ;; compute HashKey^(4 + n), HashKey^(3 + n), ... HashKey^(1 + n)
601 &GHASH_MUL($ZTMP2, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
602 $code .= "vmovdqu64 $ZTMP2,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
605 # ;; compute HashKey^(8 + n), HashKey^(7 + n), ... HashKey^(5 + n)
606 &GHASH_MUL($ZTMP3, $ZTMP1, $ZTMP4, $ZTMP5, $ZTMP6);
607 $code .= "vmovdqu64 $ZTMP3,@{[HashKeyByIdx($i,\"%rsp\")]}\n";
612 $code .= ".L_skip_hkeys_precomputation_${label_suffix}:\n";
615 # ;; =============================================================================
616 # ;; Generic macro to produce code that executes $OPCODE instruction
617 # ;; on selected number of AES blocks (16 bytes long ) between 0 and 16.
618 # ;; All three operands of the instruction come from registers.
619 # ;; Note: if 3 blocks are left at the end instruction is produced to operate all
620 # ;; 4 blocks (full width of ZMM)
621 sub ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16 {
622 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
623 my $OPCODE = $_[1]; # [in] instruction name
# NOTE(review): the `my (@DST, @SRC1, @SRC2);` declarations and the
# $reg_idx/$blocks_left loop bookkeeping lines are elided in this listing.
625 $DST[0] = $_[2]; # [out] destination ZMM register
626 $DST[1] = $_[3]; # [out] destination ZMM register
627 $DST[2] = $_[4]; # [out] destination ZMM register
628 $DST[3] = $_[5]; # [out] destination ZMM register
630 $SRC1[0] = $_[6]; # [in] source 1 ZMM register
631 $SRC1[1] = $_[7]; # [in] source 1 ZMM register
632 $SRC1[2] = $_[8]; # [in] source 1 ZMM register
633 $SRC1[3] = $_[9]; # [in] source 1 ZMM register
635 $SRC2[0] = $_[10]; # [in] source 2 ZMM register
636 $SRC2[1] = $_[11]; # [in] source 2 ZMM register
637 $SRC2[2] = $_[12]; # [in] source 2 ZMM register
638 $SRC2[3] = $_[13]; # [in] source 2 ZMM register
640 die "ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
641 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
644 my $blocks_left = $NUM_BLOCKS;
# Emit one full-width ZMM instruction per complete group of 4 blocks.
646 foreach (1 .. ($NUM_BLOCKS / 4)) {
647 $code .= "$OPCODE $SRC2[$reg_idx],$SRC1[$reg_idx],$DST[$reg_idx]\n";
# Tail: 1 block -> xmm width, 2 -> ymm, 3 -> full zmm (see header note).
652 my $DSTREG = $DST[$reg_idx];
653 my $SRC1REG = $SRC1[$reg_idx];
654 my $SRC2REG = $SRC2[$reg_idx];
656 if ($blocks_left == 1) {
657 $code .= "$OPCODE @{[XWORD($SRC2REG)]},@{[XWORD($SRC1REG)]},@{[XWORD($DSTREG)]}\n";
658 } elsif ($blocks_left == 2) {
659 $code .= "$OPCODE @{[YWORD($SRC2REG)]},@{[YWORD($SRC1REG)]},@{[YWORD($DSTREG)]}\n";
660 } elsif ($blocks_left == 3) {
661 $code .= "$OPCODE $SRC2REG,$SRC1REG,$DSTREG\n";
665 # ;; =============================================================================
666 # ;; Loads specified number of AES blocks into ZMM registers using mask register
667 # ;; for the last loaded register (xmm, ymm or zmm).
668 # ;; Loads take place at 1 byte granularity.
669 sub ZMM_LOAD_MASKED_BLOCKS_0_16 {
670 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
671 my $INP = $_[1]; # [in] input data pointer to read from
672 my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
# NOTE(review): the `my @DST;` declaration and the $src_offset/$dst_idx
# bookkeeping lines of the loops below are elided in this sampled listing.
674 $DST[0] = $_[3]; # [out] ZMM register with loaded data
675 $DST[1] = $_[4]; # [out] ZMM register with loaded data
676 $DST[2] = $_[5]; # [out] ZMM register with loaded data
677 $DST[3] = $_[6]; # [out] ZMM register with loaded data
678 my $MASK = $_[7]; # [in] mask register
680 die "ZMM_LOAD_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
681 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
685 my $blocks_left = $NUM_BLOCKS;
# All but the final register are full unmasked 64-byte loads.
687 if ($NUM_BLOCKS > 0) {
688 foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
689 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DST[$dst_idx]\n";
# Final register: width chosen by remaining block count, masked with {z}
# so bytes beyond the mask are zeroed rather than left stale.
696 my $DSTREG = $DST[$dst_idx];
698 if ($blocks_left == 1) {
699 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[XWORD($DSTREG)]}\{$MASK\}{z}\n";
700 } elsif ($blocks_left == 2) {
701 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},@{[YWORD($DSTREG)]}\{$MASK\}{z}\n";
702 } elsif (($blocks_left == 3 || $blocks_left == 4)) {
703 $code .= "vmovdqu8 @{[EffectiveAddress($INP,$DATA_OFFSET,$src_offset)]},$DSTREG\{$MASK\}{z}\n";
707 # ;; =============================================================================
708 # ;; Stores specified number of AES blocks from ZMM registers with mask register
709 # ;; for the last loaded register (xmm, ymm or zmm).
710 # ;; Stores take place at 1 byte granularity.
711 sub ZMM_STORE_MASKED_BLOCKS_0_16 {
712 my $NUM_BLOCKS = $_[0]; # [in] numerical value, number of AES blocks (0 to 16)
713 my $OUTP = $_[1]; # [in] output data pointer to write to
714 my $DATA_OFFSET = $_[2]; # [in] offset to the output pointer (GP or numerical)
# NOTE(review): the `my @SRC;` declaration and the $dst_offset/$src_idx
# bookkeeping lines of the loops below are elided in this sampled listing.
716 $SRC[0] = $_[3]; # [in] ZMM register with data to store
717 $SRC[1] = $_[4]; # [in] ZMM register with data to store
718 $SRC[2] = $_[5]; # [in] ZMM register with data to store
719 $SRC[3] = $_[6]; # [in] ZMM register with data to store
720 my $MASK = $_[7]; # [in] mask register
722 die "ZMM_STORE_MASKED_BLOCKS_0_16: num_blocks is out of bounds = $NUM_BLOCKS\n"
723 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
727 my $blocks_left = $NUM_BLOCKS;
# All but the final register are full unmasked 64-byte stores.
729 if ($NUM_BLOCKS > 0) {
730 foreach (1 .. (int(($NUM_BLOCKS + 3) / 4) - 1)) {
731 $code .= "vmovdqu8 $SRC[$src_idx],`$dst_offset`($OUTP,$DATA_OFFSET,1)\n";
# Final register: width chosen by remaining block count; the mask limits
# which bytes actually reach memory (no {z} on stores).
738 my $SRCREG = $SRC[$src_idx];
740 if ($blocks_left == 1) {
741 $code .= "vmovdqu8 @{[XWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
742 } elsif ($blocks_left == 2) {
743 $code .= "vmovdqu8 @{[YWORD($SRCREG)]},`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
744 } elsif ($blocks_left == 3 || $blocks_left == 4) {
745 $code .= "vmovdqu8 $SRCREG,`$dst_offset`($OUTP,$DATA_OFFSET,1){$MASK}\n";
749 # ;;; ===========================================================================
750 # ;;; Handles AES encryption rounds
751 # ;;; It handles special cases: the last and first rounds
752 # ;;; Optionally, it performs XOR with data after the last AES round.
753 # ;;; Uses NROUNDS parameter to check what needs to be done for the current round.
754 # ;;; If 3 blocks are trailing then operation on whole ZMM is performed (4 blocks).
755 sub ZMM_AESENC_ROUND_BLOCKS_0_16 {
756 my $L0B0_3 = $_[0]; # [in/out] zmm; blocks 0 to 3
757 my $L0B4_7 = $_[1]; # [in/out] zmm; blocks 4 to 7
758 my $L0B8_11 = $_[2]; # [in/out] zmm; blocks 8 to 11
759 my $L0B12_15 = $_[3]; # [in/out] zmm; blocks 12 to 15
760 my $KEY = $_[4]; # [in] zmm containing round key
761 my $ROUND = $_[5]; # [in] round number
762 my $D0_3 = $_[6]; # [in] zmm or no_data; plain/cipher text blocks 0-3
763 my $D4_7 = $_[7]; # [in] zmm or no_data; plain/cipher text blocks 4-7
764 my $D8_11 = $_[8]; # [in] zmm or no_data; plain/cipher text blocks 8-11
765 my $D12_15 = $_[9]; # [in] zmm or no_data; plain/cipher text blocks 12-15
766 my $NUMBL = $_[10]; # [in] number of blocks; numerical value
767 my $NROUNDS = $_[11]; # [in] number of rounds; numerical value
769 # ;;; === first AES round
# Round 0 is the initial AddRoundKey (plain XOR with the round-0 key).
# NOTE(review): the guard condition selecting this first-round branch
# (presumably `if ($ROUND < 1)`) is elided in this sampled listing.
773 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
774 $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
775 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
778 # ;;; === middle AES rounds
779 if ($ROUND >= 1 && $ROUND <= $NROUNDS) {
781 # ;; rounds 1 to 9/11/13
782 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
783 $NUMBL, "vaesenc", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
784 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
787 # ;;; === last AES round
788 if ($ROUND > $NROUNDS) {
790 # ;; the last round - mix enclast with text xor's
791 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
792 $NUMBL, "vaesenclast", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
793 $L0B4_7, $L0B8_11, $L0B12_15, $KEY, $KEY, $KEY, $KEY);
795 # ;;; === XOR with data
# CTR-mode combine step: only performed when all four data operands are
# real registers (callers pass "no_data" to skip the XOR).
796 if ( ($D0_3 ne "no_data")
797 && ($D4_7 ne "no_data")
798 && ($D8_11 ne "no_data")
799 && ($D12_15 ne "no_data"))
801 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
802 $NUMBL, "vpxorq", $L0B0_3, $L0B4_7, $L0B8_11, $L0B12_15, $L0B0_3,
803 $L0B4_7, $L0B8_11, $L0B12_15, $D0_3, $D4_7, $D8_11, $D12_15);
808 # ;;; Horizontal XOR - 4 x 128bits xored together
# Folds the four 128-bit lanes of $REG into its low xmm lane by two
# extract+xor steps (256->128, then high/low 128 of the ymm).
# NOTE(review): the `sub VHPXORI4x128 {` opener and heredoc markers are
# elided from this sampled listing.
810 my $REG = $_[0]; # [in/out] ZMM with 4x128bits to xor; 128bit output
811 my $TMP = $_[1]; # [clobbered] ZMM temporary register
813 vextracti64x4 \$1,$REG,@{[YWORD($TMP)]}
814 vpxorq @{[YWORD($TMP)]},@{[YWORD($REG)]},@{[YWORD($REG)]}
815 vextracti32x4 \$1,@{[YWORD($REG)]},@{[XWORD($TMP)]}
816 vpxorq @{[XWORD($TMP)]},@{[XWORD($REG)]},@{[XWORD($REG)]}
820 # ;;; AVX512 reduction macro
# Two-phase Montgomery-style reduction of a 256-bit carry-less product
# (HI128:LO128) modulo the GHASH polynomial, leaving the 128-bit result
# in $OUT.  NOTE(review): the `sub VCLMUL_REDUCE {` opener and heredoc
# markers are elided from this sampled listing.
822 my $OUT = $_[0]; # [out] zmm/ymm/xmm: result (must not be $TMP1 or $HI128)
823 my $POLY = $_[1]; # [in] zmm/ymm/xmm: polynomial
824 my $HI128 = $_[2]; # [in] zmm/ymm/xmm: high 128b of hash to reduce
825 my $LO128 = $_[3]; # [in] zmm/ymm/xmm: low 128b of hash to reduce
826 my $TMP0 = $_[4]; # [in] zmm/ymm/xmm: temporary register
827 my $TMP1 = $_[5]; # [in] zmm/ymm/xmm: temporary register
830 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
831 # ;; first phase of the reduction
832 vpclmulqdq \$0x01,$LO128,$POLY,$TMP0
833 vpslldq \$8,$TMP0,$TMP0 # ; shift-L 2 DWs
834 vpxorq $TMP0,$LO128,$TMP0 # ; first phase of the reduction complete
835 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
836 # ;; second phase of the reduction
837 vpclmulqdq \$0x00,$TMP0,$POLY,$TMP1
838 vpsrldq \$4,$TMP1,$TMP1 # ; shift-R only 1-DW to obtain 2-DWs shift-R
839 vpclmulqdq \$0x10,$TMP0,$POLY,$OUT
840 vpslldq \$4,$OUT,$OUT # ; shift-L 1-DW to obtain result with no shifts
841 vpternlogq \$0x96,$HI128,$TMP1,$OUT # ; OUT/GHASH = OUT xor TMP1 xor HI128
842 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
846 # ;; ===========================================================================
847 # ;; schoolbook multiply of 16 blocks (16 x 16 bytes)
848 # ;; - it is assumed that data read from $INPTR is already shuffled and
849 # ;; $INPTR address is 64 byte aligned
850 # ;; - there is an option to pass ready blocks through ZMM registers too.
851 # ;; 4 extra parameters need to be passed in such case and 21st ($ZTMP9) argument can be empty
# NOTE(review): the `sub GHASH_16 {` opener, the branches that set
# $start_ghash/$do_reduction for each $TYPE, and the heredoc markers are
# elided from this sampled listing.
853 my $TYPE = $_[0]; # [in] ghash type: start (xor hash), mid, end (same as mid; no reduction),
854 # end_reduce (end with reduction), start_reduce
855 my $GH = $_[1]; # [in/out] ZMM ghash sum: high 128-bits
856 my $GM = $_[2]; # [in/out] ZMM ghash sum: middle 128-bits
857 my $GL = $_[3]; # [in/out] ZMM ghash sum: low 128-bits
858 my $INPTR = $_[4]; # [in] data input pointer
859 my $INOFF = $_[5]; # [in] data input offset
860 my $INDIS = $_[6]; # [in] data input displacement
861 my $HKPTR = $_[7]; # [in] hash key pointer
862 my $HKOFF = $_[8]; # [in] hash key offset (can be either numerical offset, or register containing offset)
863 my $HKDIS = $_[9]; # [in] hash key displacement
864 my $HASH = $_[10]; # [in/out] ZMM hash value in/out
865 my $ZTMP0 = $_[11]; # [clobbered] temporary ZMM
866 my $ZTMP1 = $_[12]; # [clobbered] temporary ZMM
867 my $ZTMP2 = $_[13]; # [clobbered] temporary ZMM
868 my $ZTMP3 = $_[14]; # [clobbered] temporary ZMM
869 my $ZTMP4 = $_[15]; # [clobbered] temporary ZMM
870 my $ZTMP5 = $_[16]; # [clobbered] temporary ZMM
871 my $ZTMP6 = $_[17]; # [clobbered] temporary ZMM
872 my $ZTMP7 = $_[18]; # [clobbered] temporary ZMM
873 my $ZTMP8 = $_[19]; # [clobbered] temporary ZMM
874 my $ZTMP9 = $_[20]; # [clobbered] temporary ZMM, can be empty if 4 extra parameters below are provided
875 my $DAT0 = $_[21]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
876 my $DAT1 = $_[22]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
877 my $DAT2 = $_[23]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
878 my $DAT3 = $_[24]; # [in] ZMM with 4 blocks of input data (INPTR, INOFF, INDIS unused)
881 my $do_reduction = 0;
882 if ($TYPE eq "start") {
886 if ($TYPE eq "start_reduce") {
891 if ($TYPE eq "end_reduce") {
# Four 4-block lanes are processed below; each lane does four vpclmulqdq
# (high/low/mid1/mid2 partial products) against the matching hash keys,
# accumulated into $GH/$GM/$GL, with an optional final reduction.
895 # ;; ghash blocks 0-3
# 21-argument call form: data is read from memory; otherwise the $DAT0-3
# registers are used (the else branches are elided in this listing).
896 if (scalar(@_) == 21) {
897 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+0*64))]},$ZTMP9\n";
# "start*" types fold the previous hash value into the first data lane.
902 if ($start_ghash != 0) {
903 $code .= "vpxorq $HASH,$ZTMP9,$ZTMP9\n";
906 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+0*64))]},$ZTMP8
907 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
908 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
909 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
910 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
913 # ;; ghash blocks 4-7
914 if (scalar(@_) == 21) {
915 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+1*64))]},$ZTMP9\n";
920 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+1*64))]},$ZTMP8
921 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
922 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
923 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
924 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
# First accumulation: plain xor when starting a fresh sum, xor3
# (vpternlogq 0x96) when adding onto carried-in $GH/$GM/$GL.
928 if ($start_ghash != 0) {
930 vpxorq $ZTMP6,$ZTMP2,$GM # ; GM = T0M1 + T1M1
931 vpxorq $ZTMP4,$ZTMP0,$GH # ; GH = T0H + T1H
932 vpxorq $ZTMP5,$ZTMP1,$GL # ; GL = T0L + T1L
933 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM = T0M2 + T1M1
935 } else { # ;; mid, end, end_reduce
937 vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
938 vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
939 vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
940 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1
944 # ;; ghash blocks 8-11
945 if (scalar(@_) == 21) {
946 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+2*64))]},$ZTMP9\n";
951 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+2*64))]},$ZTMP8
952 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP0 # ; T0H = a1*b1
953 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP1 # ; T0L = a0*b0
954 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP2 # ; T0M1 = a1*b0
955 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP3 # ; T0M2 = a0*b1
958 # ;; ghash blocks 12-15
959 if (scalar(@_) == 21) {
960 $code .= "vmovdqa64 @{[EffectiveAddress($INPTR,$INOFF,($INDIS+3*64))]},$ZTMP9\n";
965 vmovdqu64 @{[EffectiveAddress($HKPTR,$HKOFF,($HKDIS+3*64))]},$ZTMP8
966 vpclmulqdq \$0x11,$ZTMP8,$ZTMP9,$ZTMP4 # ; T1H = a1*b1
967 vpclmulqdq \$0x00,$ZTMP8,$ZTMP9,$ZTMP5 # ; T1L = a0*b0
968 vpclmulqdq \$0x01,$ZTMP8,$ZTMP9,$ZTMP6 # ; T1M1 = a1*b0
969 vpclmulqdq \$0x10,$ZTMP8,$ZTMP9,$ZTMP7 # ; T1M2 = a0*b1
971 vpternlogq \$0x96,$ZTMP6,$ZTMP2,$GM # ; GM += T0M1 + T1M1
972 vpternlogq \$0x96,$ZTMP4,$ZTMP0,$GH # ; GH += T0H + T1H
973 vpternlogq \$0x96,$ZTMP5,$ZTMP1,$GL # ; GL += T0L + T1L
974 vpternlogq \$0x96,$ZTMP7,$ZTMP3,$GM # ; GM += T0M2 + T1M1
976 if ($do_reduction != 0) {
978 # ;; integrate GM into GH and GL
979 vpsrldq \$8,$GM,$ZTMP0
980 vpslldq \$8,$GM,$ZTMP1
981 vpxorq $ZTMP0,$GH,$GH
982 vpxorq $ZTMP1,$GL,$GL
985 # ;; add GH and GL 128-bit words horizontally
986 &VHPXORI4x128($GH, $ZTMP0);
987 &VHPXORI4x128($GL, $ZTMP1);
# Final modular reduction against POLY2 leaves the result in xmm($HASH).
990 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZTMP2)]}\n";
991 &VCLMUL_REDUCE(&XWORD($HASH), &XWORD($ZTMP2), &XWORD($GH), &XWORD($GL), &XWORD($ZTMP0), &XWORD($ZTMP1));
995 # ;; ===========================================================================
996 # ;; GHASH 1 to 16 blocks of cipher text
997 # ;; - performs reduction at the end
998 # ;; - it doesn't load the data and it assumed it is already loaded and shuffled
1000 my $GCM128_CTX = $_[0]; # [in] pointer to expanded keys
1001 my $GHASH = $_[1]; # [out] ghash output
1002 my $T0H = $_[2]; # [clobbered] temporary ZMM
1003 my $T0L = $_[3]; # [clobbered] temporary ZMM
1004 my $T0M1 = $_[4]; # [clobbered] temporary ZMM
1005 my $T0M2 = $_[5]; # [clobbered] temporary ZMM
1006 my $T1H = $_[6]; # [clobbered] temporary ZMM
1007 my $T1L = $_[7]; # [clobbered] temporary ZMM
1008 my $T1M1 = $_[8]; # [clobbered] temporary ZMM
1009 my $T1M2 = $_[9]; # [clobbered] temporary ZMM
1010 my $HK = $_[10]; # [clobbered] temporary ZMM
1011 my $AAD_HASH_IN = $_[11]; # [in] input hash value
1013 $CIPHER_IN[0] = $_[12]; # [in] ZMM with cipher text blocks 0-3
1014 $CIPHER_IN[1] = $_[13]; # [in] ZMM with cipher text blocks 4-7
1015 $CIPHER_IN[2] = $_[14]; # [in] ZMM with cipher text blocks 8-11
1016 $CIPHER_IN[3] = $_[15]; # [in] ZMM with cipher text blocks 12-15
1017 my $NUM_BLOCKS = $_[16]; # [in] numerical value, number of blocks
1018 my $GH = $_[17]; # [in] ZMM with hi product part
1019 my $GM = $_[18]; # [in] ZMM with mid product part
1020 my $GL = $_[19]; # [in] ZMM with lo product part
1022 die "GHASH_1_TO_16: num_blocks is out of bounds = $NUM_BLOCKS\n" if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
1024 if (scalar(@_) == 17) {
1025 $code .= "vpxorq $AAD_HASH_IN,$CIPHER_IN[0],$CIPHER_IN[0]\n";
1028 if ($NUM_BLOCKS == 16) {
1030 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
1031 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
1032 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
1033 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
1034 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
1035 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
1036 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
1037 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
1038 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
1039 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
1040 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
1041 vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
1042 vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
1043 vpternlogq \$0x96,$T1H,$CIPHER_IN[0],$T0H
1044 vpternlogq \$0x96,$T1L,$CIPHER_IN[1],$T0L
1045 vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
1046 vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
1047 vpternlogq \$0x96,$T1M1,$CIPHER_IN[0],$T0M1
1048 vpternlogq \$0x96,$T1M2,$CIPHER_IN[1],$T0M2
1049 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-3*4, $GCM128_CTX)]},$HK
1050 vpclmulqdq \$0x11,$HK,$CIPHER_IN[3],$T1H # ; H = a1*b1
1051 vpclmulqdq \$0x00,$HK,$CIPHER_IN[3],$T1L # ; L = a0*b0
1052 vpclmulqdq \$0x01,$HK,$CIPHER_IN[3],$T1M1 # ; M1 = a1*b0
1053 vpclmulqdq \$0x10,$HK,$CIPHER_IN[3],$T1M2 # ; M2 = a0*b1
1054 vpxorq $T1H,$T0H,$T1H
1055 vpxorq $T1L,$T0L,$T1L
1056 vpxorq $T1M1,$T0M1,$T1M1
1057 vpxorq $T1M2,$T0M2,$T1M2
1059 } elsif ($NUM_BLOCKS >= 12) {
1061 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
1062 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
1063 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
1064 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
1065 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
1066 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
1067 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
1068 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
1069 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
1070 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
1071 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-2*4, $GCM128_CTX)]},$HK
1072 vpclmulqdq \$0x11,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; H = a1*b1
1073 vpclmulqdq \$0x00,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; L = a0*b0
1074 vpternlogq \$0x96,$T0H,$CIPHER_IN[0],$T1H
1075 vpternlogq \$0x96,$T0L,$CIPHER_IN[1],$T1L
1076 vpclmulqdq \$0x01,$HK,$CIPHER_IN[2],$CIPHER_IN[0] # ; M1 = a1*b0
1077 vpclmulqdq \$0x10,$HK,$CIPHER_IN[2],$CIPHER_IN[1] # ; M2 = a0*b1
1078 vpternlogq \$0x96,$T0M1,$CIPHER_IN[0],$T1M1
1079 vpternlogq \$0x96,$T0M2,$CIPHER_IN[1],$T1M2
1081 } elsif ($NUM_BLOCKS >= 8) {
1083 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
1084 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T0H # ; H = a1*b1
1085 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T0L # ; L = a0*b0
1086 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T0M1 # ; M1 = a1*b0
1087 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T0M2 # ; M2 = a0*b1
1088 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS-1*4, $GCM128_CTX)]},$HK
1089 vpclmulqdq \$0x11,$HK,$CIPHER_IN[1],$T1H # ; H = a1*b1
1090 vpclmulqdq \$0x00,$HK,$CIPHER_IN[1],$T1L # ; L = a0*b0
1091 vpclmulqdq \$0x01,$HK,$CIPHER_IN[1],$T1M1 # ; M1 = a1*b0
1092 vpclmulqdq \$0x10,$HK,$CIPHER_IN[1],$T1M2 # ; M2 = a0*b1
1093 vpxorq $T1H,$T0H,$T1H
1094 vpxorq $T1L,$T0L,$T1L
1095 vpxorq $T1M1,$T0M1,$T1M1
1096 vpxorq $T1M2,$T0M2,$T1M2
1098 } elsif ($NUM_BLOCKS >= 4) {
1100 vmovdqu64 @{[HashKeyByIdx($NUM_BLOCKS, $GCM128_CTX)]},$HK
1101 vpclmulqdq \$0x11,$HK,$CIPHER_IN[0],$T1H # ; H = a1*b1
1102 vpclmulqdq \$0x00,$HK,$CIPHER_IN[0],$T1L # ; L = a0*b0
1103 vpclmulqdq \$0x01,$HK,$CIPHER_IN[0],$T1M1 # ; M1 = a1*b0
1104 vpclmulqdq \$0x10,$HK,$CIPHER_IN[0],$T1M2 # ; M2 = a0*b1
1108 # ;; T1H/L/M1/M2 - hold current product sums (provided $NUM_BLOCKS >= 4)
1109 my $blocks_left = ($NUM_BLOCKS % 4);
1110 if ($blocks_left > 0) {
1112 # ;; =====================================================
1113 # ;; There are 1, 2 or 3 blocks left to process.
1114 # ;; It may also be that they are the only blocks to process.
1116 # ;; Set hash key and register index position for the remaining 1 to 3 blocks
1117 my $reg_idx = ($NUM_BLOCKS / 4);
1118 my $REG_IN = $CIPHER_IN[$reg_idx];
1120 if ($blocks_left == 1) {
1122 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[XWORD($HK)]}
1123 vpclmulqdq \$0x01,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M1)]} # ; M1 = a1*b0
1124 vpclmulqdq \$0x10,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0M2)]} # ; M2 = a0*b1
1125 vpclmulqdq \$0x11,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0H)]} # ; H = a1*b1
1126 vpclmulqdq \$0x00,@{[XWORD($HK)]},@{[XWORD($REG_IN)]},@{[XWORD($T0L)]} # ; L = a0*b0
1128 } elsif ($blocks_left == 2) {
1130 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
1131 vpclmulqdq \$0x01,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M1)]} # ; M1 = a1*b0
1132 vpclmulqdq \$0x10,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0M2)]} # ; M2 = a0*b1
1133 vpclmulqdq \$0x11,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0H)]} # ; H = a1*b1
1134 vpclmulqdq \$0x00,@{[YWORD($HK)]},@{[YWORD($REG_IN)]},@{[YWORD($T0L)]} # ; L = a0*b0
1136 } else { # ; blocks_left == 3
1138 vmovdqu64 @{[HashKeyByIdx($blocks_left, $GCM128_CTX)]},@{[YWORD($HK)]}
1139 vinserti64x2 \$2,@{[HashKeyByIdx($blocks_left-2, $GCM128_CTX)]},$HK,$HK
1140 vpclmulqdq \$0x01,$HK,$REG_IN,$T0M1 # ; M1 = a1*b0
1141 vpclmulqdq \$0x10,$HK,$REG_IN,$T0M2 # ; M2 = a0*b1
1142 vpclmulqdq \$0x11,$HK,$REG_IN,$T0H # ; H = a1*b1
1143 vpclmulqdq \$0x00,$HK,$REG_IN,$T0L # ; L = a0*b0
1147 if (scalar(@_) == 20) {
1149 # ;; *** GH/GM/GL passed as arguments
1150 if ($NUM_BLOCKS >= 4) {
1152 # ;; add ghash product sums from the first 4, 8 or 12 blocks
1153 vpxorq $T1M1,$T0M1,$T0M1
1154 vpternlogq \$0x96,$T1M2,$GM,$T0M2
1155 vpternlogq \$0x96,$T1H,$GH,$T0H
1156 vpternlogq \$0x96,$T1L,$GL,$T0L
1160 vpxorq $GM,$T0M1,$T0M1
1161 vpxorq $GH,$T0H,$T0H
1162 vpxorq $GL,$T0L,$T0L
1167 # ;; *** GH/GM/GL NOT passed as arguments
1168 if ($NUM_BLOCKS >= 4) {
1170 # ;; add ghash product sums from the first 4, 8 or 12 blocks
1171 vpxorq $T1M1,$T0M1,$T0M1
1172 vpxorq $T1M2,$T0M2,$T0M2
1173 vpxorq $T1H,$T0H,$T0H
1174 vpxorq $T1L,$T0L,$T0L
1179 # ;; integrate TM into TH and TL
1180 vpxorq $T0M2,$T0M1,$T0M1
1181 vpsrldq \$8,$T0M1,$T1M1
1182 vpslldq \$8,$T0M1,$T1M2
1183 vpxorq $T1M1,$T0H,$T0H
1184 vpxorq $T1M2,$T0L,$T0L
1188 # ;; =====================================================
1189 # ;; number of blocks is 4, 8, 12 or 16
1190 # ;; T1H/L/M1/M2 include product sums not T0H/L/M1/M2
1191 if (scalar(@_) == 20) {
1193 # ;; *** GH/GM/GL passed as arguments
1194 vpxorq $GM,$T1M1,$T1M1
1195 vpxorq $GH,$T1H,$T1H
1196 vpxorq $GL,$T1L,$T1L
1200 # ;; integrate TM into TH and TL
1201 vpxorq $T1M2,$T1M1,$T1M1
1202 vpsrldq \$8,$T1M1,$T0M1
1203 vpslldq \$8,$T1M1,$T0M2
1204 vpxorq $T0M1,$T1H,$T0H
1205 vpxorq $T0M2,$T1L,$T0L
1209 # ;; add TH and TL 128-bit words horizontally
1210 &VHPXORI4x128($T0H, $T1M1);
1211 &VHPXORI4x128($T0L, $T1M2);
1214 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($HK)]}\n";
1224 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1225 # ;; GHASH_MUL MACRO to implement: Data*HashKey mod (x^128 + x^127 + x^126 +x^121 + 1)
1226 # ;; Input: A and B (128-bits each, bit-reflected)
1227 # ;; Output: C = A*B*x mod poly, (i.e. >>1 )
1228 # ;; To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
1229 # ;; GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
1231 # ;; Refer to [3] for more details.
1232 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1234 my $GH = $_[0]; #; [in/out] xmm/ymm/zmm with multiply operand(s) (128-bits)
1235 my $HK = $_[1]; #; [in] xmm/ymm/zmm with hash key value(s) (128-bits)
1236 my $T1 = $_[2]; #; [clobbered] xmm/ymm/zmm
1237 my $T2 = $_[3]; #; [clobbered] xmm/ymm/zmm
1238 my $T3 = $_[4]; #; [clobbered] xmm/ymm/zmm
1241 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1242 vpclmulqdq \$0x11,$HK,$GH,$T1 # ; $T1 = a1*b1
1243 vpclmulqdq \$0x00,$HK,$GH,$T2 # ; $T2 = a0*b0
1244 vpclmulqdq \$0x01,$HK,$GH,$T3 # ; $T3 = a1*b0
1245 vpclmulqdq \$0x10,$HK,$GH,$GH # ; $GH = a0*b1
1248 vpsrldq \$8,$GH,$T3 # ; shift-R $GH 2 DWs
1249 vpslldq \$8,$GH,$GH # ; shift-L $GH 2 DWs
1253 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1254 # ;first phase of the reduction
1255 vmovdqu64 POLY2(%rip),$T3
1257 vpclmulqdq \$0x01,$GH,$T3,$T2
1258 vpslldq \$8,$T2,$T2 # ; shift-L $T2 2 DWs
1259 vpxorq $T2,$GH,$GH # ; first phase of the reduction complete
1261 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1262 # ;second phase of the reduction
1263 vpclmulqdq \$0x00,$GH,$T3,$T2
1264 vpsrldq \$4,$T2,$T2 # ; shift-R only 1-DW to obtain 2-DWs shift-R
1265 vpclmulqdq \$0x10,$GH,$T3,$GH
1266 vpslldq \$4,$GH,$GH # ; Shift-L 1-DW to obtain result with no shifts
1267 # ; second phase of the reduction complete, the result is in $GH
1268 vpternlogq \$0x96,$T2,$T1,$GH # ; GH = GH xor T1 xor T2
1269 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1273 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1274 # ;;; PRECOMPUTE computes HashKey_i
1276 my $GCM128_CTX = $_[0]; #; [in/out] context pointer, hkeys content updated
1277 my $HK = $_[1]; #; [in] xmm, hash key
1278 my $T1 = $_[2]; #; [clobbered] xmm
1279 my $T2 = $_[3]; #; [clobbered] xmm
1280 my $T3 = $_[4]; #; [clobbered] xmm
1281 my $T4 = $_[5]; #; [clobbered] xmm
1282 my $T5 = $_[6]; #; [clobbered] xmm
1283 my $T6 = $_[7]; #; [clobbered] xmm
1285 my $ZT1 = &ZWORD($T1);
1286 my $ZT2 = &ZWORD($T2);
1287 my $ZT3 = &ZWORD($T3);
1288 my $ZT4 = &ZWORD($T4);
1289 my $ZT5 = &ZWORD($T5);
1290 my $ZT6 = &ZWORD($T6);
1292 my $YT1 = &YWORD($T1);
1293 my $YT2 = &YWORD($T2);
1294 my $YT3 = &YWORD($T3);
1295 my $YT4 = &YWORD($T4);
1296 my $YT5 = &YWORD($T5);
1297 my $YT6 = &YWORD($T6);
1300 vshufi32x4 \$0x00,@{[YWORD($HK)]},@{[YWORD($HK)]},$YT5
1304 # ;; calculate HashKey^2<<1 mod poly
1305 &GHASH_MUL($YT4, $YT5, $YT1, $YT2, $YT3);
1308 vmovdqu64 $T4,@{[HashKeyByIdx(2,$GCM128_CTX)]}
1309 vinserti64x2 \$1,$HK,$YT4,$YT5
1310 vmovdqa64 $YT5,$YT6 # ;; YT6 = HashKey | HashKey^2
1313 # ;; use 2x128-bit computation
1314 # ;; calculate HashKey^4<<1 mod poly, HashKey^3<<1 mod poly
1315 &GHASH_MUL($YT5, $YT4, $YT1, $YT2, $YT3); # ;; YT5 = HashKey^3 | HashKey^4
1318 vmovdqu64 $YT5,@{[HashKeyByIdx(4,$GCM128_CTX)]}
1320 vinserti64x4 \$1,$YT6,$ZT5,$ZT5 # ;; ZT5 = YT6 | YT5
1322 # ;; switch to 4x128-bit computations now
1323 vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^4 across all ZT4
1324 vmovdqa64 $ZT5,$ZT6 # ;; save HashKey^4 to HashKey^1 in ZT6
1327 # ;; calculate HashKey^5<<1 mod poly, HashKey^6<<1 mod poly, ... HashKey^8<<1 mod poly
1328 &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
1330 vmovdqu64 $ZT5,@{[HashKeyByIdx(8,$GCM128_CTX)]} # ;; HashKey^8 to HashKey^5 in ZT5 now
1331 vshufi64x2 \$0x00,$ZT5,$ZT5,$ZT4 # ;; broadcast HashKey^8 across all ZT4
1334 # ;; calculate HashKey^9<<1 mod poly, HashKey^10<<1 mod poly, ... HashKey^16<<1 mod poly
1335 # ;; use HashKey^8 as multiplier against ZT6 and ZT5 - this allows deeper ooo execution
1337 # ;; compute HashKey^(12), HashKey^(11), ... HashKey^(9)
1338 &GHASH_MUL($ZT6, $ZT4, $ZT1, $ZT2, $ZT3);
1339 $code .= "vmovdqu64 $ZT6,@{[HashKeyByIdx(12,$GCM128_CTX)]}\n";
1341 # ;; compute HashKey^(16), HashKey^(15), ... HashKey^(13)
1342 &GHASH_MUL($ZT5, $ZT4, $ZT1, $ZT2, $ZT3);
1343 $code .= "vmovdqu64 $ZT5,@{[HashKeyByIdx(16,$GCM128_CTX)]}\n";
1345 # ; Hkeys 17..48 will be precomputed somewhere else as context can hold only 16 hkeys
1348 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1349 # ;; READ_SMALL_DATA_INPUT
1350 # ;; Packs xmm register with data when data input is less or equal to 16 bytes
1351 # ;; Returns 0 if data has length 0
1352 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1353 sub READ_SMALL_DATA_INPUT {
# Emits assembly that loads 1..16 input bytes into an XMM register using a
# masked, zeroing vmovdqu8, and leaves the partial-block byte mask in a
# k-register for the caller to reuse when storing/hashing the partial block.
1354 my $OUTPUT = $_[0]; # [out] xmm register
1355 my $INPUT = $_[1]; # [in] buffer pointer to read from
1356 my $LENGTH = $_[2]; # [in] number of bytes to read
1357 my $TMP1 = $_[3]; # [clobbered]
1358 my $TMP2 = $_[4]; # [clobbered]
1359 my $MASK = $_[5]; # [out] k1 to k7 register to store the partial block mask
# ;; start with 16 in $TMP2 and point $TMP1 at the length->mask lookup table
1362 mov \$16,@{[DWORD($TMP2)]}
1363 lea byte_len_to_mask_table(%rip),$TMP1
# NOTE(review): lines between the table setup and the kmovw below are not
# visible in this extract; presumably $TMP2 is clamped to $LENGTH before the
# table lookup - confirm against the full source.
# ;; fetch the 16-bit byte mask for the (clamped) length; table entries are 2 bytes wide
1374 $code .= "kmovw ($TMP1,$TMP2,2),$MASK\n";
# ;; masked zeroing load: bytes beyond the mask come back as zero in $OUTPUT
1376 $code .= "vmovdqu8 ($INPUT),${OUTPUT}{$MASK}{z}\n";
1379 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1380 # CALC_AAD_HASH: Calculates the hash of the data which will not be encrypted.
1381 # Input: The input data (A_IN), that data's length (A_LEN), and the hash key (HASH_KEY).
1382 # Output: The hash of the data (AAD_HASH).
1383 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1385 my $A_IN = $_[0]; # [in] AAD text pointer
1386 my $A_LEN = $_[1]; # [in] AAD length
1387 my $AAD_HASH = $_[2]; # [in/out] xmm ghash value
1388 my $GCM128_CTX = $_[3]; # [in] pointer to context
1389 my $ZT0 = $_[4]; # [clobbered] ZMM register
1390 my $ZT1 = $_[5]; # [clobbered] ZMM register
1391 my $ZT2 = $_[6]; # [clobbered] ZMM register
1392 my $ZT3 = $_[7]; # [clobbered] ZMM register
1393 my $ZT4 = $_[8]; # [clobbered] ZMM register
1394 my $ZT5 = $_[9]; # [clobbered] ZMM register
1395 my $ZT6 = $_[10]; # [clobbered] ZMM register
1396 my $ZT7 = $_[11]; # [clobbered] ZMM register
1397 my $ZT8 = $_[12]; # [clobbered] ZMM register
1398 my $ZT9 = $_[13]; # [clobbered] ZMM register
1399 my $ZT10 = $_[14]; # [clobbered] ZMM register
1400 my $ZT11 = $_[15]; # [clobbered] ZMM register
1401 my $ZT12 = $_[16]; # [clobbered] ZMM register
1402 my $ZT13 = $_[17]; # [clobbered] ZMM register
1403 my $ZT14 = $_[18]; # [clobbered] ZMM register
1404 my $ZT15 = $_[19]; # [clobbered] ZMM register
1405 my $ZT16 = $_[20]; # [clobbered] ZMM register
1406 my $T1 = $_[21]; # [clobbered] GP register
1407 my $T2 = $_[22]; # [clobbered] GP register
1408 my $T3 = $_[23]; # [clobbered] GP register
1409 my $MASKREG = $_[24]; # [clobbered] mask register
1411 my $HKEYS_READY = "%rbx";
1415 my $label_suffix = $label_count++;
1418 mov $A_IN,$T1 # ; T1 = AAD
1419 mov $A_LEN,$T2 # ; T2 = aadLen
1421 jz .L_CALC_AAD_done_${label_suffix}
1423 xor $HKEYS_READY,$HKEYS_READY
1424 vmovdqa64 SHUF_MASK(%rip),$SHFMSK
1426 .L_get_AAD_loop48x16_${label_suffix}:
1428 jl .L_exit_AAD_loop48x16_${label_suffix}
1432 vmovdqu64 `64*0`($T1),$ZT1 # ; Blocks 0-3
1433 vmovdqu64 `64*1`($T1),$ZT2 # ; Blocks 4-7
1434 vmovdqu64 `64*2`($T1),$ZT3 # ; Blocks 8-11
1435 vmovdqu64 `64*3`($T1),$ZT4 # ; Blocks 12-15
1436 vpshufb $SHFMSK,$ZT1,$ZT1
1437 vpshufb $SHFMSK,$ZT2,$ZT2
1438 vpshufb $SHFMSK,$ZT3,$ZT3
1439 vpshufb $SHFMSK,$ZT4,$ZT4
1442 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "all");
1443 $code .= "mov \$1,$HKEYS_READY\n";
1446 "start", $ZT5, $ZT6, $ZT7,
1447 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1448 &HashKeyOffsetByIdx(48, "frame"), 0, "@{[ZWORD($AAD_HASH)]}", $ZT0,
1449 $ZT8, $ZT9, $ZT10, $ZT11,
1450 $ZT12, $ZT14, $ZT15, $ZT16,
1451 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1455 vmovdqu64 `16*16 + 64*0`($T1),$ZT1 # ; Blocks 16-19
1456 vmovdqu64 `16*16 + 64*1`($T1),$ZT2 # ; Blocks 20-23
1457 vmovdqu64 `16*16 + 64*2`($T1),$ZT3 # ; Blocks 24-27
1458 vmovdqu64 `16*16 + 64*3`($T1),$ZT4 # ; Blocks 28-31
1459 vpshufb $SHFMSK,$ZT1,$ZT1
1460 vpshufb $SHFMSK,$ZT2,$ZT2
1461 vpshufb $SHFMSK,$ZT3,$ZT3
1462 vpshufb $SHFMSK,$ZT4,$ZT4
1466 "mid", $ZT5, $ZT6, $ZT7,
1467 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1468 &HashKeyOffsetByIdx(32, "frame"), 0, "NO_HASH_IN_OUT", $ZT0,
1469 $ZT8, $ZT9, $ZT10, $ZT11,
1470 $ZT12, $ZT14, $ZT15, $ZT16,
1471 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1475 vmovdqu64 `32*16 + 64*0`($T1),$ZT1 # ; Blocks 32-35
1476 vmovdqu64 `32*16 + 64*1`($T1),$ZT2 # ; Blocks 36-39
1477 vmovdqu64 `32*16 + 64*2`($T1),$ZT3 # ; Blocks 40-43
1478 vmovdqu64 `32*16 + 64*3`($T1),$ZT4 # ; Blocks 44-47
1479 vpshufb $SHFMSK,$ZT1,$ZT1
1480 vpshufb $SHFMSK,$ZT2,$ZT2
1481 vpshufb $SHFMSK,$ZT3,$ZT3
1482 vpshufb $SHFMSK,$ZT4,$ZT4
1486 "end_reduce", $ZT5, $ZT6, $ZT7,
1487 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1488 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1489 $ZT8, $ZT9, $ZT10, $ZT11,
1490 $ZT12, $ZT14, $ZT15, $ZT16,
1491 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1496 je .L_CALC_AAD_done_${label_suffix}
1499 jmp .L_get_AAD_loop48x16_${label_suffix}
1501 .L_exit_AAD_loop48x16_${label_suffix}:
1502 # ; Less than 48x16 bytes remaining
1504 jl .L_less_than_32x16_${label_suffix}
1508 # ; Get next 16 blocks
1509 vmovdqu64 `64*0`($T1),$ZT1
1510 vmovdqu64 `64*1`($T1),$ZT2
1511 vmovdqu64 `64*2`($T1),$ZT3
1512 vmovdqu64 `64*3`($T1),$ZT4
1513 vpshufb $SHFMSK,$ZT1,$ZT1
1514 vpshufb $SHFMSK,$ZT2,$ZT2
1515 vpshufb $SHFMSK,$ZT3,$ZT3
1516 vpshufb $SHFMSK,$ZT4,$ZT4
1519 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZT0, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT14, "first32");
1520 $code .= "mov \$1,$HKEYS_READY\n";
1523 "start", $ZT5, $ZT6, $ZT7,
1524 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1525 &HashKeyOffsetByIdx(32, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1526 $ZT8, $ZT9, $ZT10, $ZT11,
1527 $ZT12, $ZT14, $ZT15, $ZT16,
1528 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1532 vmovdqu64 `16*16 + 64*0`($T1),$ZT1
1533 vmovdqu64 `16*16 + 64*1`($T1),$ZT2
1534 vmovdqu64 `16*16 + 64*2`($T1),$ZT3
1535 vmovdqu64 `16*16 + 64*3`($T1),$ZT4
1536 vpshufb $SHFMSK,$ZT1,$ZT1
1537 vpshufb $SHFMSK,$ZT2,$ZT2
1538 vpshufb $SHFMSK,$ZT3,$ZT3
1539 vpshufb $SHFMSK,$ZT4,$ZT4
1543 "end_reduce", $ZT5, $ZT6, $ZT7,
1544 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", "%rsp",
1545 &HashKeyOffsetByIdx(16, "frame"), 0, &ZWORD($AAD_HASH), $ZT0,
1546 $ZT8, $ZT9, $ZT10, $ZT11,
1547 $ZT12, $ZT14, $ZT15, $ZT16,
1548 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1553 je .L_CALC_AAD_done_${label_suffix}
1556 jmp .L_less_than_16x16_${label_suffix}
1558 .L_less_than_32x16_${label_suffix}:
1560 jl .L_less_than_16x16_${label_suffix}
1561 # ; Get next 16 blocks
1562 vmovdqu64 `64*0`($T1),$ZT1
1563 vmovdqu64 `64*1`($T1),$ZT2
1564 vmovdqu64 `64*2`($T1),$ZT3
1565 vmovdqu64 `64*3`($T1),$ZT4
1566 vpshufb $SHFMSK,$ZT1,$ZT1
1567 vpshufb $SHFMSK,$ZT2,$ZT2
1568 vpshufb $SHFMSK,$ZT3,$ZT3
1569 vpshufb $SHFMSK,$ZT4,$ZT4
1572 # ; This code path does not use more than 16 hkeys, so they can be taken from the context
1573 # ; (not from the stack storage)
1575 "start_reduce", $ZT5, $ZT6, $ZT7,
1576 "NO_INPUT_PTR", "NO_INPUT_PTR", "NO_INPUT_PTR", $GCM128_CTX,
1577 &HashKeyOffsetByIdx(16, "context"), 0, &ZWORD($AAD_HASH), $ZT0,
1578 $ZT8, $ZT9, $ZT10, $ZT11,
1579 $ZT12, $ZT14, $ZT15, $ZT16,
1580 "NO_ZMM", $ZT1, $ZT2, $ZT3,
1585 je .L_CALC_AAD_done_${label_suffix}
1588 # ; Less than 16x16 bytes remaining
1589 .L_less_than_16x16_${label_suffix}:
1590 # ;; prep mask source address
1591 lea byte64_len_to_mask_table(%rip),$T3
1594 # ;; calculate number of blocks to ghash (including partial bytes)
1595 add \$15,@{[DWORD($T2)]}
1596 shr \$4,@{[DWORD($T2)]}
1597 cmp \$2,@{[DWORD($T2)]}
1598 jb .L_AAD_blocks_1_${label_suffix}
1599 je .L_AAD_blocks_2_${label_suffix}
1600 cmp \$4,@{[DWORD($T2)]}
1601 jb .L_AAD_blocks_3_${label_suffix}
1602 je .L_AAD_blocks_4_${label_suffix}
1603 cmp \$6,@{[DWORD($T2)]}
1604 jb .L_AAD_blocks_5_${label_suffix}
1605 je .L_AAD_blocks_6_${label_suffix}
1606 cmp \$8,@{[DWORD($T2)]}
1607 jb .L_AAD_blocks_7_${label_suffix}
1608 je .L_AAD_blocks_8_${label_suffix}
1609 cmp \$10,@{[DWORD($T2)]}
1610 jb .L_AAD_blocks_9_${label_suffix}
1611 je .L_AAD_blocks_10_${label_suffix}
1612 cmp \$12,@{[DWORD($T2)]}
1613 jb .L_AAD_blocks_11_${label_suffix}
1614 je .L_AAD_blocks_12_${label_suffix}
1615 cmp \$14,@{[DWORD($T2)]}
1616 jb .L_AAD_blocks_13_${label_suffix}
1617 je .L_AAD_blocks_14_${label_suffix}
1618 cmp \$15,@{[DWORD($T2)]}
1619 je .L_AAD_blocks_15_${label_suffix}
1622 # ;; fall through for 16 blocks
1624 # ;; The flow of each of these cases is identical:
1625 # ;; - load blocks plain text
1626 # ;; - shuffle loaded blocks
1627 # ;; - xor in current hash value into block 0
1628 # ;; - perform multiplications with ghash keys
1629 # ;; - jump to reduction code
1631 for (my $aad_blocks = 16; $aad_blocks > 0; $aad_blocks--) {
1632 $code .= ".L_AAD_blocks_${aad_blocks}_${label_suffix}:\n";
1633 if ($aad_blocks > 12) {
1634 $code .= "sub \$`12*16*8`, $T3\n";
1635 } elsif ($aad_blocks > 8) {
1636 $code .= "sub \$`8*16*8`, $T3\n";
1637 } elsif ($aad_blocks > 4) {
1638 $code .= "sub \$`4*16*8`, $T3\n";
1640 $code .= "kmovq ($T3),$MASKREG\n";
1642 &ZMM_LOAD_MASKED_BLOCKS_0_16($aad_blocks, $T1, 0, $ZT1, $ZT2, $ZT3, $ZT4, $MASKREG);
1644 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16($aad_blocks, "vpshufb", $ZT1, $ZT2, $ZT3, $ZT4,
1645 $ZT1, $ZT2, $ZT3, $ZT4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
1647 &GHASH_1_TO_16($GCM128_CTX, &ZWORD($AAD_HASH),
1648 $ZT0, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, &ZWORD($AAD_HASH), $ZT1, $ZT2, $ZT3, $ZT4, $aad_blocks);
1650 if ($aad_blocks > 1) {
1652 # ;; fall through to CALC_AAD_done in 1 block case
1653 $code .= "jmp .L_CALC_AAD_done_${label_suffix}\n";
1657 $code .= ".L_CALC_AAD_done_${label_suffix}:\n";
1659 # ;; result in AAD_HASH
1662 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1664 # ;; Handles encryption/decryption and the tag partial blocks between update calls.
1666 # ;; Requires the input data be at least 1 byte long.
1668 # ;; A cipher/plain of the first partial block (CIPH_PLAIN_OUT),
1669 # ;; AAD_HASH and updated GCM128_CTX
1670 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1672 my $GCM128_CTX = $_[0]; # [in] key pointer
1673 my $PBLOCK_LEN = $_[1]; # [in] partial block length
1674 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
1675 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
1676 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
1677 my $DATA_OFFSET = $_[5]; # [out] data offset (gets set)
1678 my $AAD_HASH = $_[6]; # [out] updated GHASH value
1679 my $ENC_DEC = $_[7]; # [in] cipher direction
1680 my $GPTMP0 = $_[8]; # [clobbered] GP temporary register
1681 my $GPTMP1 = $_[9]; # [clobbered] GP temporary register
1682 my $GPTMP2 = $_[10]; # [clobbered] GP temporary register
1683 my $ZTMP0 = $_[11]; # [clobbered] ZMM temporary register
1684 my $ZTMP1 = $_[12]; # [clobbered] ZMM temporary register
1685 my $ZTMP2 = $_[13]; # [clobbered] ZMM temporary register
1686 my $ZTMP3 = $_[14]; # [clobbered] ZMM temporary register
1687 my $ZTMP4 = $_[15]; # [clobbered] ZMM temporary register
1688 my $ZTMP5 = $_[16]; # [clobbered] ZMM temporary register
1689 my $ZTMP6 = $_[17]; # [clobbered] ZMM temporary register
1690 my $ZTMP7 = $_[18]; # [clobbered] ZMM temporary register
1691 my $MASKREG = $_[19]; # [clobbered] mask temporary register
1693 my $XTMP0 = &XWORD($ZTMP0);
1694 my $XTMP1 = &XWORD($ZTMP1);
1695 my $XTMP2 = &XWORD($ZTMP2);
1696 my $XTMP3 = &XWORD($ZTMP3);
1697 my $XTMP4 = &XWORD($ZTMP4);
1698 my $XTMP5 = &XWORD($ZTMP5);
1699 my $XTMP6 = &XWORD($ZTMP6);
1700 my $XTMP7 = &XWORD($ZTMP7);
1702 my $LENGTH = $DATA_OFFSET;
1707 my $label_suffix = $label_count++;
1710 # ;; if no partial block present then LENGTH/DATA_OFFSET will be set to zero
1711 mov ($PBLOCK_LEN),$LENGTH
1713 je .L_partial_block_done_${label_suffix} # ;Leave Macro if no partial blocks
1716 &READ_SMALL_DATA_INPUT($XTMP0, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $IA0, $IA2, $MASKREG);
1719 # ;; XTMP1 = my_ctx_data.partial_block_enc_key
1720 vmovdqu64 $CTX_OFFSET_PEncBlock($GCM128_CTX),$XTMP1
1721 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},$XTMP2
1723 # ;; adjust the shuffle mask pointer to be able to shift right $LENGTH bytes
1724 # ;; (16 - $LENGTH) is the number of bytes in plaintext mod 16)
1725 lea SHIFT_MASK(%rip),$IA0
1727 vmovdqu64 ($IA0),$XTMP3 # ; shift right shuffle mask
1728 vpshufb $XTMP3,$XTMP1,$XTMP1
1731 if ($ENC_DEC eq "DEC") {
1733 # ;; keep copy of cipher text in $XTMP4
1734 vmovdqa64 $XTMP0,$XTMP4
1738 vpxorq $XTMP0,$XTMP1,$XTMP1 # ; Ciphertext XOR E(K, Yn)
1739 # ;; Set $IA1 to be the amount of data left in CIPH_PLAIN_IN after filling the block
1740 # ;; Determine if partial block is not being filled and shift mask accordingly
1744 mov $PLAIN_CIPH_LEN,$IA1
1748 $code .= "lea ($PLAIN_CIPH_LEN, $LENGTH, 1),$IA1\n";
1752 jge .L_no_extra_mask_${label_suffix}
1754 .L_no_extra_mask_${label_suffix}:
1755 # ;; get the appropriate mask to mask out bottom $LENGTH bytes of $XTMP1
1756 # ;; - mask out bottom $LENGTH bytes of $XTMP1
1757 # ;; sizeof(SHIFT_MASK) == 16 bytes
1758 vmovdqu64 16($IA0),$XTMP0
1759 vpand $XTMP0,$XTMP1,$XTMP1
1762 if ($ENC_DEC eq "DEC") {
1764 vpand $XTMP0,$XTMP4,$XTMP4
1765 vpshufb SHUF_MASK(%rip),$XTMP4,$XTMP4
1766 vpshufb $XTMP3,$XTMP4,$XTMP4
1767 vpxorq $XTMP4,$AAD_HASH,$AAD_HASH
1771 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
1772 vpshufb $XTMP3,$XTMP1,$XTMP1
1773 vpxorq $XTMP1,$AAD_HASH,$AAD_HASH
1778 jl .L_partial_incomplete_${label_suffix}
1781 # ;; GHASH computation for the last <16 Byte block
1782 &GHASH_MUL($AAD_HASH, $XTMP2, $XTMP5, $XTMP6, $XTMP7);
1785 movq \$0, ($PBLOCK_LEN)
1786 # ;; Set $LENGTH to be the number of bytes to write out
1790 jmp .L_enc_dec_done_${label_suffix}
1792 .L_partial_incomplete_${label_suffix}:
1796 mov $PLAIN_CIPH_LEN,$IA0
1797 add $IA0,($PBLOCK_LEN)
1800 $code .= "add $PLAIN_CIPH_LEN,($PBLOCK_LEN)\n";
1803 mov $PLAIN_CIPH_LEN,$LENGTH
1805 .L_enc_dec_done_${label_suffix}:
1806 # ;; output encrypted Bytes
1808 lea byte_len_to_mask_table(%rip),$IA0
1809 kmovw ($IA0,$LENGTH,2),$MASKREG
1810 vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)
1813 if ($ENC_DEC eq "ENC") {
1815 # ;; shuffle XTMP1 back to output as ciphertext
1816 vpshufb SHUF_MASK(%rip),$XTMP1,$XTMP1
1817 vpshufb $XTMP3,$XTMP1,$XTMP1
1821 mov $CIPH_PLAIN_OUT,$IA0
1822 vmovdqu8 $XTMP1,($IA0){$MASKREG}
1823 .L_partial_block_done_${label_suffix}:
1827 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1828 # ;; Ciphers 1 to 16 blocks and prepares them for later GHASH compute operation
1829 sub INITIAL_BLOCKS_PARTIAL_CIPHER {
# Emits assembly that CTR-encrypts/decrypts 1..16 blocks (possibly with a
# trailing partial block) and leaves the cipher text shuffled for a later
# GHASH pass. Flow: build counter blocks -> derive a load/store byte mask ->
# masked load of text -> AES rounds XORed with text -> masked store ->
# zero bytes outside the mask -> byte-shuffle the cipher text for GHASH.
# NOTE(review): this extract is lossy - some `} else {` / closing-brace and
# heredoc delimiter lines are not visible here; code below is kept verbatim.
1830 my $AES_KEYS = $_[0]; # [in] key pointer
1831 my $GCM128_CTX = $_[1]; # [in] context pointer
1832 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
1833 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
1834 my $LENGTH = $_[4]; # [in/clobbered] length in bytes
1835 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
1836 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
1837 my $CTR = $_[7]; # [in/out] current counter value
1838 my $ENC_DEC = $_[8]; # [in] cipher direction (ENC/DEC)
1839 my $DAT0 = $_[9]; # [out] ZMM with cipher text shuffled for GHASH
1840 my $DAT1 = $_[10]; # [out] ZMM with cipher text shuffled for GHASH
1841 my $DAT2 = $_[11]; # [out] ZMM with cipher text shuffled for GHASH
1842 my $DAT3 = $_[12]; # [out] ZMM with cipher text shuffled for GHASH
1843 my $LAST_CIPHER_BLK = $_[13]; # [out] XMM to put ciphered counter block partially xor'ed with text
1844 my $LAST_GHASH_BLK = $_[14]; # [out] XMM to put last cipher text block shuffled for GHASH
1845 my $CTR0 = $_[15]; # [clobbered] ZMM temporary
1846 my $CTR1 = $_[16]; # [clobbered] ZMM temporary
1847 my $CTR2 = $_[17]; # [clobbered] ZMM temporary
1848 my $CTR3 = $_[18]; # [clobbered] ZMM temporary
1849 my $ZT1 = $_[19]; # [clobbered] ZMM temporary
1850 my $IA0 = $_[20]; # [clobbered] GP temporary
1851 my $IA1 = $_[21]; # [clobbered] GP temporary
1852 my $MASKREG = $_[22]; # [clobbered] mask register
1853 my $SHUFMASK = $_[23]; # [out] ZMM loaded with BE/LE shuffle mask
# Load the byte-swap shuffle mask at the narrowest width that covers
# $NUM_BLOCKS (xmm for 1 block, ymm for 2, zmm otherwise).
1855 if ($NUM_BLOCKS == 1) {
1856 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[XWORD($SHUFMASK)]}\n";
1857 } elsif ($NUM_BLOCKS == 2) {
1858 $code .= "vmovdqa64 SHUF_MASK(%rip),@{[YWORD($SHUFMASK)]}\n";
1860 $code .= "vmovdqa64 SHUF_MASK(%rip),$SHUFMASK\n";
1863 # ;; prepare AES counter blocks
1864 if ($NUM_BLOCKS == 1) {
1865 $code .= "vpaddd ONE(%rip),$CTR,@{[XWORD($CTR0)]}\n";
1866 } elsif ($NUM_BLOCKS == 2) {
1868 vshufi64x2 \$0,@{[YWORD($CTR)]},@{[YWORD($CTR)]},@{[YWORD($CTR0)]}
1869 vpaddd ddq_add_1234(%rip),@{[YWORD($CTR0)]},@{[YWORD($CTR0)]}
# ;; broadcast the counter and add +1..+4, +5..+8, ... per 4-block lane group
1873 vshufi64x2 \$0,@{[ZWORD($CTR)]},@{[ZWORD($CTR)]},@{[ZWORD($CTR)]}
1874 vpaddd ddq_add_1234(%rip),@{[ZWORD($CTR)]},$CTR0
1876 if ($NUM_BLOCKS > 4) {
1877 $code .= "vpaddd ddq_add_5678(%rip),@{[ZWORD($CTR)]},$CTR1\n";
1879 if ($NUM_BLOCKS > 8) {
1880 $code .= "vpaddd ddq_add_8888(%rip),$CTR0,$CTR2\n";
1882 if ($NUM_BLOCKS > 12) {
1883 $code .= "vpaddd ddq_add_8888(%rip),$CTR1,$CTR3\n";
1887 # ;; get load/store mask
1889 lea byte64_len_to_mask_table(%rip),$IA0
# Adjust the table index so the mask applies to the last (partial) 64-byte
# chunk: subtract 64 bytes per full ZMM register already accounted for.
1892 if ($NUM_BLOCKS > 12) {
1893 $code .= "sub \$`3*64`,$IA1\n";
1894 } elsif ($NUM_BLOCKS > 8) {
1895 $code .= "sub \$`2*64`,$IA1\n";
1896 } elsif ($NUM_BLOCKS > 4) {
1897 $code .= "sub \$`1*64`,$IA1\n";
1899 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
1901 # ;; extract new counter value
1902 # ;; shuffle the counters for AES rounds
1903 if ($NUM_BLOCKS <= 4) {
1904 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$CTR\n";
1905 } elsif ($NUM_BLOCKS <= 8) {
1906 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$CTR\n";
1907 } elsif ($NUM_BLOCKS <= 12) {
1908 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$CTR\n";
1910 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$CTR\n";
# Byte-swap the counter blocks into big-endian form expected by AES.
1912 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1913 $NUM_BLOCKS, "vpshufb", $CTR0, $CTR1, $CTR2, $CTR3, $CTR0,
1914 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1916 # ;; load plain/cipher text
1917 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DAT0, $DAT1, $DAT2, $DAT3, $MASKREG);
1919 # ;; AES rounds and XOR with plain/cipher text
# One broadcast round key per iteration; $NROUNDS+2 iterations cover the
# initial whitening key, $NROUNDS middle rounds, and the final round.
1920 foreach my $j (0 .. ($NROUNDS + 1)) {
1921 $code .= "vbroadcastf64x2 `($j * 16)`($AES_KEYS),$ZT1\n";
1922 &ZMM_AESENC_ROUND_BLOCKS_0_16($CTR0, $CTR1, $CTR2, $CTR3, $ZT1, $j,
1923 $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $NROUNDS);
1926 # ;; retrieve the last cipher counter block (partially XOR'ed with text)
1927 # ;; - this is needed for partial block cases
1928 if ($NUM_BLOCKS <= 4) {
1929 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$CTR0,$LAST_CIPHER_BLK\n";
1930 } elsif ($NUM_BLOCKS <= 8) {
1931 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$CTR1,$LAST_CIPHER_BLK\n";
1932 } elsif ($NUM_BLOCKS <= 12) {
1933 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$CTR2,$LAST_CIPHER_BLK\n";
1935 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$CTR3,$LAST_CIPHER_BLK\n";
1938 # ;; write cipher/plain text back to output and
1939 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
1940 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $CTR0, $CTR1, $CTR2, $CTR3, $MASKREG);
1942 # ;; zero bytes outside the mask before hashing
# Only the register holding the final (possibly partial) blocks needs the
# zeroing masked move; earlier registers are fully populated.
1943 if ($NUM_BLOCKS <= 4) {
1944 $code .= "vmovdqu8 $CTR0,${CTR0}{$MASKREG}{z}\n";
1945 } elsif ($NUM_BLOCKS <= 8) {
1946 $code .= "vmovdqu8 $CTR1,${CTR1}{$MASKREG}{z}\n";
1947 } elsif ($NUM_BLOCKS <= 12) {
1948 $code .= "vmovdqu8 $CTR2,${CTR2}{$MASKREG}{z}\n";
1950 $code .= "vmovdqu8 $CTR3,${CTR3}{$MASKREG}{z}\n";
1953 # ;; Shuffle the cipher text blocks for hashing part
1954 # ;; ZT5 and ZT6 are expected outputs with blocks for hashing
# DEC: the loaded input ($DAT0-3) is the cipher text to hash.
# ENC: the just-produced output ($CTR0-3) is the cipher text to hash.
1955 if ($ENC_DEC eq "DEC") {
1958 # ;; - cipher blocks are in ZT5 & ZT6
1959 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1960 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $DAT0,
1961 $DAT1, $DAT2, $DAT3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1965 # ;; - cipher blocks are in CTR0-CTR3
1966 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
1967 $NUM_BLOCKS, "vpshufb", $DAT0, $DAT1, $DAT2, $DAT3, $CTR0,
1968 $CTR1, $CTR2, $CTR3, $SHUFMASK, $SHUFMASK, $SHUFMASK, $SHUFMASK);
1971 # ;; Extract the last block for partials and multi_call cases
1972 if ($NUM_BLOCKS <= 4) {
1973 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DAT0,$LAST_GHASH_BLK\n";
1974 } elsif ($NUM_BLOCKS <= 8) {
1975 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DAT1,$LAST_GHASH_BLK\n";
1976 } elsif ($NUM_BLOCKS <= 12) {
1977 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DAT2,$LAST_GHASH_BLK\n";
1979 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DAT3,$LAST_GHASH_BLK\n";
1984 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
1985 # ;; Computes GHASH on 1 to 16 blocks
1986 sub INITIAL_BLOCKS_PARTIAL_GHASH {
# ;; GHASH the 1..16 cipher-text blocks produced by INITIAL_BLOCKS_PARTIAL_CIPHER,
# ;; leaving the final hash value in $HASH_IN_OUT.
# ;; Two call shapes, dispatched on argument count (scalar(@_)) below:
# ;;   22 args - start a fresh GHASH computation
# ;;   25 args - additionally fold in previously accumulated hi/mid/lo
# ;;             product parts ($GH/$GM/$GL) and perform the reduction
# ;; The final block may be partial (init/update/finalize sequence); its
# ;; length is then recorded at [$PBLOCK_LEN] and it is hashed at finalize.
1987 my $AES_KEYS = $_[0]; # [in] key pointer
1988 my $GCM128_CTX = $_[1]; # [in] context pointer
1989 my $LENGTH = $_[2]; # [in/clobbered] length in bytes
1990 my $NUM_BLOCKS = $_[3]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
1991 my $HASH_IN_OUT = $_[4]; # [in/out] XMM ghash in/out value
1992 my $ENC_DEC = $_[5]; # [in] cipher direction (ENC/DEC)
1993 my $DAT0 = $_[6]; # [in] ZMM with cipher text shuffled for GHASH
1994 my $DAT1 = $_[7]; # [in] ZMM with cipher text shuffled for GHASH
1995 my $DAT2 = $_[8]; # [in] ZMM with cipher text shuffled for GHASH
1996 my $DAT3 = $_[9]; # [in] ZMM with cipher text shuffled for GHASH
1997 my $LAST_CIPHER_BLK = $_[10]; # [in] XMM with ciphered counter block partially xor'ed with text
1998 my $LAST_GHASH_BLK = $_[11]; # [in] XMM with last cipher text block shuffled for GHASH
1999 my $ZT0 = $_[12]; # [clobbered] ZMM temporary
2000 my $ZT1 = $_[13]; # [clobbered] ZMM temporary
2001 my $ZT2 = $_[14]; # [clobbered] ZMM temporary
2002 my $ZT3 = $_[15]; # [clobbered] ZMM temporary
2003 my $ZT4 = $_[16]; # [clobbered] ZMM temporary
2004 my $ZT5 = $_[17]; # [clobbered] ZMM temporary
2005 my $ZT6 = $_[18]; # [clobbered] ZMM temporary
2006 my $ZT7 = $_[19]; # [clobbered] ZMM temporary
2007 my $ZT8 = $_[20]; # [clobbered] ZMM temporary
2008 my $PBLOCK_LEN = $_[21]; # [in] partial block length
2009 my $GH = $_[22]; # [in] ZMM with hi product part
2010 my $GM = $_[23]; # [in] ZMM with mid product part
2011 my $GL = $_[24]; # [in] ZMM with lo product part
# ;; unique numeric suffix keeps emitted asm labels distinct per macro expansion
2013 my $label_suffix = $label_count++;
2015 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2016 # ;;; - Hash all but the last partial block of data
2017 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2019 # ;; update data offset
2020 if ($NUM_BLOCKS > 1) {
2022 # ;; The final block of data may be <16B
# ;; after this subtraction $LENGTH holds only the final block's byte count
2023 $code .= "sub \$16 * ($NUM_BLOCKS - 1),$LENGTH\n";
2026 if ($NUM_BLOCKS < 16) {
2028 # ;; NOTE: the 'jl' is always taken for num_initial_blocks = 16.
2029 # ;; This is run in the context of GCM_ENC_DEC_SMALL for length < 256.
2031 jl .L_small_initial_partial_block_${label_suffix}
2033 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2034 # ;;; Handle a full length final block - encrypt and hash all blocks
2035 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2038 movq \$0,($PBLOCK_LEN)
2041 # ;; Hash all of the data
# ;; 22 args => fresh GHASH; 25 args => continue with $GH/$GM/$GL folded in
2042 if (scalar(@_) == 22) {
2044 # ;; start GHASH compute
2045 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2046 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS);
2047 } elsif (scalar(@_) == 25) {
2049 # ;; continue GHASH compute
2050 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2051 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $NUM_BLOCKS, $GH, $GM, $GL);
2053 $code .= "jmp .L_small_initial_compute_done_${label_suffix}\n";
2057 .L_small_initial_partial_block_${label_suffix}:
2059 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2060 # ;;; Handle ghash for a <16B final block
2061 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2063 # ;; As it's an init / update / finalize series we need to leave the
2064 # ;; last block if it's less than a full block of data.
2066 mov $LENGTH,($PBLOCK_LEN)
2067 vmovdqu64 $LAST_CIPHER_BLK,$CTX_OFFSET_PEncBlock($GCM128_CTX)
# ;; hash only the NUM_BLOCKS-1 full blocks now; the trailing partial
# ;; block is deferred and hashed at finalize
2070 my $k = ($NUM_BLOCKS - 1);
2071 my $last_block_to_hash = 1;
2072 if (($NUM_BLOCKS > $last_block_to_hash)) {
2074 # ;; ZT12-ZT20 - temporary registers
2075 if (scalar(@_) == 22) {
2077 # ;; start GHASH compute
2078 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2079 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k);
2080 } elsif (scalar(@_) == 25) {
2082 # ;; continue GHASH compute
2083 &GHASH_1_TO_16($GCM128_CTX, $HASH_IN_OUT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
2084 $ZT5, $ZT6, $ZT7, $ZT8, &ZWORD($HASH_IN_OUT), $DAT0, $DAT1, $DAT2, $DAT3, $k, $GH, $GM, $GL);
2087 # ;; just fall through no jmp needed
# ;; NOTE(review): reached when only the single (partial) block remains -
# ;; a 25-arg call still has pending hi/mid/lo products to reduce here
2090 if (scalar(@_) == 25) {
2092 # ;; Reduction is required in this case.
2093 # ;; Integrate GM into GH and GL.
2094 vpsrldq \$8,$GM,$ZT0
2095 vpslldq \$8,$GM,$ZT1
2100 # ;; Add GH and GL 128-bit words horizontally
2101 &VHPXORI4x128($GH, $ZT0);
2102 &VHPXORI4x128($GL, $ZT1);
2104 # ;; 256-bit to 128-bit reduction
2105 $code .= "vmovdqa64 POLY2(%rip),@{[XWORD($ZT0)]}\n";
2106 &VCLMUL_REDUCE(&XWORD($HASH_IN_OUT), &XWORD($ZT0), &XWORD($GH), &XWORD($GL), &XWORD($ZT1), &XWORD($ZT2));
2109 # ;; Record that a reduction is not needed -
2110 # ;; In this case no hashes are computed because there
2111 # ;; is only one initial block and it is < 16B in length.
2112 # ;; We only need to check if a reduction is needed if
2113 # ;; initial_blocks == 1 and init/update/final is being used.
2114 # ;; In this case we may just have a partial block, and that
2115 # ;; gets hashed in finalize.
2117 # ;; The hash should end up in HASH_IN_OUT.
2118 # ;; The only way we should get here is if there is
2119 # ;; a partial block of data, so xor that into the hash.
2120 vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT
2121 # ;; The result is in $HASH_IN_OUT
2122 jmp .L_after_reduction_${label_suffix}
2126 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2127 # ;;; After GHASH reduction
2128 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2130 $code .= ".L_small_initial_compute_done_${label_suffix}:\n";
2132 # ;; If using init/update/finalize, we need to xor any partial block data
2134 if ($NUM_BLOCKS > 1) {
2136 # ;; NOTE: for $NUM_BLOCKS = 0 the xor never takes place
2137 if ($NUM_BLOCKS != 16) {
2139 # ;; NOTE: for $NUM_BLOCKS = 16, $LENGTH, stored in [PBlockLen] is never zero
2141 je .L_after_reduction_${label_suffix}
2144 $code .= "vpxorq $LAST_GHASH_BLK,$HASH_IN_OUT,$HASH_IN_OUT\n";
2147 $code .= ".L_after_reduction_${label_suffix}:\n";
2149 # ;; Final hash is now in HASH_IN_OUT
2152 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2153 # ;; INITIAL_BLOCKS_PARTIAL macro with support for a partial final block.
2154 # ;; It may look similar to INITIAL_BLOCKS but its usage is different:
2155 # ;; - first encrypts/decrypts required number of blocks and then
2156 # ;; ghashes these blocks
2157 # ;; - Small packets or left over data chunks (<256 bytes)
2158 # ;; - Remaining data chunks below 256 bytes (multi buffer code)
2160 # ;; num_initial_blocks is expected to include the partial final block
2162 sub INITIAL_BLOCKS_PARTIAL {
# ;; Encrypts/decrypts NUM_BLOCKS (1..16, the last one possibly partial)
# ;; and then GHASHes the resulting cipher text.  This is the small-packet /
# ;; left-over-data path (<256 bytes): cipher first, hash second.
2163 my $AES_KEYS = $_[0]; # [in] key pointer
2164 my $GCM128_CTX = $_[1]; # [in] context pointer
2165 my $CIPH_PLAIN_OUT = $_[2]; # [in] text output pointer
2166 my $PLAIN_CIPH_IN = $_[3]; # [in] text input pointer
2167 my $LENGTH = $_[4]; # [in/clobbered] length in bytes
2168 my $DATA_OFFSET = $_[5]; # [in/out] current data offset (updated)
2169 my $NUM_BLOCKS = $_[6]; # [in] can only be 1, 2, 3, 4, 5, ..., 15 or 16 (not 0)
2170 my $CTR = $_[7]; # [in/out] current counter value
2171 my $HASH_IN_OUT = $_[8]; # [in/out] XMM ghash in/out value
2172 my $ENC_DEC = $_[9]; # [in] cipher direction (ENC/DEC)
2173 my $CTR0 = $_[10]; # [clobbered] ZMM temporary
2174 my $CTR1 = $_[11]; # [clobbered] ZMM temporary
2175 my $CTR2 = $_[12]; # [clobbered] ZMM temporary
2176 my $CTR3 = $_[13]; # [clobbered] ZMM temporary
2177 my $DAT0 = $_[14]; # [clobbered] ZMM temporary
2178 my $DAT1 = $_[15]; # [clobbered] ZMM temporary
2179 my $DAT2 = $_[16]; # [clobbered] ZMM temporary
2180 my $DAT3 = $_[17]; # [clobbered] ZMM temporary
2181 my $LAST_CIPHER_BLK = $_[18]; # [clobbered] ZMM temporary
2182 my $LAST_GHASH_BLK = $_[19]; # [clobbered] ZMM temporary
2183 my $ZT0 = $_[20]; # [clobbered] ZMM temporary
2184 my $ZT1 = $_[21]; # [clobbered] ZMM temporary
2185 my $ZT2 = $_[22]; # [clobbered] ZMM temporary
2186 my $ZT3 = $_[23]; # [clobbered] ZMM temporary
2187 my $ZT4 = $_[24]; # [clobbered] ZMM temporary
2188 my $IA0 = $_[25]; # [clobbered] GP temporary
2189 my $IA1 = $_[26]; # [clobbered] GP temporary
2190 my $MASKREG = $_[27]; # [clobbered] mask register
2191 my $SHUFMASK = $_[28]; # [clobbered] ZMM for BE/LE shuffle mask
2192 my $PBLOCK_LEN = $_[29]; # [in] partial block length
# ;; step 1: CTR encrypt/decrypt the blocks; also yields (as XMM) the last
# ;; cipher counter block and the last GHASH-shuffled block, which step 2
# ;; needs for partial-block handling
2194 &INITIAL_BLOCKS_PARTIAL_CIPHER(
2195 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
2196 $LENGTH, $DATA_OFFSET, $NUM_BLOCKS, $CTR,
2197 $ENC_DEC, $DAT0, $DAT1, $DAT2,
2198 $DAT3, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK), $CTR0,
2199 $CTR1, $CTR2, $CTR3, $ZT0,
2200 $IA0, $IA1, $MASKREG, $SHUFMASK);
# ;; step 2: GHASH the cipher text (22-arg form => start a fresh GHASH)
2202 &INITIAL_BLOCKS_PARTIAL_GHASH($AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS, $HASH_IN_OUT, $ENC_DEC, $DAT0,
2203 $DAT1, $DAT2, $DAT3, &XWORD($LAST_CIPHER_BLK),
2204 &XWORD($LAST_GHASH_BLK), $CTR0, $CTR1, $CTR2, $CTR3, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $PBLOCK_LEN);
2207 # ;; ===========================================================================
2208 # ;; Stitched GHASH of 16 blocks (with reduction) with encryption of N blocks
2209 # ;; followed with GHASH of the N blocks.
2210 sub GHASH_16_ENCRYPT_N_GHASH_N {
# ;; Stitches three operations into one pass over the AES rounds:
# ;;   (1) GHASH of 16 previously stored blocks (at GHASHIN_BLK_OFFSET on
# ;;       the stack), optionally completing the reduction,
# ;;   (2) AES-CTR encryption of the next NUM_BLOCKS (1..16) blocks,
# ;;   (3) GHASH of those N blocks via INITIAL_BLOCKS_PARTIAL_GHASH.
# ;; $GHASH_TYPE ("start"/"start_reduce"/"mid"/"end_reduce") selects whether
# ;; the 16-block GHASH starts fresh and/or ends with a reduction (it drives
# ;; $is_start/$do_reduction below).
2211 my $AES_KEYS = $_[0]; # [in] key pointer
2212 my $GCM128_CTX = $_[1]; # [in] context pointer
2213 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
2214 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
2215 my $DATA_OFFSET = $_[4]; # [in] data offset
2216 my $LENGTH = $_[5]; # [in] data length
2217 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
2218 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
2219 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
2220 # (can be in form of register or numerical value)
2221 my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in
2222 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
2223 my $B00_03 = $_[11]; # [clobbered] temporary ZMM
2224 my $B04_07 = $_[12]; # [clobbered] temporary ZMM
2225 my $B08_11 = $_[13]; # [clobbered] temporary ZMM
2226 my $B12_15 = $_[14]; # [clobbered] temporary ZMM
2227 my $GH1H_UNUSED = $_[15]; # [clobbered] temporary ZMM
2228 my $GH1L = $_[16]; # [clobbered] temporary ZMM
2229 my $GH1M = $_[17]; # [clobbered] temporary ZMM
2230 my $GH1T = $_[18]; # [clobbered] temporary ZMM
2231 my $GH2H = $_[19]; # [clobbered] temporary ZMM
2232 my $GH2L = $_[20]; # [clobbered] temporary ZMM
2233 my $GH2M = $_[21]; # [clobbered] temporary ZMM
2234 my $GH2T = $_[22]; # [clobbered] temporary ZMM
2235 my $GH3H = $_[23]; # [clobbered] temporary ZMM
2236 my $GH3L = $_[24]; # [clobbered] temporary ZMM
2237 my $GH3M = $_[25]; # [clobbered] temporary ZMM
2238 my $GH3T = $_[26]; # [clobbered] temporary ZMM
2239 my $AESKEY1 = $_[27]; # [clobbered] temporary ZMM
2240 my $AESKEY2 = $_[28]; # [clobbered] temporary ZMM
2241 my $GHKEY1 = $_[29]; # [clobbered] temporary ZMM
2242 my $GHKEY2 = $_[30]; # [clobbered] temporary ZMM
2243 my $GHDAT1 = $_[31]; # [clobbered] temporary ZMM
2244 my $GHDAT2 = $_[32]; # [clobbered] temporary ZMM
2245 my $ZT01 = $_[33]; # [clobbered] temporary ZMM
2246 my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian
2247 my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
2248 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
2249 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
2250 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
2251 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
2252 my $ENC_DEC = $_[40]; # [in] cipher direction
2253 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
2254 my $IA0 = $_[42]; # [clobbered] GP temporary
2255 my $IA1 = $_[43]; # [clobbered] GP temporary
2256 my $MASKREG = $_[44]; # [clobbered] mask register
2257 my $NUM_BLOCKS = $_[45]; # [in] numerical value with number of blocks to be encrypted/ghashed (1 to 16)
2258 my $PBLOCK_LEN = $_[46]; # [in] partial block length
# ;; sanity check at code-generation time (perl), not at runtime
2260 die "GHASH_16_ENCRYPT_N_GHASH_N: num_blocks is out of bounds = $NUM_BLOCKS\n"
2261 if ($NUM_BLOCKS > 16 || $NUM_BLOCKS < 0);
2263 my $label_suffix = $label_count++;
# ;; register-rename trick: the reduced hash lands directly in $HASH_IN_OUT
2265 my $GH1H = $HASH_IN_OUT;
2267 # ; this is to avoid additional move in do_reduction case
2269 my $LAST_GHASH_BLK = $GH1L;
2270 my $LAST_CIPHER_BLK = $GH1T;
2272 my $RED_POLY = $GH2T;
2282 # ;; do reduction after the 16 blocks ?
2283 my $do_reduction = 0;
2285 # ;; is 16 block chunk a start?
2288 if ($GHASH_TYPE eq "start_reduce") {
2293 if ($GHASH_TYPE eq "start") {
2297 if ($GHASH_TYPE eq "end_reduce") {
2301 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2302 # ;; - get load/store mask
2303 # ;; - load plain/cipher text
2304 # ;; get load/store mask
2306 lea byte64_len_to_mask_table(%rip),$IA0
# ;; NOTE(review): $IA1 presumably holds the remaining byte count here
# ;; (set on a line outside this view); reduce it to the last <=64B chunk
# ;; before indexing the mask table
2309 if ($NUM_BLOCKS > 12) {
2310 $code .= "sub \$`3*64`,$IA1\n";
2311 } elsif ($NUM_BLOCKS > 8) {
2312 $code .= "sub \$`2*64`,$IA1\n";
2313 } elsif ($NUM_BLOCKS > 4) {
2314 $code .= "sub \$`1*64`,$IA1\n";
2316 $code .= "kmovq ($IA0,$IA1,8),$MASKREG\n";
2318 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2319 # ;; prepare counter blocks
2322 cmp \$`(256 - $NUM_BLOCKS)`,@{[DWORD($CTR_CHECK)]}
2323 jae .L_16_blocks_overflow_${label_suffix}
2326 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2327 $NUM_BLOCKS, "vpaddd", $B00_03, $B04_07, $B08_11, $B12_15, $CTR_BE,
2328 $B00_03, $B04_07, $B08_11, $ADDBE_1234, $ADDBE_4x4, $ADDBE_4x4, $ADDBE_4x4);
2330 jmp .L_16_blocks_ok_${label_suffix}
2332 .L_16_blocks_overflow_${label_suffix}:
2333 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
2334 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
2336 if ($NUM_BLOCKS > 4) {
2338 vmovdqa64 ddq_add_4444(%rip),$B12_15
2339 vpaddd $B12_15,$B00_03,$B04_07
2342 if ($NUM_BLOCKS > 8) {
2343 $code .= "vpaddd $B12_15,$B04_07,$B08_11\n";
2345 if ($NUM_BLOCKS > 12) {
2346 $code .= "vpaddd $B12_15,$B08_11,$B12_15\n";
2348 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2349 $NUM_BLOCKS, "vpshufb", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2350 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2352 .L_16_blocks_ok_${label_suffix}:
2354 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2355 # ;; - pre-load constants
2356 # ;; - add current hash into the 1st block
2357 vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1
# ;; at the start of a 16-block chunk the current hash value is folded
# ;; (xor'ed) into the first stored GHASH block
2359 if ($is_start != 0) {
2360 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$HASH_IN_OUT,$GHDAT1\n";
2362 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
2365 $code .= "vmovdqu64 @{[EffectiveAddress(\"%rsp\",$HASHKEY_OFFSET,0*64)]},$GHKEY1\n";
2367 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2368 # ;; save counter for the next round
2369 # ;; increment counter overflow check register
2370 if ($NUM_BLOCKS <= 4) {
2371 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($CTR_BE)]}\n";
2372 } elsif ($NUM_BLOCKS <= 8) {
2373 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($CTR_BE)]}\n";
2374 } elsif ($NUM_BLOCKS <= 12) {
2375 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($CTR_BE)]}\n";
2377 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($CTR_BE)]}\n";
2379 $code .= "vshufi64x2 \$0b00000000,$CTR_BE,$CTR_BE,$CTR_BE\n";
2382 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2383 # ;; pre-load constants
2384 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
2385 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,1*64)]},$GHKEY2
2386 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
2389 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2390 # ;; stitch AES rounds with GHASH
2392 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2393 # ;; AES round 0 - ARK
2395 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2396 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2397 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2398 $code .= "vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1\n";
2401 # ;;==================================================
2402 # ;; GHASH 4 blocks (15 to 12)
2403 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
2404 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
2405 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
2406 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
2407 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,2*64)]},$GHKEY1
2408 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
2411 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2413 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2414 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2415 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2416 $code .= "vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2\n";
2419 # ;; =================================================
2420 # ;; GHASH 4 blocks (11 to 8)
2421 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
2422 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
2423 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
2424 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
2425 vmovdqu64 @{[EffectiveAddress("%rsp",$HASHKEY_OFFSET,3*64)]},$GHKEY2
2426 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
2429 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2431 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2432 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2433 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2434 $code .= "vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1\n";
2437 # ;; =================================================
2438 # ;; GHASH 4 blocks (7 to 4)
2439 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
2440 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
2441 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
2442 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
2445 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2447 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2448 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2449 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2450 $code .= "vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2\n";
2453 # ;; =================================================
2454 # ;; Gather (XOR) GHASH for 12 blocks
2455 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
2456 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
2457 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
2458 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
2461 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2463 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2464 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2465 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2466 $code .= "vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1\n";
2468 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2469 # ;; load plain/cipher text
2470 &ZMM_LOAD_MASKED_BLOCKS_0_16($NUM_BLOCKS, $PLAIN_CIPH_IN, $DATA_OFFSET, $DATA1, $DATA2, $DATA3, $DATA4, $MASKREG);
2472 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2474 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2475 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2476 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2477 $code .= "vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2\n";
2480 # ;; =================================================
2481 # ;; GHASH 4 blocks (3 to 0)
2482 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
2483 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
2484 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
2485 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
2488 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2490 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2491 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2492 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2493 $code .= "vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1\n";
2495 # ;; =================================================
2496 # ;; gather GHASH in GH1L (low), GH1H (high), GH1M (mid)
2497 # ;; - add GH2[MTLH] to GH1[MTLH]
2498 $code .= "vpternlogq \$0x96,$GH2T,$GH1T,$GH1M\n";
# ;; reducing this chunk: merge last 4-block products (and any carried
# ;; TO_REDUCE_* sums when not at a chunk start) into GH1H/GH1L/GH1M
2499 if ($do_reduction != 0) {
2501 if ($is_start != 0) {
2502 $code .= "vpxorq $GH2M,$GH1M,$GH1M\n";
2505 vpternlogq \$0x96,$GH2H,$TO_REDUCE_H,$GH1H
2506 vpternlogq \$0x96,$GH2L,$TO_REDUCE_L,$GH1L
2507 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
2513 # ;; Update H/M/L hash sums if not carrying reduction
2514 if ($is_start != 0) {
2516 vpxorq $GH2H,$GH1H,$TO_REDUCE_H
2517 vpxorq $GH2L,$GH1L,$TO_REDUCE_L
2518 vpxorq $GH2M,$GH1M,$TO_REDUCE_M
2522 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H
2523 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L
2524 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M
2530 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2532 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2533 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2534 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2535 $code .= "vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2\n";
2537 # ;; =================================================
2538 # ;; prepare mid sum for adding to high & low
2539 # ;; load polynomial constant for reduction
2540 if ($do_reduction != 0) {
2542 vpsrldq \$8,$GH1M,$GH2M
2543 vpslldq \$8,$GH1M,$GH1M
2545 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
2549 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2551 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2552 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2553 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2554 $code .= "vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1\n";
2556 # ;; =================================================
2557 # ;; Add mid product to high and low
2558 if ($do_reduction != 0) {
2559 if ($is_start != 0) {
2561 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
2562 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
2566 vpxorq $GH2M,$GH1H,$GH1H # ; TH = TH1 + TM>>64
2567 vpxorq $GH1M,$GH1L,$GH1L # ; TL = TL1 + TM<<64
2572 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2574 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2575 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2576 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2578 # ;; =================================================
2579 # ;; horizontal xor of low and high 4x128
2580 if ($do_reduction != 0) {
2581 &VHPXORI4x128($GH1H, $GH2H);
2582 &VHPXORI4x128($GH1L, $GH2L);
# ;; AES192 (NROUNDS==11) and AES256 (NROUNDS==13) need the extra keys below
2585 if (($NROUNDS >= 11)) {
2586 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
2589 # ;; =================================================
2590 # ;; first phase of reduction
2591 if ($do_reduction != 0) {
2593 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
2594 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
2595 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduct
2599 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2600 # ;; AES rounds up to 11 (AES192) or 13 (AES256)
2602 if (($NROUNDS >= 11)) {
2603 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2604 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2605 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2606 $code .= "vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1\n";
2608 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2609 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2610 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2611 if (($NROUNDS == 13)) {
2612 $code .= "vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2\n";
2614 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2615 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2616 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2617 $code .= "vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1\n";
2619 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2620 $NUM_BLOCKS, "vaesenc", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2621 $B04_07, $B08_11, $B12_15, $AESKEY2, $AESKEY2, $AESKEY2, $AESKEY2);
2625 # ;; =================================================
2626 # ;; second phase of the reduction
2627 if ($do_reduction != 0) {
2629 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
2630 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
2631 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
2632 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
2633 # ;; GH1H = GH1H + RED_T1 + RED_T2
2634 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
2638 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2639 # ;; the last AES round
2640 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2641 $NUM_BLOCKS, "vaesenclast", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2642 $B04_07, $B08_11, $B12_15, $AESKEY1, $AESKEY1, $AESKEY1, $AESKEY1);
2644 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2645 # ;; XOR against plain/cipher text
2646 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2647 $NUM_BLOCKS, "vpxorq", $B00_03, $B04_07, $B08_11, $B12_15, $B00_03,
2648 $B04_07, $B08_11, $B12_15, $DATA1, $DATA2, $DATA3, $DATA4);
2650 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2651 # ;; retrieve the last cipher counter block (partially XOR'ed with text)
2652 # ;; - this is needed for partial block cases
2653 if ($NUM_BLOCKS <= 4) {
2654 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 1)`,$B00_03,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2655 } elsif ($NUM_BLOCKS <= 8) {
2656 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 5)`,$B04_07,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2657 } elsif ($NUM_BLOCKS <= 12) {
2658 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 9)`,$B08_11,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2660 $code .= "vextracti32x4 \$`($NUM_BLOCKS - 13)`,$B12_15,@{[XWORD($LAST_CIPHER_BLK)]}\n";
2663 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2664 # ;; store cipher/plain text
2665 $code .= "mov $CIPH_PLAIN_OUT,$IA0\n";
2666 &ZMM_STORE_MASKED_BLOCKS_0_16($NUM_BLOCKS, $IA0, $DATA_OFFSET, $B00_03, $B04_07, $B08_11, $B12_15, $MASKREG);
2668 # ;; =================================================
2669 # ;; shuffle cipher text blocks for GHASH computation
# ;; ENC: hash the just-produced cipher text (B00_03..B12_15);
# ;; DEC: hash the loaded input (DATA1..DATA4) - it is the cipher text
2670 if ($ENC_DEC eq "ENC") {
2672 # ;; zero bytes outside the mask before hashing
2673 if ($NUM_BLOCKS <= 4) {
2674 $code .= "vmovdqu8 $B00_03,${B00_03}{$MASKREG}{z}\n";
2675 } elsif ($NUM_BLOCKS <= 8) {
2676 $code .= "vmovdqu8 $B04_07,${B04_07}{$MASKREG}{z}\n";
2677 } elsif ($NUM_BLOCKS <= 12) {
2678 $code .= "vmovdqu8 $B08_11,${B08_11}{$MASKREG}{z}\n";
2680 $code .= "vmovdqu8 $B12_15,${B12_15}{$MASKREG}{z}\n";
2683 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2684 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $B00_03,
2685 $B04_07, $B08_11, $B12_15, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2688 # ;; zero bytes outside the mask before hashing
2689 if ($NUM_BLOCKS <= 4) {
2690 $code .= "vmovdqu8 $DATA1,${DATA1}{$MASKREG}{z}\n";
2691 } elsif ($NUM_BLOCKS <= 8) {
2692 $code .= "vmovdqu8 $DATA2,${DATA2}{$MASKREG}{z}\n";
2693 } elsif ($NUM_BLOCKS <= 12) {
2694 $code .= "vmovdqu8 $DATA3,${DATA3}{$MASKREG}{z}\n";
2696 $code .= "vmovdqu8 $DATA4,${DATA4}{$MASKREG}{z}\n";
2699 &ZMM_OPCODE3_DSTR_SRC1R_SRC2R_BLOCKS_0_16(
2700 $NUM_BLOCKS, "vpshufb", $DATA1, $DATA2, $DATA3, $DATA4, $DATA1,
2701 $DATA2, $DATA3, $DATA4, $SHFMSK, $SHFMSK, $SHFMSK, $SHFMSK);
2704 # ;; =================================================
2705 # ;; Extract the last block for partial / multi_call cases
2706 if ($NUM_BLOCKS <= 4) {
2707 $code .= "vextracti32x4 \$`($NUM_BLOCKS-1)`,$DATA1,@{[XWORD($LAST_GHASH_BLK)]}\n";
2708 } elsif ($NUM_BLOCKS <= 8) {
2709 $code .= "vextracti32x4 \$`($NUM_BLOCKS-5)`,$DATA2,@{[XWORD($LAST_GHASH_BLK)]}\n";
2710 } elsif ($NUM_BLOCKS <= 12) {
2711 $code .= "vextracti32x4 \$`($NUM_BLOCKS-9)`,$DATA3,@{[XWORD($LAST_GHASH_BLK)]}\n";
2713 $code .= "vextracti32x4 \$`($NUM_BLOCKS-13)`,$DATA4,@{[XWORD($LAST_GHASH_BLK)]}\n";
2716 if ($do_reduction != 0) {
2718 # ;; GH1H holds reduced hash value
2719 # ;; - normally do "vmovdqa64 &XWORD($GH1H), &XWORD($HASH_IN_OUT)"
2720 # ;; - register rename trick obsoletes the above move
2723 # ;; =================================================
2724 # ;; GHASH last N blocks
2725 # ;; - current hash value in HASH_IN_OUT or
2726 # ;; product parts in TO_REDUCE_H/M/L
2727 # ;; - DATA1-DATA4 include blocks for GHASH
# ;; no reduction done above => pass pending TO_REDUCE_H/M/L products via
# ;; the 25-arg form; otherwise the reduced hash is already in HASH_IN_OUT
# ;; and the 22-arg form is used
2729 if ($do_reduction == 0) {
2730 &INITIAL_BLOCKS_PARTIAL_GHASH(
2731 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
2732 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
2733 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
2734 $B00_03, $B04_07, $B08_11, $B12_15,
2735 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
2736 $GHKEY1, $PBLOCK_LEN, $TO_REDUCE_H, $TO_REDUCE_M,
2739 &INITIAL_BLOCKS_PARTIAL_GHASH(
2740 $AES_KEYS, $GCM128_CTX, $LENGTH, $NUM_BLOCKS,
2741 &XWORD($HASH_IN_OUT), $ENC_DEC, $DATA1, $DATA2,
2742 $DATA3, $DATA4, &XWORD($LAST_CIPHER_BLK), &XWORD($LAST_GHASH_BLK),
2743 $B00_03, $B04_07, $B08_11, $B12_15,
2744 $GHDAT1, $GHDAT2, $AESKEY1, $AESKEY2,
2745 $GHKEY1, $PBLOCK_LEN);
2749 # ;; ===========================================================================
2750 # ;; ===========================================================================
2751 # ;; GCM_ENC_DEC_LAST - encrypt & GHASH the final 1..16-block chunk:
2752 # ;; derives the remaining block count from LENGTH (rounded up to whole
# ;; blocks) and dispatches to the stitched GHASH/encrypt path for that count.
2753 sub GCM_ENC_DEC_LAST {
# ;; Emits dispatch code for the final 1..16 blocks of the message:
# ;; computes the remaining block count from LENGTH and jumps to a
# ;; specialized GHASH_16_ENCRYPT_N_GHASH_N variant (one is generated
# ;; below for each N = 1..16), or to a GHASH-only tail when no cipher
# ;; blocks remain.
2754 my $AES_KEYS = $_[0]; # [in] key pointer
2755 my $GCM128_CTX = $_[1]; # [in] context pointer
2756 my $CIPH_PLAIN_OUT = $_[2]; # [in] pointer to output buffer
2757 my $PLAIN_CIPH_IN = $_[3]; # [in] pointer to input buffer
2758 my $DATA_OFFSET = $_[4]; # [in] data offset
2759 my $LENGTH = $_[5]; # [in/clobbered] data length
2760 my $CTR_BE = $_[6]; # [in/out] ZMM counter blocks (last 4) in big-endian
2761 my $CTR_CHECK = $_[7]; # [in/out] GP with 8-bit counter for overflow check
2762 my $HASHKEY_OFFSET = $_[8]; # [in] numerical offset for the highest hash key
2763 # (can be register or numerical offset)
2764 my $GHASHIN_BLK_OFFSET = $_[9]; # [in] numerical offset for GHASH blocks in
2765 my $SHFMSK = $_[10]; # [in] ZMM with byte swap mask for pshufb
2766 my $ZT00 = $_[11]; # [clobbered] temporary ZMM
2767 my $ZT01 = $_[12]; # [clobbered] temporary ZMM
2768 my $ZT02 = $_[13]; # [clobbered] temporary ZMM
2769 my $ZT03 = $_[14]; # [clobbered] temporary ZMM
2770 my $ZT04 = $_[15]; # [clobbered] temporary ZMM
2771 my $ZT05 = $_[16]; # [clobbered] temporary ZMM
2772 my $ZT06 = $_[17]; # [clobbered] temporary ZMM
2773 my $ZT07 = $_[18]; # [clobbered] temporary ZMM
2774 my $ZT08 = $_[19]; # [clobbered] temporary ZMM
2775 my $ZT09 = $_[20]; # [clobbered] temporary ZMM
2776 my $ZT10 = $_[21]; # [clobbered] temporary ZMM
2777 my $ZT11 = $_[22]; # [clobbered] temporary ZMM
2778 my $ZT12 = $_[23]; # [clobbered] temporary ZMM
2779 my $ZT13 = $_[24]; # [clobbered] temporary ZMM
2780 my $ZT14 = $_[25]; # [clobbered] temporary ZMM
2781 my $ZT15 = $_[26]; # [clobbered] temporary ZMM
2782 my $ZT16 = $_[27]; # [clobbered] temporary ZMM
2783 my $ZT17 = $_[28]; # [clobbered] temporary ZMM
2784 my $ZT18 = $_[29]; # [clobbered] temporary ZMM
2785 my $ZT19 = $_[30]; # [clobbered] temporary ZMM
2786 my $ZT20 = $_[31]; # [clobbered] temporary ZMM
2787 my $ZT21 = $_[32]; # [clobbered] temporary ZMM
2788 my $ZT22 = $_[33]; # [clobbered] temporary ZMM
2789 my $ADDBE_4x4 = $_[34]; # [in] ZMM with 4x128bits 4 in big-endian
2790 my $ADDBE_1234 = $_[35]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
2791 my $GHASH_TYPE = $_[36]; # [in] "start", "start_reduce", "mid", "end_reduce"
2792 my $TO_REDUCE_L = $_[37]; # [in] ZMM for low 4x128-bit GHASH sum
2793 my $TO_REDUCE_H = $_[38]; # [in] ZMM for hi 4x128-bit GHASH sum
2794 my $TO_REDUCE_M = $_[39]; # [in] ZMM for medium 4x128-bit GHASH sum
2795 my $ENC_DEC = $_[40]; # [in] cipher direction
2796 my $HASH_IN_OUT = $_[41]; # [in/out] XMM ghash in/out value
2797 my $IA0 = $_[42]; # [clobbered] GP temporary
2798 my $IA1 = $_[43]; # [clobbered] GP temporary
2799 my $MASKREG = $_[44]; # [clobbered] mask register
2800 my $PBLOCK_LEN = $_[45]; # [in] partial block length
2802 my $label_suffix = $label_count++;
# ;; IA0 = ceil(LENGTH / 16) = number of blocks left to process;
# ;; ZF from the 'shr' feeds the zero-blocks branch below
2805 mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
2806 add \$15,@{[DWORD($IA0)]}
2807 shr \$4,@{[DWORD($IA0)]}
2808 je .L_last_num_blocks_is_0_${label_suffix}
# ;; binary-search style compare chain on the block count (1..16)
2810 cmp \$8,@{[DWORD($IA0)]}
2811 je .L_last_num_blocks_is_8_${label_suffix}
2812 jb .L_last_num_blocks_is_7_1_${label_suffix}
2815 cmp \$12,@{[DWORD($IA0)]}
2816 je .L_last_num_blocks_is_12_${label_suffix}
2817 jb .L_last_num_blocks_is_11_9_${label_suffix}
2819 # ;; 16, 15, 14 or 13
2820 cmp \$15,@{[DWORD($IA0)]}
2821 je .L_last_num_blocks_is_15_${label_suffix}
2822 ja .L_last_num_blocks_is_16_${label_suffix}
2823 cmp \$14,@{[DWORD($IA0)]}
2824 je .L_last_num_blocks_is_14_${label_suffix}
2825 jmp .L_last_num_blocks_is_13_${label_suffix}
2827 .L_last_num_blocks_is_11_9_${label_suffix}:
2829 cmp \$10,@{[DWORD($IA0)]}
2830 je .L_last_num_blocks_is_10_${label_suffix}
2831 ja .L_last_num_blocks_is_11_${label_suffix}
2832 jmp .L_last_num_blocks_is_9_${label_suffix}
2834 .L_last_num_blocks_is_7_1_${label_suffix}:
2835 cmp \$4,@{[DWORD($IA0)]}
2836 je .L_last_num_blocks_is_4_${label_suffix}
2837 jb .L_last_num_blocks_is_3_1_${label_suffix}
2839 cmp \$6,@{[DWORD($IA0)]}
2840 ja .L_last_num_blocks_is_7_${label_suffix}
2841 je .L_last_num_blocks_is_6_${label_suffix}
2842 jmp .L_last_num_blocks_is_5_${label_suffix}
2844 .L_last_num_blocks_is_3_1_${label_suffix}:
2846 cmp \$2,@{[DWORD($IA0)]}
2847 ja .L_last_num_blocks_is_3_${label_suffix}
2848 je .L_last_num_blocks_is_2_${label_suffix}
2851 # ;; fall through for `jmp .L_last_num_blocks_is_1`
2853 # ;; Use rep to generate different block size variants
2854 # ;; - one block size has to be the first one
# ;; Perl-time loop: emits 16 specialized code paths, each followed by a
# ;; jump to the common "done" label
2855 for my $num_blocks (1 .. 16) {
2856 $code .= ".L_last_num_blocks_is_${num_blocks}_${label_suffix}:\n";
2857 &GHASH_16_ENCRYPT_N_GHASH_N(
2858 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET,
2859 $LENGTH, $CTR_BE, $CTR_CHECK, $HASHKEY_OFFSET, $GHASHIN_BLK_OFFSET,
2860 $SHFMSK, $ZT00, $ZT01, $ZT02, $ZT03,
2861 $ZT04, $ZT05, $ZT06, $ZT07, $ZT08,
2862 $ZT09, $ZT10, $ZT11, $ZT12, $ZT13,
2863 $ZT14, $ZT15, $ZT16, $ZT17, $ZT18,
2864 $ZT19, $ZT20, $ZT21, $ZT22, $ADDBE_4x4,
2865 $ADDBE_1234, $GHASH_TYPE, $TO_REDUCE_L, $TO_REDUCE_H, $TO_REDUCE_M,
2866 $ENC_DEC, $HASH_IN_OUT, $IA0, $IA1, $MASKREG,
2867 $num_blocks, $PBLOCK_LEN);
2869 $code .= "jmp .L_last_blocks_done_${label_suffix}\n";
2872 $code .= ".L_last_num_blocks_is_0_${label_suffix}:\n";
2874 # ;; if there are 0 blocks to cipher then there are only 16 blocks for ghash and reduction
2875 # ;; - convert mid into end_reduce
2876 # ;; - convert start into start_reduce
# ;; note: GHASH_TYPE rewriting happens at code-generation (Perl) time,
# ;; selecting which GHASH_16 variant gets emitted for this path
2877 if ($GHASH_TYPE eq "mid") {
2878 $GHASH_TYPE = "end_reduce";
2880 if ($GHASH_TYPE eq "start") {
2881 $GHASH_TYPE = "start_reduce";
2884 &GHASH_16($GHASH_TYPE, $TO_REDUCE_H, $TO_REDUCE_M, $TO_REDUCE_L, "%rsp",
2885 $GHASHIN_BLK_OFFSET, 0, "%rsp", $HASHKEY_OFFSET, 0, $HASH_IN_OUT, $ZT00, $ZT01,
2886 $ZT02, $ZT03, $ZT04, $ZT05, $ZT06, $ZT07, $ZT08, $ZT09);
2888 $code .= ".L_last_blocks_done_${label_suffix}:\n";
2891 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2892 # ;; Main GCM macro stitching cipher with GHASH
2893 # ;; - operates on single stream
2894 # ;; - encrypts 16 blocks at a time
2895 # ;; - ghash the 16 previously encrypted ciphertext blocks
2896 # ;; - no partial block or multi_call handling here
2897 sub GHASH_16_ENCRYPT_16_PARALLEL {
# ;; Emits one fully-unrolled iteration of the main loop: AES-CTR over 16
# ;; blocks interleaved instruction-by-instruction with GHASH over 16
# ;; previously stored ciphertext blocks. Partial products are accumulated
# ;; into TO_REDUCE_{L,M,H}; the polynomial reduction is performed inline
# ;; only when $DO_REDUCTION is "final_reduction".
2898 my $AES_KEYS = $_[0]; # [in] key pointer
2899 my $CIPH_PLAIN_OUT = $_[1]; # [in] pointer to output buffer
2900 my $PLAIN_CIPH_IN = $_[2]; # [in] pointer to input buffer
2901 my $DATA_OFFSET = $_[3]; # [in] data offset
2902 my $CTR_BE = $_[4]; # [in/out] ZMM counter blocks (last 4) in big-endian
2903 my $CTR_CHECK = $_[5]; # [in/out] GP with 8-bit counter for overflow check
2904 my $HASHKEY_OFFSET = $_[6]; # [in] numerical offset for the highest hash key (hash key index value)
2905 my $AESOUT_BLK_OFFSET = $_[7]; # [in] numerical offset for AES-CTR out
2906 my $GHASHIN_BLK_OFFSET = $_[8]; # [in] numerical offset for GHASH blocks in
2907 my $SHFMSK = $_[9]; # [in] ZMM with byte swap mask for pshufb
2908 my $ZT1 = $_[10]; # [clobbered] temporary ZMM (cipher)
2909 my $ZT2 = $_[11]; # [clobbered] temporary ZMM (cipher)
2910 my $ZT3 = $_[12]; # [clobbered] temporary ZMM (cipher)
2911 my $ZT4 = $_[13]; # [clobbered] temporary ZMM (cipher)
2912 my $ZT5 = $_[14]; # [clobbered/out] temporary ZMM or GHASH OUT (final_reduction)
2913 my $ZT6 = $_[15]; # [clobbered] temporary ZMM (cipher)
2914 my $ZT7 = $_[16]; # [clobbered] temporary ZMM (cipher)
2915 my $ZT8 = $_[17]; # [clobbered] temporary ZMM (cipher)
2916 my $ZT9 = $_[18]; # [clobbered] temporary ZMM (cipher)
2917 my $ZT10 = $_[19]; # [clobbered] temporary ZMM (ghash)
2918 my $ZT11 = $_[20]; # [clobbered] temporary ZMM (ghash)
2919 my $ZT12 = $_[21]; # [clobbered] temporary ZMM (ghash)
2920 my $ZT13 = $_[22]; # [clobbered] temporary ZMM (ghash)
2921 my $ZT14 = $_[23]; # [clobbered] temporary ZMM (ghash)
2922 my $ZT15 = $_[24]; # [clobbered] temporary ZMM (ghash)
2923 my $ZT16 = $_[25]; # [clobbered] temporary ZMM (ghash)
2924 my $ZT17 = $_[26]; # [clobbered] temporary ZMM (ghash)
2925 my $ZT18 = $_[27]; # [clobbered] temporary ZMM (ghash)
2926 my $ZT19 = $_[28]; # [clobbered] temporary ZMM
2927 my $ZT20 = $_[29]; # [clobbered] temporary ZMM
2928 my $ZT21 = $_[30]; # [clobbered] temporary ZMM
2929 my $ZT22 = $_[31]; # [clobbered] temporary ZMM
2930 my $ZT23 = $_[32]; # [clobbered] temporary ZMM
2931 my $ADDBE_4x4 = $_[33]; # [in] ZMM with 4x128bits 4 in big-endian
2932 my $ADDBE_1234 = $_[34]; # [in] ZMM with 4x128bits 1, 2, 3 and 4 in big-endian
2933 my $TO_REDUCE_L = $_[35]; # [in/out] ZMM for low 4x128-bit GHASH sum
2934 my $TO_REDUCE_H = $_[36]; # [in/out] ZMM for hi 4x128-bit GHASH sum
2935 my $TO_REDUCE_M = $_[37]; # [in/out] ZMM for medium 4x128-bit GHASH sum
2936 my $DO_REDUCTION = $_[38]; # [in] "no_reduction", "final_reduction", "first_time"
2937 my $ENC_DEC = $_[39]; # [in] cipher direction
2938 my $DATA_DISPL = $_[40]; # [in] fixed numerical data displacement/offset
2939 my $GHASH_IN = $_[41]; # [in] current GHASH value or "no_ghash_in"
2940 my $IA0 = $_[42]; # [clobbered] temporary GPR
2949 # ; @note: do not change this mapping
2959 my $RED_POLY = $GH2T;
2974 my $AESKEY1 = $ZT17;
2975 my $AESKEY2 = $ZT18;
# ; NOTE(review): the other register aliases used below (B00_03..B12_15,
# ; GHDAT1/2, GHKEY1/2, GH1*/GH2*/GH3* partial products, DATA1..4,
# ; RED_P1/RED_T1/RED_T2) belong to the same fixed ZT-register mapping
# ; declared in this section.
2982 my $label_suffix = $label_count++;
2984 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
2985 # ;; prepare counter blocks
# ;; fast path: if the low counter byte cannot wrap across these 16
# ;; blocks, increment counters directly in big-endian form; otherwise
# ;; byte-swap to little-endian, add with carry, and swap back
2988 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
2989 jae .L_16_blocks_overflow_${label_suffix}
2990 vpaddd $ADDBE_1234,$CTR_BE,$B00_03
2991 vpaddd $ADDBE_4x4,$B00_03,$B04_07
2992 vpaddd $ADDBE_4x4,$B04_07,$B08_11
2993 vpaddd $ADDBE_4x4,$B08_11,$B12_15
2994 jmp .L_16_blocks_ok_${label_suffix}
2995 .L_16_blocks_overflow_${label_suffix}:
2996 vpshufb $SHFMSK,$CTR_BE,$CTR_BE
2997 vmovdqa64 ddq_add_4444(%rip),$B12_15
2998 vpaddd ddq_add_1234(%rip),$CTR_BE,$B00_03
2999 vpaddd $B12_15,$B00_03,$B04_07
3000 vpaddd $B12_15,$B04_07,$B08_11
3001 vpaddd $B12_15,$B08_11,$B12_15
3002 vpshufb $SHFMSK,$B00_03,$B00_03
3003 vpshufb $SHFMSK,$B04_07,$B04_07
3004 vpshufb $SHFMSK,$B08_11,$B08_11
3005 vpshufb $SHFMSK,$B12_15,$B12_15
3006 .L_16_blocks_ok_${label_suffix}:
3009 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3010 # ;; pre-load constants
3011 $code .= "vbroadcastf64x2 `(16 * 0)`($AES_KEYS),$AESKEY1\n";
# ;; fold the running GHASH value into the first 4 GHASH input blocks
# ;; (or just load them when there is no hash to carry in)
3012 if ($GHASH_IN ne "no_ghash_in") {
3013 $code .= "vpxorq `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHASH_IN,$GHDAT1\n";
3015 $code .= "vmovdqa64 `$GHASHIN_BLK_OFFSET + (0*64)`(%rsp),$GHDAT1\n";
3019 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (0*4)),"%rsp")]},$GHKEY1
3021 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3022 # ;; save counter for the next round
3023 # ;; increment counter overflow check register
3024 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR_BE
3025 addb \$16,@{[BYTE($CTR_CHECK)]}
3026 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3027 # ;; pre-load constants
3028 vbroadcastf64x2 `(16 * 1)`($AES_KEYS),$AESKEY2
3029 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (1*4)),"%rsp")]},$GHKEY2
3030 vmovdqa64 `$GHASHIN_BLK_OFFSET + (1*64)`(%rsp),$GHDAT2
3032 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3033 # ;; stitch AES rounds with GHASH
3035 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3036 # ;; AES round 0 - ARK
3038 vpxorq $AESKEY1,$B00_03,$B00_03
3039 vpxorq $AESKEY1,$B04_07,$B04_07
3040 vpxorq $AESKEY1,$B08_11,$B08_11
3041 vpxorq $AESKEY1,$B12_15,$B12_15
3042 vbroadcastf64x2 `(16 * 2)`($AES_KEYS),$AESKEY1
3044 # ;;==================================================
3045 # ;; GHASH 4 blocks (15 to 12)
3046 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH1H # ; a1*b1
3047 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH1L # ; a0*b0
3048 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH1M # ; a1*b0
3049 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH1T # ; a0*b1
3050 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (2*4)),"%rsp")]},$GHKEY1
3051 vmovdqa64 `$GHASHIN_BLK_OFFSET + (2*64)`(%rsp),$GHDAT1
3053 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3055 vaesenc $AESKEY2,$B00_03,$B00_03
3056 vaesenc $AESKEY2,$B04_07,$B04_07
3057 vaesenc $AESKEY2,$B08_11,$B08_11
3058 vaesenc $AESKEY2,$B12_15,$B12_15
3059 vbroadcastf64x2 `(16 * 3)`($AES_KEYS),$AESKEY2
3061 # ;; =================================================
3062 # ;; GHASH 4 blocks (11 to 8)
3063 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3064 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3065 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3066 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3067 vmovdqu64 @{[HashKeyByIdx(($HASHKEY_OFFSET - (3*4)),"%rsp")]},$GHKEY2
3068 vmovdqa64 `$GHASHIN_BLK_OFFSET + (3*64)`(%rsp),$GHDAT2
3070 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3072 vaesenc $AESKEY1,$B00_03,$B00_03
3073 vaesenc $AESKEY1,$B04_07,$B04_07
3074 vaesenc $AESKEY1,$B08_11,$B08_11
3075 vaesenc $AESKEY1,$B12_15,$B12_15
3076 vbroadcastf64x2 `(16 * 4)`($AES_KEYS),$AESKEY1
3078 # ;; =================================================
3079 # ;; GHASH 4 blocks (7 to 4)
3080 vpclmulqdq \$0x10,$GHKEY1,$GHDAT1,$GH3M # ; a0*b1
3081 vpclmulqdq \$0x01,$GHKEY1,$GHDAT1,$GH3T # ; a1*b0
3082 vpclmulqdq \$0x11,$GHKEY1,$GHDAT1,$GH3H # ; a1*b1
3083 vpclmulqdq \$0x00,$GHKEY1,$GHDAT1,$GH3L # ; a0*b0
3084 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3086 vaesenc $AESKEY2,$B00_03,$B00_03
3087 vaesenc $AESKEY2,$B04_07,$B04_07
3088 vaesenc $AESKEY2,$B08_11,$B08_11
3089 vaesenc $AESKEY2,$B12_15,$B12_15
3090 vbroadcastf64x2 `(16 * 5)`($AES_KEYS),$AESKEY2
3092 # ;; =================================================
3093 # ;; Gather (XOR) GHASH for 12 blocks
3094 vpternlogq \$0x96,$GH3H,$GH2H,$GH1H
3095 vpternlogq \$0x96,$GH3L,$GH2L,$GH1L
3096 vpternlogq \$0x96,$GH3T,$GH2T,$GH1T
3097 vpternlogq \$0x96,$GH3M,$GH2M,$GH1M
3099 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3101 vaesenc $AESKEY1,$B00_03,$B00_03
3102 vaesenc $AESKEY1,$B04_07,$B04_07
3103 vaesenc $AESKEY1,$B08_11,$B08_11
3104 vaesenc $AESKEY1,$B12_15,$B12_15
3105 vbroadcastf64x2 `(16 * 6)`($AES_KEYS),$AESKEY1
3107 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3108 # ;; load plain/cipher text (recycle GH3xx registers)
3109 vmovdqu8 `$DATA_DISPL + (0 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA1
3110 vmovdqu8 `$DATA_DISPL + (1 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA2
3111 vmovdqu8 `$DATA_DISPL + (2 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA3
3112 vmovdqu8 `$DATA_DISPL + (3 * 64)`($PLAIN_CIPH_IN,$DATA_OFFSET),$DATA4
3114 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3116 vaesenc $AESKEY2,$B00_03,$B00_03
3117 vaesenc $AESKEY2,$B04_07,$B04_07
3118 vaesenc $AESKEY2,$B08_11,$B08_11
3119 vaesenc $AESKEY2,$B12_15,$B12_15
3120 vbroadcastf64x2 `(16 * 7)`($AES_KEYS),$AESKEY2
3122 # ;; =================================================
3123 # ;; GHASH 4 blocks (3 to 0)
3124 vpclmulqdq \$0x10,$GHKEY2,$GHDAT2,$GH2M # ; a0*b1
3125 vpclmulqdq \$0x01,$GHKEY2,$GHDAT2,$GH2T # ; a1*b0
3126 vpclmulqdq \$0x11,$GHKEY2,$GHDAT2,$GH2H # ; a1*b1
3127 vpclmulqdq \$0x00,$GHKEY2,$GHDAT2,$GH2L # ; a0*b0
3128 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3130 vaesenc $AESKEY1,$B00_03,$B00_03
3131 vaesenc $AESKEY1,$B04_07,$B04_07
3132 vaesenc $AESKEY1,$B08_11,$B08_11
3133 vaesenc $AESKEY1,$B12_15,$B12_15
3134 vbroadcastf64x2 `(16 * 8)`($AES_KEYS),$AESKEY1
3137 # ;; =================================================
3138 # ;; gather GHASH in GH1L (low) and GH1H (high)
# ;; Perl-time selection: how this iteration's partial products combine
# ;; with the running TO_REDUCE_{L,M,H} accumulators
3139 if ($DO_REDUCTION eq "first_time") {
3141 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3142 vpxorq $GH2M,$GH1M,$TO_REDUCE_M # ; TM
3143 vpxorq $GH2H,$GH1H,$TO_REDUCE_H # ; TH
3144 vpxorq $GH2L,$GH1L,$TO_REDUCE_L # ; TL
3147 if ($DO_REDUCTION eq "no_reduction") {
3149 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3150 vpternlogq \$0x96,$GH2M,$GH1M,$TO_REDUCE_M # ; TM
3151 vpternlogq \$0x96,$GH2H,$GH1H,$TO_REDUCE_H # ; TH
3152 vpternlogq \$0x96,$GH2L,$GH1L,$TO_REDUCE_L # ; TL
3155 if ($DO_REDUCTION eq "final_reduction") {
3157 # ;; phase 1: add mid products together
3158 # ;; also load polynomial constant for reduction
3159 vpternlogq \$0x96,$GH2T,$GH1T,$GH1M # ; TM
3160 vpternlogq \$0x96,$GH2M,$TO_REDUCE_M,$GH1M
3162 vpsrldq \$8,$GH1M,$GH2M
3163 vpslldq \$8,$GH1M,$GH1M
3165 vmovdqa64 POLY2(%rip),@{[XWORD($RED_POLY)]}
3169 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3172 vaesenc $AESKEY2,$B00_03,$B00_03
3173 vaesenc $AESKEY2,$B04_07,$B04_07
3174 vaesenc $AESKEY2,$B08_11,$B08_11
3175 vaesenc $AESKEY2,$B12_15,$B12_15
3176 vbroadcastf64x2 `(16 * 9)`($AES_KEYS),$AESKEY2
3179 # ;; =================================================
3180 # ;; Add mid product to high and low
3181 if ($DO_REDUCTION eq "final_reduction") {
3183 vpternlogq \$0x96,$GH2M,$GH2H,$GH1H # ; TH = TH1 + TH2 + TM>>64
3184 vpxorq $TO_REDUCE_H,$GH1H,$GH1H
3185 vpternlogq \$0x96,$GH1M,$GH2L,$GH1L # ; TL = TL1 + TL2 + TM<<64
3186 vpxorq $TO_REDUCE_L,$GH1L,$GH1L
3190 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3193 vaesenc $AESKEY1,$B00_03,$B00_03
3194 vaesenc $AESKEY1,$B04_07,$B04_07
3195 vaesenc $AESKEY1,$B08_11,$B08_11
3196 vaesenc $AESKEY1,$B12_15,$B12_15
3197 vbroadcastf64x2 `(16 * 10)`($AES_KEYS),$AESKEY1
3200 # ;; =================================================
3201 # ;; horizontal xor of low and high 4x128
3202 if ($DO_REDUCTION eq "final_reduction") {
3203 &VHPXORI4x128($GH1H, $GH2H);
3204 &VHPXORI4x128($GH1L, $GH2L);
3207 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3210 vaesenc $AESKEY2,$B00_03,$B00_03
3211 vaesenc $AESKEY2,$B04_07,$B04_07
3212 vaesenc $AESKEY2,$B08_11,$B08_11
3213 vaesenc $AESKEY2,$B12_15,$B12_15
3215 if (($NROUNDS >= 11)) {
3216 $code .= "vbroadcastf64x2 `(16 * 11)`($AES_KEYS),$AESKEY2\n";
3219 # ;; =================================================
3220 # ;; first phase of reduction
3221 if ($DO_REDUCTION eq "final_reduction") {
3223 vpclmulqdq \$0x01,@{[XWORD($GH1L)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_P1)]}
3224 vpslldq \$8,@{[XWORD($RED_P1)]},@{[XWORD($RED_P1)]} # ; shift-L 2 DWs
3225 vpxorq @{[XWORD($RED_P1)]},@{[XWORD($GH1L)]},@{[XWORD($RED_P1)]} # ; first phase of the reduction
3229 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3230 # ;; AES rounds up to 11 (AES192) or 13 (AES256)
3232 if (($NROUNDS >= 11)) {
3234 vaesenc $AESKEY1,$B00_03,$B00_03
3235 vaesenc $AESKEY1,$B04_07,$B04_07
3236 vaesenc $AESKEY1,$B08_11,$B08_11
3237 vaesenc $AESKEY1,$B12_15,$B12_15
3238 vbroadcastf64x2 `(16 * 12)`($AES_KEYS),$AESKEY1
3240 vaesenc $AESKEY2,$B00_03,$B00_03
3241 vaesenc $AESKEY2,$B04_07,$B04_07
3242 vaesenc $AESKEY2,$B08_11,$B08_11
3243 vaesenc $AESKEY2,$B12_15,$B12_15
3245 if (($NROUNDS == 13)) {
3247 vbroadcastf64x2 `(16 * 13)`($AES_KEYS),$AESKEY2
3249 vaesenc $AESKEY1,$B00_03,$B00_03
3250 vaesenc $AESKEY1,$B04_07,$B04_07
3251 vaesenc $AESKEY1,$B08_11,$B08_11
3252 vaesenc $AESKEY1,$B12_15,$B12_15
3253 vbroadcastf64x2 `(16 * 14)`($AES_KEYS),$AESKEY1
3255 vaesenc $AESKEY2,$B00_03,$B00_03
3256 vaesenc $AESKEY2,$B04_07,$B04_07
3257 vaesenc $AESKEY2,$B08_11,$B08_11
3258 vaesenc $AESKEY2,$B12_15,$B12_15
3263 # ;; =================================================
3264 # ;; second phase of the reduction
3265 if ($DO_REDUCTION eq "final_reduction") {
3267 vpclmulqdq \$0x00,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T1)]}
3268 vpsrldq \$4,@{[XWORD($RED_T1)]},@{[XWORD($RED_T1)]} # ; shift-R 1-DW to obtain 2-DWs shift-R
3269 vpclmulqdq \$0x10,@{[XWORD($RED_P1)]},@{[XWORD($RED_POLY)]},@{[XWORD($RED_T2)]}
3270 vpslldq \$4,@{[XWORD($RED_T2)]},@{[XWORD($RED_T2)]} # ; shift-L 1-DW for result without shifts
3271 # ;; GH1H = GH1H x RED_T1 x RED_T2
3272 vpternlogq \$0x96,@{[XWORD($RED_T1)]},@{[XWORD($RED_T2)]},@{[XWORD($GH1H)]}
3276 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3277 # ;; the last AES round
3279 vaesenclast $AESKEY1,$B00_03,$B00_03
3280 vaesenclast $AESKEY1,$B04_07,$B04_07
3281 vaesenclast $AESKEY1,$B08_11,$B08_11
3282 vaesenclast $AESKEY1,$B12_15,$B12_15
3284 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3285 # ;; XOR against plain/cipher text
3286 vpxorq $DATA1,$B00_03,$B00_03
3287 vpxorq $DATA2,$B04_07,$B04_07
3288 vpxorq $DATA3,$B08_11,$B08_11
3289 vpxorq $DATA4,$B12_15,$B12_15
3291 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3292 # ;; store cipher/plain text
3293 mov $CIPH_PLAIN_OUT,$IA0
3294 vmovdqu8 $B00_03,`$DATA_DISPL + (0 * 64)`($IA0,$DATA_OFFSET,1)
3295 vmovdqu8 $B04_07,`$DATA_DISPL + (1 * 64)`($IA0,$DATA_OFFSET,1)
3296 vmovdqu8 $B08_11,`$DATA_DISPL + (2 * 64)`($IA0,$DATA_OFFSET,1)
3297 vmovdqu8 $B12_15,`$DATA_DISPL + (3 * 64)`($IA0,$DATA_OFFSET,1)
3300 # ;; =================================================
3301 # ;; shuffle cipher text blocks for GHASH computation
# ;; for encryption the AES output is the ciphertext; for decryption the
# ;; loaded input blocks (DATA1..4) are the ciphertext to be hashed
3302 if ($ENC_DEC eq "ENC") {
3304 vpshufb $SHFMSK,$B00_03,$B00_03
3305 vpshufb $SHFMSK,$B04_07,$B04_07
3306 vpshufb $SHFMSK,$B08_11,$B08_11
3307 vpshufb $SHFMSK,$B12_15,$B12_15
3311 vpshufb $SHFMSK,$DATA1,$B00_03
3312 vpshufb $SHFMSK,$DATA2,$B04_07
3313 vpshufb $SHFMSK,$DATA3,$B08_11
3314 vpshufb $SHFMSK,$DATA4,$B12_15
3318 # ;; =================================================
3319 # ;; store shuffled cipher text for ghashing
3321 vmovdqa64 $B00_03,`$AESOUT_BLK_OFFSET + (0*64)`(%rsp)
3322 vmovdqa64 $B04_07,`$AESOUT_BLK_OFFSET + (1*64)`(%rsp)
3323 vmovdqa64 $B08_11,`$AESOUT_BLK_OFFSET + (2*64)`(%rsp)
3324 vmovdqa64 $B12_15,`$AESOUT_BLK_OFFSET + (3*64)`(%rsp)
3328 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3329 # ;;; Encryption of a single block
3330 sub ENCRYPT_SINGLE_BLOCK {
# ; Emits code that encrypts the single block held in $XMM0 with the key
# ; schedule at $AES_KEY, selecting the AES-128/192/256 path at run time
# ; from the rounds field stored in the key structure.
3331 my $AES_KEY = $_[0]; # ; [in]
3332 my $XMM0 = $_[1]; # ; [in/out]
3333 my $GPR1 = $_[2]; # ; [clobbered]
3335 my $label_suffix = $label_count++;
3338 # ; load number of rounds from AES_KEY structure (offset in bytes is
3339 # ; size of the |rd_key| buffer)
3340 mov `4*15*4`($AES_KEY),@{[DWORD($GPR1)]}
# ; NOTE(review): 9/11/13 match the %aes_rounds convention used below
# ; (full rounds excluding the last) - confirm against how the key
# ; setup populates this field
3341 cmp \$9,@{[DWORD($GPR1)]}
3342 je .Laes_128_${label_suffix}
3343 cmp \$11,@{[DWORD($GPR1)]}
3344 je .Laes_192_${label_suffix}
3345 cmp \$13,@{[DWORD($GPR1)]}
3346 je .Laes_256_${label_suffix}
3347 jmp .Lexit_aes_${label_suffix}
# ; Perl-time loop: emit one unrolled encryption sequence per key length
3349 for my $keylen (sort keys %aes_rounds) {
3350 my $nr = $aes_rounds{$keylen};
3353 .Laes_${keylen}_${label_suffix}:
3355 $code .= "vpxorq `16*0`($AES_KEY),$XMM0, $XMM0\n\n";
3356 for (my $i = 1; $i <= $nr; $i++) {
3357 $code .= "vaesenc `16*$i`($AES_KEY),$XMM0,$XMM0\n\n";
3360 vaesenclast `16*($nr+1)`($AES_KEY),$XMM0,$XMM0
3361 jmp .Lexit_aes_${label_suffix}
3364 $code .= ".Lexit_aes_${label_suffix}:\n\n";
# ;; Computes J0 (the pre-counter block) for IVs whose length is not
# ;; 96 bits: J0 = GHASH(IV || 0^s || 64-bit len(IV)), per NIST SP 800-38D.
3368 my $GCM128_CTX = $_[0]; #; [in] Pointer to GCM context
3369 my $IV = $_[1]; #; [in] Pointer to IV
3370 my $IV_LEN = $_[2]; #; [in] IV length
3371 my $J0 = $_[3]; #; [out] XMM reg to contain J0
3372 my $ZT0 = $_[4]; #; [clobbered] ZMM register
3373 my $ZT1 = $_[5]; #; [clobbered] ZMM register
3374 my $ZT2 = $_[6]; #; [clobbered] ZMM register
3375 my $ZT3 = $_[7]; #; [clobbered] ZMM register
3376 my $ZT4 = $_[8]; #; [clobbered] ZMM register
3377 my $ZT5 = $_[9]; #; [clobbered] ZMM register
3378 my $ZT6 = $_[10]; #; [clobbered] ZMM register
3379 my $ZT7 = $_[11]; #; [clobbered] ZMM register
3380 my $ZT8 = $_[12]; #; [clobbered] ZMM register
3381 my $ZT9 = $_[13]; #; [clobbered] ZMM register
3382 my $ZT10 = $_[14]; #; [clobbered] ZMM register
3383 my $ZT11 = $_[15]; #; [clobbered] ZMM register
3384 my $ZT12 = $_[16]; #; [clobbered] ZMM register
3385 my $ZT13 = $_[17]; #; [clobbered] ZMM register
3386 my $ZT14 = $_[18]; #; [clobbered] ZMM register
3387 my $ZT15 = $_[19]; #; [clobbered] ZMM register
3388 my $ZT16 = $_[20]; #; [clobbered] ZMM register
3389 my $T1 = $_[21]; #; [clobbered] GP register
3390 my $T2 = $_[22]; #; [clobbered] GP register
3391 my $T3 = $_[23]; #; [clobbered] GP register
3392 my $MASKREG = $_[24]; #; [clobbered] mask register
3394 # ;; J0 = GHASH(IV || 0s+64 || len(IV)64)
3395 # ;; s = 16 * RoundUp(len(IV)/16) - len(IV)
3397 # ;; Calculate GHASH of (IV || 0s)
3398 $code .= "vpxor $J0,$J0,$J0\n";
3399 &CALC_AAD_HASH($IV, $IV_LEN, $J0, $GCM128_CTX, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4,
3400 $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $T1, $T2, $T3, $MASKREG);
3402 # ;; Calculate GHASH of last 16-byte block (0 || len(IV)64)
3405 shl \$3,$T1 # ; IV length in bits
3406 vmovq $T1,@{[XWORD($ZT2)]}
3408 # ;; Might need shuffle of ZT2
3409 vpxorq $J0,@{[XWORD($ZT2)]},$J0
# ;; multiply by hash key H (index 1) to fold the length block in
3411 vmovdqu64 @{[HashKeyByIdx(1,$GCM128_CTX)]},@{[XWORD($ZT0)]}
3413 &GHASH_MUL($J0, @{[XWORD($ZT0)]}, @{[XWORD($ZT1)]}, @{[XWORD($ZT2)]}, @{[XWORD($ZT3)]});
3415 $code .= "vpshufb SHUF_MASK(%rip),$J0,$J0 # ; perform a 16Byte swap\n";
3418 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3419 # ;;; GCM_INIT_IV performs an initialization of gcm128_ctx struct to prepare for
3420 # ;;; encoding/decoding.
3421 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# ;; Prepares the GCM context for a new operation: derives the initial
# ;; counter block from the IV (fast path for 12-byte IVs, CALC_J0
# ;; otherwise), computes EK0 = E(K, Y0) and stores the counter state
# ;; in the context.
3423 my $AES_KEYS = $_[0]; # [in] AES key schedule
3424 my $GCM128_CTX = $_[1]; # [in/out] GCM context
3425 my $IV = $_[2]; # [in] IV pointer
3426 my $IV_LEN = $_[3]; # [in] IV length
3427 my $GPR1 = $_[4]; # [clobbered] GP register
3428 my $GPR2 = $_[5]; # [clobbered] GP register
3429 my $GPR3 = $_[6]; # [clobbered] GP register
3430 my $MASKREG = $_[7]; # [clobbered] mask register
3431 my $CUR_COUNT = $_[8]; # [out] XMM with current counter
3432 my $ZT0 = $_[9]; # [clobbered] ZMM register
3433 my $ZT1 = $_[10]; # [clobbered] ZMM register
3434 my $ZT2 = $_[11]; # [clobbered] ZMM register
3435 my $ZT3 = $_[12]; # [clobbered] ZMM register
3436 my $ZT4 = $_[13]; # [clobbered] ZMM register
3437 my $ZT5 = $_[14]; # [clobbered] ZMM register
3438 my $ZT6 = $_[15]; # [clobbered] ZMM register
3439 my $ZT7 = $_[16]; # [clobbered] ZMM register
3440 my $ZT8 = $_[17]; # [clobbered] ZMM register
3441 my $ZT9 = $_[18]; # [clobbered] ZMM register
3442 my $ZT10 = $_[19]; # [clobbered] ZMM register
3443 my $ZT11 = $_[20]; # [clobbered] ZMM register
3444 my $ZT12 = $_[21]; # [clobbered] ZMM register
3445 my $ZT13 = $_[22]; # [clobbered] ZMM register
3446 my $ZT14 = $_[23]; # [clobbered] ZMM register
3447 my $ZT15 = $_[24]; # [clobbered] ZMM register
3448 my $ZT16 = $_[25]; # [clobbered] ZMM register
# ; XMM alias of ZT0 (same register number, xmm width)
3451 $ZT0x =~ s/zmm/xmm/;
3455 je iv_len_12_init_IV
3458 # ;; IV is different than 12 bytes
3459 &CALC_J0($GCM128_CTX, $IV, $IV_LEN, $CUR_COUNT, $ZT0, $ZT1, $ZT2, $ZT3, $ZT4, $ZT5, $ZT6, $ZT7,
3460 $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13, $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3462 jmp skip_iv_len_12_init_IV
3463 iv_len_12_init_IV: # ;; IV is 12 bytes
3464 # ;; read 12 IV bytes and pad with 0x00000001
3465 vmovdqu8 ONEf(%rip),$CUR_COUNT
# ;; 0xfff = low 12 bits set: byte-granular k-mask that overwrites only
# ;; the 12 IV bytes, keeping the pad loaded from ONEf above
3467 mov \$0x0000000000000fff,@{[DWORD($GPR1)]}
3468 kmovq $GPR1,$MASKREG
3469 vmovdqu8 ($GPR2),${CUR_COUNT}{$MASKREG} # ; ctr = IV | 0x1
3470 skip_iv_len_12_init_IV:
3471 vmovdqu $CUR_COUNT,$ZT0x
3473 &ENCRYPT_SINGLE_BLOCK($AES_KEYS, "$ZT0x", "$GPR1"); # ; E(K, Y0)
3475 vmovdqu $ZT0x,`$CTX_OFFSET_EK0`($GCM128_CTX) # ; save EK0 for finalization stage
3477 # ;; store IV as counter in LE format
3478 vpshufb SHUF_MASK(%rip),$CUR_COUNT,$CUR_COUNT
3479 vmovdqu $CUR_COUNT,`$CTX_OFFSET_CurCount`($GCM128_CTX) # ; save current counter Yi
3483 sub GCM_UPDATE_AAD {
# ;; Folds additional AAD into the running AAD hash kept in the context:
# ;; loads AadHash, extends it over A_IN[0 .. A_LEN), stores it back.
3484 my $GCM128_CTX = $_[0]; # [in] GCM context pointer
3485 my $A_IN = $_[1]; # [in] AAD pointer
3486 my $A_LEN = $_[2]; # [in] AAD length in bytes
3487 my $GPR1 = $_[3]; # [clobbered] GP register
3488 my $GPR2 = $_[4]; # [clobbered] GP register
3489 my $GPR3 = $_[5]; # [clobbered] GP register
3490 my $MASKREG = $_[6]; # [clobbered] mask register
3491 my $AAD_HASH = $_[7]; # [out] XMM for AAD_HASH value
3492 my $ZT0 = $_[8]; # [clobbered] ZMM register
3493 my $ZT1 = $_[9]; # [clobbered] ZMM register
3494 my $ZT2 = $_[10]; # [clobbered] ZMM register
3495 my $ZT3 = $_[11]; # [clobbered] ZMM register
3496 my $ZT4 = $_[12]; # [clobbered] ZMM register
3497 my $ZT5 = $_[13]; # [clobbered] ZMM register
3498 my $ZT6 = $_[14]; # [clobbered] ZMM register
3499 my $ZT7 = $_[15]; # [clobbered] ZMM register
3500 my $ZT8 = $_[16]; # [clobbered] ZMM register
3501 my $ZT9 = $_[17]; # [clobbered] ZMM register
3502 my $ZT10 = $_[18]; # [clobbered] ZMM register
3503 my $ZT11 = $_[19]; # [clobbered] ZMM register
3504 my $ZT12 = $_[20]; # [clobbered] ZMM register
3505 my $ZT13 = $_[21]; # [clobbered] ZMM register
3506 my $ZT14 = $_[22]; # [clobbered] ZMM register
3507 my $ZT15 = $_[23]; # [clobbered] ZMM register
3508 my $ZT16 = $_[24]; # [clobbered] ZMM register
3510 # ; load current hash
3511 $code .= "vmovdqu64 $CTX_OFFSET_AadHash($GCM128_CTX),$AAD_HASH\n";
3513 &CALC_AAD_HASH($A_IN, $A_LEN, $AAD_HASH, $GCM128_CTX, $ZT0, $ZT1, $ZT2,
3514 $ZT3, $ZT4, $ZT5, $ZT6, $ZT7, $ZT8, $ZT9, $ZT10, $ZT11, $ZT12, $ZT13,
3515 $ZT14, $ZT15, $ZT16, $GPR1, $GPR2, $GPR3, $MASKREG);
3517 # ; store updated hash back into the context
3518 $code .= "vmovdqu64 $AAD_HASH,$CTX_OFFSET_AadHash($GCM128_CTX)\n";
3521 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3522 # ;;; Cipher and ghash of payloads shorter than 256 bytes
3523 # ;;; - number of blocks in the message comes as argument
3524 # ;;; - depending on the number of blocks an optimized variant of
3525 # ;;; INITIAL_BLOCKS_PARTIAL is invoked
3526 sub GCM_ENC_DEC_SMALL {
# ;; Handles messages of at most 16 blocks: dispatches at run time on
# ;; NUM_BLOCKS to a specialized INITIAL_BLOCKS_PARTIAL variant (one is
# ;; generated below for each block count 1..16).
3527 my $AES_KEYS = $_[0]; # [in] key pointer
3528 my $GCM128_CTX = $_[1]; # [in] context pointer
3529 my $CIPH_PLAIN_OUT = $_[2]; # [in] output buffer
3530 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer
3531 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3532 my $ENC_DEC = $_[5]; # [in] cipher direction
3533 my $DATA_OFFSET = $_[6]; # [in] data offset
3534 my $LENGTH = $_[7]; # [in] data length
3535 my $NUM_BLOCKS = $_[8]; # [in] number of blocks to process 1 to 16
3536 my $CTR = $_[9]; # [in/out] XMM counter block
3537 my $HASH_IN_OUT = $_[10]; # [in/out] XMM GHASH value
3538 my $ZTMP0 = $_[11]; # [clobbered] ZMM register
3539 my $ZTMP1 = $_[12]; # [clobbered] ZMM register
3540 my $ZTMP2 = $_[13]; # [clobbered] ZMM register
3541 my $ZTMP3 = $_[14]; # [clobbered] ZMM register
3542 my $ZTMP4 = $_[15]; # [clobbered] ZMM register
3543 my $ZTMP5 = $_[16]; # [clobbered] ZMM register
3544 my $ZTMP6 = $_[17]; # [clobbered] ZMM register
3545 my $ZTMP7 = $_[18]; # [clobbered] ZMM register
3546 my $ZTMP8 = $_[19]; # [clobbered] ZMM register
3547 my $ZTMP9 = $_[20]; # [clobbered] ZMM register
3548 my $ZTMP10 = $_[21]; # [clobbered] ZMM register
3549 my $ZTMP11 = $_[22]; # [clobbered] ZMM register
3550 my $ZTMP12 = $_[23]; # [clobbered] ZMM register
3551 my $ZTMP13 = $_[24]; # [clobbered] ZMM register
3552 my $ZTMP14 = $_[25]; # [clobbered] ZMM register
3553 my $IA0 = $_[26]; # [clobbered] GP register
3554 my $IA1 = $_[27]; # [clobbered] GP register
3555 my $MASKREG = $_[28]; # [clobbered] mask register
3556 my $SHUFMASK = $_[29]; # [in] ZMM with BE/LE shuffle mask
3557 my $PBLOCK_LEN = $_[30]; # [in] partial block length
3559 my $label_suffix = $label_count++;
# ;; binary-search style dispatch on NUM_BLOCKS (1..16)
3563 je .L_small_initial_num_blocks_is_8_${label_suffix}
3564 jl .L_small_initial_num_blocks_is_7_1_${label_suffix}
3567 cmp \$12,$NUM_BLOCKS
3568 je .L_small_initial_num_blocks_is_12_${label_suffix}
3569 jl .L_small_initial_num_blocks_is_11_9_${label_suffix}
3571 # ;; 16, 15, 14 or 13
3572 cmp \$16,$NUM_BLOCKS
3573 je .L_small_initial_num_blocks_is_16_${label_suffix}
3574 cmp \$15,$NUM_BLOCKS
3575 je .L_small_initial_num_blocks_is_15_${label_suffix}
3576 cmp \$14,$NUM_BLOCKS
3577 je .L_small_initial_num_blocks_is_14_${label_suffix}
3578 jmp .L_small_initial_num_blocks_is_13_${label_suffix}
3580 .L_small_initial_num_blocks_is_11_9_${label_suffix}:
3582 cmp \$11,$NUM_BLOCKS
3583 je .L_small_initial_num_blocks_is_11_${label_suffix}
3584 cmp \$10,$NUM_BLOCKS
3585 je .L_small_initial_num_blocks_is_10_${label_suffix}
3586 jmp .L_small_initial_num_blocks_is_9_${label_suffix}
3588 .L_small_initial_num_blocks_is_7_1_${label_suffix}:
3590 je .L_small_initial_num_blocks_is_4_${label_suffix}
3591 jl .L_small_initial_num_blocks_is_3_1_${label_suffix}
3594 je .L_small_initial_num_blocks_is_7_${label_suffix}
3596 je .L_small_initial_num_blocks_is_6_${label_suffix}
3597 jmp .L_small_initial_num_blocks_is_5_${label_suffix}
3599 .L_small_initial_num_blocks_is_3_1_${label_suffix}:
3602 je .L_small_initial_num_blocks_is_3_${label_suffix}
3604 je .L_small_initial_num_blocks_is_2_${label_suffix}
3606 # ;; for $NUM_BLOCKS == 1, just fall through and no 'jmp' needed
3608 # ;; Generation of different block size variants
3609 # ;; - one block size has to be the first one
# ;; Perl-time loop: emits one specialization per block count; all but
# ;; the last jump to the common exit label
3612 for (my $num_blocks = 1; $num_blocks <= 16; $num_blocks++) {
3613 $code .= ".L_small_initial_num_blocks_is_${num_blocks}_${label_suffix}:\n";
3614 &INITIAL_BLOCKS_PARTIAL(
3615 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $LENGTH, $DATA_OFFSET,
3616 $num_blocks, $CTR, $HASH_IN_OUT, $ENC_DEC, $ZTMP0, $ZTMP1,
3617 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3618 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3619 $ZTMP14, $IA0, $IA1, $MASKREG, $SHUFMASK, $PBLOCK_LEN);
3621 if ($num_blocks != 16) {
3622 $code .= "jmp .L_small_initial_blocks_encrypted_${label_suffix}\n";
3626 $code .= ".L_small_initial_blocks_encrypted_${label_suffix}:\n";
3629 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
3630 # ; GCM_ENC_DEC Encrypts/Decrypts given data. Assumes that the passed gcm128_context
3631 # ; struct has been initialized by GCM_INIT_IV
3632 # ; Requires the input data to be at least 1 byte long because of READ_SMALL_INPUT_DATA.
3633 # ; Clobbers rax, r10-r15, and zmm0-zmm31, k1
3634 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Argument unpacking and fixed register map for the combined encrypt/decrypt
# flow.  Registers marked "hardcoded" must match what GCM_INIT/GCM_COMPLETE use.
3636 my $AES_KEYS = $_[0]; # [in] AES Key schedule
3637 my $GCM128_CTX = $_[1]; # [in] context pointer
3638 my $PBLOCK_LEN = $_[2]; # [in] length of partial block at the moment of previous update
3639 my $PLAIN_CIPH_IN = $_[3]; # [in] input buffer pointer
3640 my $PLAIN_CIPH_LEN = $_[4]; # [in] buffer length
3641 my $CIPH_PLAIN_OUT = $_[5]; # [in] output buffer pointer
3642 my $ENC_DEC = $_[6]; # [in] cipher direction
# NOTE(review): on Win64 the length is kept in a scratch GPR ($IA2) rather than
# the incoming argument register — presumably an ABI difference; confirm against
# the elided prologue.
3653 my $LENGTH = $win64 ? $IA2 : $PLAIN_CIPH_LEN;
3655 my $CTR_CHECK = $IA3;
3656 my $DATA_OFFSET = $IA4;
3657 my $HASHK_PTR = $IA6;
3659 my $HKEYS_READY = $IA7;
3661 my $CTR_BLOCKz = "%zmm2";
3662 my $CTR_BLOCKx = "%xmm2";
3664 # ; hardcoded in GCM_INIT
3666 my $AAD_HASHz = "%zmm14";
3667 my $AAD_HASHx = "%xmm14";
3669 # ; hardcoded in GCM_COMPLETE
# Scratch ZMM pool used throughout the macro expansions below.
3671 my $ZTMP0 = "%zmm0";
3672 my $ZTMP1 = "%zmm3";
3673 my $ZTMP2 = "%zmm4";
3674 my $ZTMP3 = "%zmm5";
3675 my $ZTMP4 = "%zmm6";
3676 my $ZTMP5 = "%zmm7";
3677 my $ZTMP6 = "%zmm10";
3678 my $ZTMP7 = "%zmm11";
3679 my $ZTMP8 = "%zmm12";
3680 my $ZTMP9 = "%zmm13";
3681 my $ZTMP10 = "%zmm15";
3682 my $ZTMP11 = "%zmm16";
3683 my $ZTMP12 = "%zmm17";
3685 my $ZTMP13 = "%zmm19";
3686 my $ZTMP14 = "%zmm20";
3687 my $ZTMP15 = "%zmm21";
3688 my $ZTMP16 = "%zmm30";
3689 my $ZTMP17 = "%zmm31";
3690 my $ZTMP18 = "%zmm1";
3691 my $ZTMP19 = "%zmm18";
3692 my $ZTMP20 = "%zmm8";
3693 my $ZTMP21 = "%zmm22";
3694 my $ZTMP22 = "%zmm23";
3699 my $SHUF_MASK = "%zmm29";
3701 # ; Unused in the small packet path
3702 my $ADDBE_4x4 = "%zmm27";
3703 my $ADDBE_1234 = "%zmm28";
3705 my $MASKREG = "%k1";
# Unique suffix so labels from multiple expansions of this sub do not collide.
3707 my $label_suffix = $label_count++;
3709 # ;; reduction every 48 blocks, depth 32 blocks
3710 # ;; @note 48 blocks is the maximum capacity of the stack frame
3711 my $big_loop_nblocks = 48;
3712 my $big_loop_depth = 32;
3714 # ;;; Macro flow depending on packet size
3715 # ;;; - LENGTH <= 16 blocks
3716 # ;;; - cipher followed by hashing (reduction)
3717 # ;;; - 16 blocks < LENGTH < 32 blocks
3718 # ;;; - cipher 16 blocks
3719 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3720 # ;;; - 32 blocks < LENGTH < 48 blocks
3721 # ;;; - cipher 2 x 16 blocks
3722 # ;;; - hash 16 blocks
3723 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3724 # ;;; - LENGTH >= 48 blocks
3725 # ;;; - cipher 2 x 16 blocks
3726 # ;;; - while (data_to_cipher >= 48 blocks):
3727 # ;;; - cipher 16 blocks & hash 16 blocks
3728 # ;;; - cipher 16 blocks & hash 16 blocks
3729 # ;;; - cipher 16 blocks & hash 16 blocks (reduction)
3730 # ;;; - if (data_to_cipher >= 32 blocks):
3731 # ;;; - cipher 16 blocks & hash 16 blocks
3732 # ;;; - cipher 16 blocks & hash 16 blocks
3733 # ;;; - hash 16 blocks (reduction)
3734 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3735 # ;;; - elif (data_to_cipher >= 16 blocks):
3736 # ;;; - cipher 16 blocks & hash 16 blocks
3737 # ;;; - hash 16 blocks
3738 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
3740 # ;;; - hash 16 blocks
3741 # ;;; - cipher N blocks & hash 16 blocks, hash N blocks (reduction)
# Zero-length early exit.  NOTE(review): both a cmpq and an or test appear
# here; presumably one of the two is selected by an enclosing win64
# conditional (memory operand vs register) — confirm against the full source.
3744 $code .= "cmpq \$0,$PLAIN_CIPH_LEN\n";
3746 $code .= "or $PLAIN_CIPH_LEN,$PLAIN_CIPH_LEN\n";
3748 $code .= "je .L_enc_dec_done_${label_suffix}\n";
3750 # Length value from context $CTX_OFFSET_InLen`($GCM128_CTX) is updated in
3751 # 'providers/implementations/ciphers/cipher_aes_gcm_hw_vaes_avx512.inc'
# Mark the stack-frame hash keys as not yet computed and load the running
# GHASH value from the context.
3753 $code .= "xor $HKEYS_READY, $HKEYS_READY\n";
3754 $code .= "vmovdqu64 `$CTX_OFFSET_AadHash`($GCM128_CTX),$AAD_HASHx\n";
3756 # ;; Used for the update flow - if there was a previous partial
3757 # ;; block fill the remaining bytes here.
3759 $GCM128_CTX, $PBLOCK_LEN, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN,
3760 $DATA_OFFSET, $AAD_HASHx, $ENC_DEC, $IA0, $IA1,
3761 $IA2, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3,
3762 $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $MASKREG);
# Load the current counter block (Yi) from the context.
3764 $code .= "vmovdqu64 `$CTX_OFFSET_CurCount`($GCM128_CTX),$CTR_BLOCKx\n";
3766 # ;; Save the amount of data left to process in $LENGTH
3767 # ;; NOTE: PLAIN_CIPH_LEN is a register on linux;
3769 $code .= "mov $PLAIN_CIPH_LEN,$LENGTH\n";
3772 # ;; There may be no more data if it was consumed in the partial block.
3774 sub $DATA_OFFSET,$LENGTH
3775 je .L_enc_dec_done_${label_suffix}
# ;; <= 256 bytes goes down the dedicated small-packet path.
3779 cmp \$`(16 * 16)`,$LENGTH
3780 jbe .L_message_below_equal_16_blocks_${label_suffix}
3782 vmovdqa64 SHUF_MASK(%rip),$SHUF_MASK
3783 vmovdqa64 ddq_addbe_4444(%rip),$ADDBE_4x4
3784 vmovdqa64 ddq_addbe_1234(%rip),$ADDBE_1234
3786 # ;; start the pipeline
3787 # ;; - 32 blocks aes-ctr
3788 # ;; - 16 blocks ghash + aes-ctr
3790 # ;; set up CTR_CHECK
# ;; Track only the low counter byte; the overflow check in the block macros
# ;; fires before the byte wraps (see INITIAL_BLOCKS_16).
3791 vmovd $CTR_BLOCKx,@{[DWORD($CTR_CHECK)]}
3792 and \$255,@{[DWORD($CTR_CHECK)]}
3793 # ;; in LE format after init, convert to BE
3794 vshufi64x2 \$0,$CTR_BLOCKz,$CTR_BLOCKz,$CTR_BLOCKz
3795 vpshufb $SHUF_MASK,$CTR_BLOCKz,$CTR_BLOCKz
3798 # ;; ==== AES-CTR - first 16 blocks
3799 my $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3800 my $data_in_out_offset = 0;
3802 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3803 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3804 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3805 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
# Derive additional hash-key powers on the local frame while the AES pipeline
# is busy (context stores only the first 16 keys; see file header note).
3807 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3811 cmp \$`(32 * 16)`,$LENGTH
3812 jb .L_message_below_32_blocks_${label_suffix}
3815 # ;; ==== AES-CTR - next 16 blocks
3816 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3817 $data_in_out_offset = (16 * 16);
3819 $PLAIN_CIPH_IN, $CIPH_PLAIN_OUT, $AES_KEYS, $DATA_OFFSET, "no_ghash", $CTR_BLOCKz,
3820 $CTR_CHECK, $ADDBE_4x4, $ADDBE_1234, $ZTMP0, $ZTMP1, $ZTMP2,
3821 $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
3822 $SHUF_MASK, $ENC_DEC, $aesout_offset, $data_in_out_offset, $IA0);
3824 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3826 $code .= "mov \$1,$HKEYS_READY\n";
# 32 blocks are now in flight; account for them before entering the big loop.
3829 add \$`(32 * 16)`,$DATA_OFFSET
3830 sub \$`(32 * 16)`,$LENGTH
3832 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3833 jb .L_no_more_big_nblocks_${label_suffix}
3837 # ;; ==== AES-CTR + GHASH - 48 blocks loop
# Steady-state loop: three interleaved 16-block stages per iteration.  Each
# stage ciphers 16 fresh blocks while hashing 16 previously-ciphered blocks
# from the stack frame; the third stage performs the GHASH reduction.
3839 $code .= ".L_encrypt_big_nblocks_${label_suffix}:\n";
3841 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3842 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3843 $data_in_out_offset = (0 * 16);
3844 my $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3845 &GHASH_16_ENCRYPT_16_PARALLEL(
3846 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3847 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3848 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3849 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3850 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3851 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3852 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3855 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3856 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3857 $data_in_out_offset = (16 * 16);
3858 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3859 &GHASH_16_ENCRYPT_16_PARALLEL(
3860 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3861 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3862 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3863 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3864 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3865 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3866 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3869 # ;; ==== AES-CTR + GHASH - 16 blocks, reduction
3870 $aesout_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3871 $data_in_out_offset = (32 * 16);
3872 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3873 &GHASH_16_ENCRYPT_16_PARALLEL(
3874 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3875 16, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3876 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3877 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3878 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3879 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3880 $GH, $GM, "final_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3883 # ;; === xor cipher block 0 with GHASH (ZT4)
3885 vmovdqa64 $ZTMP4,$AAD_HASHz
3887 add \$`($big_loop_nblocks * 16)`,$DATA_OFFSET
3888 sub \$`($big_loop_nblocks * 16)`,$LENGTH
3889 cmp \$`($big_loop_nblocks * 16)`,$LENGTH
3890 jae .L_encrypt_big_nblocks_${label_suffix}
3892 .L_no_more_big_nblocks_${label_suffix}:
# ;; Select the tail path by how many whole 16-block groups remain.
3894 cmp \$`(32 * 16)`,$LENGTH
3895 jae .L_encrypt_32_blocks_${label_suffix}
3897 cmp \$`(16 * 16)`,$LENGTH
3898 jae .L_encrypt_16_blocks_${label_suffix}
3901 # ;; =====================================================
3902 # ;; =====================================================
3903 # ;; ==== GHASH 1 x 16 blocks
3904 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3905 # ;; ==== then GHASH N blocks
3906 $code .= ".L_encrypt_0_blocks_ghash_32_${label_suffix}:\n";
3908 # ;; calculate offset to the right hash key
# ;; IA0 = remaining length rounded down to a 16-byte multiple; subtracting it
# ;; from the hash-key table offset selects the hash-key power matching the
# ;; number of blocks still to be hashed.
3910 mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}
3911 and \$~15,@{[DWORD($IA0)]}
3912 mov \$`@{[HashKeyOffsetByIdx(32,"frame")]}`,@{[DWORD($HASHK_PTR)]}
3913 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3916 # ;; ==== GHASH 32 blocks and follow with reduction
3917 &GHASH_16("start", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (0 * 16),
3918 "%rsp", $HASHK_PTR, 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3920 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder
3921 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3922 $code .= "add \$`(16 * 16)`,@{[DWORD($HASHK_PTR)]}\n";
3924 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
3925 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
3926 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3927 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
3928 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
3929 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
3930 "mid", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
3931 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
# Counter back to LE byte order before it is stored to the context.
3933 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
3934 $code .= "jmp .L_ghash_done_${label_suffix}\n";
3936 # ;; =====================================================
3937 # ;; =====================================================
3938 # ;; ==== GHASH & encrypt 1 x 16 blocks
3939 # ;; ==== GHASH & encrypt 1 x 16 blocks
3940 # ;; ==== GHASH 1 x 16 blocks (reduction)
3941 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
3942 # ;; ==== then GHASH N blocks
# Tail path: 32..47 blocks remain after the big loop.
3943 $code .= ".L_encrypt_32_blocks_${label_suffix}:\n";
3945 # ;; ==== AES-CTR + GHASH - 16 blocks, start
3946 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
3947 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3948 $data_in_out_offset = (0 * 16);
3949 &GHASH_16_ENCRYPT_16_PARALLEL(
3950 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3951 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3952 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3953 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3954 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3955 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3956 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
3959 # ;; ==== AES-CTR + GHASH - 16 blocks, no reduction
3960 $aesout_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3961 $ghashin_offset = ($STACK_LOCAL_OFFSET + (16 * 16));
3962 $data_in_out_offset = (16 * 16);
3963 &GHASH_16_ENCRYPT_16_PARALLEL(
3964 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
3965 32, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
3966 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
3967 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
3968 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
3969 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
3970 $GH, $GM, "no_reduction", $ENC_DEC, $data_in_out_offset, "no_ghash_in",
3973 # ;; ==== GHASH 16 blocks with reduction
3975 "end_reduce", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (32 * 16),
3976 "%rsp", &HashKeyOffsetByIdx(16, "frame"),
3977 0, $AAD_HASHz, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
3979 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder
3980 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
3982 sub \$`(32 * 16)`,$LENGTH
3983 add \$`(32 * 16)`,$DATA_OFFSET
3986 # ;; calculate offset to the right hash key
# ;; Same hash-key offset trick as the 0-blocks path: round the remainder down
# ;; to whole blocks and step back from the 16th key power.
3987 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
3989 and \$~15,@{[DWORD($IA0)]}
3990 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
3991 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
3994 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
3995 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
3996 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
3997 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
3998 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
3999 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4000 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
4001 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
# Counter back to LE byte order before it is stored to the context.
4003 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4004 $code .= "jmp .L_ghash_done_${label_suffix}\n";
4006 # ;; =====================================================
4007 # ;; =====================================================
4008 # ;; ==== GHASH & encrypt 16 blocks (done before)
4009 # ;; ==== GHASH 1 x 16 blocks
4010 # ;; ==== GHASH 1 x 16 blocks (reduction) & encrypt N blocks
4011 # ;; ==== then GHASH N blocks
# Tail path: 16..31 blocks remain after the big loop.
4012 $code .= ".L_encrypt_16_blocks_${label_suffix}:\n";
4014 # ;; ==== AES-CTR + GHASH - 16 blocks, start
4015 $aesout_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4016 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4017 $data_in_out_offset = (0 * 16);
4018 &GHASH_16_ENCRYPT_16_PARALLEL(
4019 $AES_KEYS, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $CTR_BLOCKz, $CTR_CHECK,
4020 48, $aesout_offset, $ghashin_offset, $SHUF_MASK, $ZTMP0, $ZTMP1,
4021 $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7,
4022 $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12, $ZTMP13,
4023 $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18, $ZTMP19,
4024 $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234, $GL,
4025 $GH, $GM, "first_time", $ENC_DEC, $data_in_out_offset, $AAD_HASHz,
4028 # ;; ==== GHASH 1 x 16 blocks
4030 "mid", $GH, $GM, $GL, "%rsp", $STACK_LOCAL_OFFSET, (16 * 16),
4031 "%rsp", &HashKeyOffsetByIdx(32, "frame"),
4032 0, "no_hash_input", $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8, $ZTMP9);
4034 # ;; ==== GHASH 1 x 16 blocks with reduction + cipher and ghash on the reminder
4035 $ghashin_offset = ($STACK_LOCAL_OFFSET + (32 * 16));
4037 sub \$`(16 * 16)`,$LENGTH
4038 add \$`(16 * 16)`,$DATA_OFFSET
4041 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN,
4042 $DATA_OFFSET, $LENGTH, $CTR_BLOCKz, $CTR_CHECK,
4043 &HashKeyOffsetByIdx(16, "frame"), $ghashin_offset, $SHUF_MASK, $ZTMP0,
4044 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4,
4045 $ZTMP5, $ZTMP6, $ZTMP7, $ZTMP8,
4046 $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4047 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16,
4048 $ZTMP17, $ZTMP18, $ZTMP19, $ZTMP20,
4049 $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4050 "end_reduce", $GL, $GH, $GM,
4051 $ENC_DEC, $AAD_HASHz, $IA0, $IA5,
4052 $MASKREG, $PBLOCK_LEN);
# Counter back to LE byte order before it is stored to the context.
4054 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4056 jmp .L_ghash_done_${label_suffix}
4058 .L_message_below_32_blocks_${label_suffix}:
4059 # ;; 32 > number of blocks > 16
# Only the first 16 blocks were ciphered before branching here.
4061 sub \$`(16 * 16)`,$LENGTH
4062 add \$`(16 * 16)`,$DATA_OFFSET
4064 $ghashin_offset = ($STACK_LOCAL_OFFSET + (0 * 16));
4066 # ;; calculate offset to the right hash key
4067 $code .= "mov @{[DWORD($LENGTH)]},@{[DWORD($IA0)]}\n";
# Hash keys may not have been fully precomputed on this short path.
4069 &precompute_hkeys_on_stack($GCM128_CTX, $HKEYS_READY, $ZTMP0, $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4071 $code .= "mov \$1,$HKEYS_READY\n";
4074 and \$~15,@{[DWORD($IA0)]}
4075 mov \$`@{[HashKeyOffsetByIdx(16,"frame")]}`,@{[DWORD($HASHK_PTR)]}
4076 sub @{[DWORD($IA0)]},@{[DWORD($HASHK_PTR)]}
4080 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $DATA_OFFSET, $LENGTH,
4081 $CTR_BLOCKz, $CTR_CHECK, $HASHK_PTR, $ghashin_offset, $SHUF_MASK, $ZTMP0,
4082 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4083 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4084 $ZTMP13, $ZTMP14, $ZTMP15, $ZTMP16, $ZTMP17, $ZTMP18,
4085 $ZTMP19, $ZTMP20, $ZTMP21, $ZTMP22, $ADDBE_4x4, $ADDBE_1234,
4086 "start", $GL, $GH, $GM, $ENC_DEC, $AAD_HASHz,
4087 $IA0, $IA5, $MASKREG, $PBLOCK_LEN);
4089 $code .= "vpshufb @{[XWORD($SHUF_MASK)]},$CTR_BLOCKx,$CTR_BLOCKx\n";
4091 jmp .L_ghash_done_${label_suffix}
4093 .L_message_below_equal_16_blocks_${label_suffix}:
4094 # ;; Determine how many blocks to process
4095 # ;; - process one additional block if there is a partial block
# ;; IA1 = ceil(LENGTH / 16)
4096 mov @{[DWORD($LENGTH)]},@{[DWORD($IA1)]}
4097 add \$15,@{[DWORD($IA1)]}
4098 shr \$4, @{[DWORD($IA1)]} # ; $IA1 can be in the range from 0 to 16
4101 $AES_KEYS, $GCM128_CTX, $CIPH_PLAIN_OUT, $PLAIN_CIPH_IN, $PLAIN_CIPH_LEN, $ENC_DEC,
4102 $DATA_OFFSET, $LENGTH, $IA1, $CTR_BLOCKx, $AAD_HASHx, $ZTMP0,
4103 $ZTMP1, $ZTMP2, $ZTMP3, $ZTMP4, $ZTMP5, $ZTMP6,
4104 $ZTMP7, $ZTMP8, $ZTMP9, $ZTMP10, $ZTMP11, $ZTMP12,
4105 $ZTMP13, $ZTMP14, $IA0, $IA3, $MASKREG, $SHUF_MASK,
4108 # ;; fall through to exit
# Common epilogue: persist the updated counter and GHASH state so a later
# update/finalize call can resume from the context.
4110 $code .= ".L_ghash_done_${label_suffix}:\n";
4112 # ;; save the last counter block
4113 $code .= "vmovdqu64 $CTR_BLOCKx,`$CTX_OFFSET_CurCount`($GCM128_CTX)\n";
4115 vmovdqu64 $AAD_HASHx,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4116 .L_enc_dec_done_${label_suffix}:
4120 # ;;; ===========================================================================
4121 # ;;; Encrypt/decrypt the initial 16 blocks
# Cipher 16 AES-CTR blocks (no hashing), store the byte-reflected ciphertext
# to the stack frame for later GHASH, and write plaintext/ciphertext out.
4122 sub INITIAL_BLOCKS_16 {
4123 my $IN = $_[0]; # [in] input buffer
4124 my $OUT = $_[1]; # [in] output buffer
4125 my $AES_KEYS = $_[2]; # [in] pointer to expanded keys
4126 my $DATA_OFFSET = $_[3]; # [in] data offset
4127 my $GHASH = $_[4]; # [in] ZMM with AAD (low 128 bits)
4128 my $CTR = $_[5]; # [in] ZMM with CTR BE blocks 4x128 bits
4129 my $CTR_CHECK = $_[6]; # [in/out] GPR with counter overflow check
4130 my $ADDBE_4x4 = $_[7]; # [in] ZMM 4x128bits with value 4 (big endian)
4131 my $ADDBE_1234 = $_[8]; # [in] ZMM 4x128bits with values 1, 2, 3 & 4 (big endian)
4132 my $T0 = $_[9]; # [clobered] temporary ZMM register
4133 my $T1 = $_[10]; # [clobered] temporary ZMM register
4134 my $T2 = $_[11]; # [clobered] temporary ZMM register
4135 my $T3 = $_[12]; # [clobered] temporary ZMM register
4136 my $T4 = $_[13]; # [clobered] temporary ZMM register
4137 my $T5 = $_[14]; # [clobered] temporary ZMM register
4138 my $T6 = $_[15]; # [clobered] temporary ZMM register
4139 my $T7 = $_[16]; # [clobered] temporary ZMM register
4140 my $T8 = $_[17]; # [clobered] temporary ZMM register
4141 my $SHUF_MASK = $_[18]; # [in] ZMM with BE/LE shuffle mask
4142 my $ENC_DEC = $_[19]; # [in] ENC (encrypt) or DEC (decrypt) selector
4143 my $BLK_OFFSET = $_[20]; # [in] stack frame offset to ciphered blocks
4144 my $DATA_DISPL = $_[21]; # [in] fixed numerical data displacement/offset
4145 my $IA0 = $_[22]; # [clobered] temporary GP register
4152 my $label_suffix = $label_count++;
4154 my $stack_offset = $BLK_OFFSET;
4156 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4157 # ;; prepare counter blocks
# ;; Fast path: if the low counter byte cannot wrap while adding 16, increment
# ;; in BE form directly; otherwise fall into the overflow path which shuffles
# ;; to LE, does full 32-bit adds and shuffles back.
4159 cmpb \$`(256 - 16)`,@{[BYTE($CTR_CHECK)]}
4160 jae .L_next_16_overflow_${label_suffix}
4161 vpaddd $ADDBE_1234,$CTR,$B00_03
4162 vpaddd $ADDBE_4x4,$B00_03,$B04_07
4163 vpaddd $ADDBE_4x4,$B04_07,$B08_11
4164 vpaddd $ADDBE_4x4,$B08_11,$B12_15
4165 jmp .L_next_16_ok_${label_suffix}
4166 .L_next_16_overflow_${label_suffix}:
4167 vpshufb $SHUF_MASK,$CTR,$CTR
4168 vmovdqa64 ddq_add_4444(%rip),$B12_15
4169 vpaddd ddq_add_1234(%rip),$CTR,$B00_03
4170 vpaddd $B12_15,$B00_03,$B04_07
4171 vpaddd $B12_15,$B04_07,$B08_11
4172 vpaddd $B12_15,$B08_11,$B12_15
4173 vpshufb $SHUF_MASK,$B00_03,$B00_03
4174 vpshufb $SHUF_MASK,$B04_07,$B04_07
4175 vpshufb $SHUF_MASK,$B08_11,$B08_11
4176 vpshufb $SHUF_MASK,$B12_15,$B12_15
4177 .L_next_16_ok_${label_suffix}:
# ;; Keep the highest counter block as the new CTR for the next 16 blocks.
4178 vshufi64x2 \$0b11111111,$B12_15,$B12_15,$CTR
4179 addb \$16,@{[BYTE($CTR_CHECK)]}
4180 # ;; === load 16 blocks of data
4181 vmovdqu8 `$DATA_DISPL + (64*0)`($IN,$DATA_OFFSET,1),$T0
4182 vmovdqu8 `$DATA_DISPL + (64*1)`($IN,$DATA_OFFSET,1),$T1
4183 vmovdqu8 `$DATA_DISPL + (64*2)`($IN,$DATA_OFFSET,1),$T2
4184 vmovdqu8 `$DATA_DISPL + (64*3)`($IN,$DATA_OFFSET,1),$T3
4186 # ;; move to AES encryption rounds
# ;; Round 0: whitening key xor across all 16 counter blocks.
4187 vbroadcastf64x2 `(16*0)`($AES_KEYS),$T4
4188 vpxorq $T4,$B00_03,$B00_03
4189 vpxorq $T4,$B04_07,$B04_07
4190 vpxorq $T4,$B08_11,$B08_11
4191 vpxorq $T4,$B12_15,$B12_15
# Middle rounds, unrolled at generation time for the current key size.
4193 foreach (1 .. ($NROUNDS)) {
4195 vbroadcastf64x2 `(16*$_)`($AES_KEYS),$T4
4196 vaesenc $T4,$B00_03,$B00_03
4197 vaesenc $T4,$B04_07,$B04_07
4198 vaesenc $T4,$B08_11,$B08_11
4199 vaesenc $T4,$B12_15,$B12_15
4203 vbroadcastf64x2 `(16*($NROUNDS+1))`($AES_KEYS),$T4
4204 vaesenclast $T4,$B00_03,$B00_03
4205 vaesenclast $T4,$B04_07,$B04_07
4206 vaesenclast $T4,$B08_11,$B08_11
4207 vaesenclast $T4,$B12_15,$B12_15
4209 # ;; xor against text
4210 vpxorq $T0,$B00_03,$B00_03
4211 vpxorq $T1,$B04_07,$B04_07
4212 vpxorq $T2,$B08_11,$B08_11
4213 vpxorq $T3,$B12_15,$B12_15
4217 vmovdqu8 $B00_03,`$DATA_DISPL + (64*0)`($IA0,$DATA_OFFSET,1)
4218 vmovdqu8 $B04_07,`$DATA_DISPL + (64*1)`($IA0,$DATA_OFFSET,1)
4219 vmovdqu8 $B08_11,`$DATA_DISPL + (64*2)`($IA0,$DATA_OFFSET,1)
4220 vmovdqu8 $B12_15,`$DATA_DISPL + (64*3)`($IA0,$DATA_OFFSET,1)
4222 if ($ENC_DEC eq "DEC") {
4224 # ;; decryption - cipher text needs to go to GHASH phase
# For DEC the GHASH input is the loaded ciphertext ($T0..$T3), byte-reflected.
4225 vpshufb $SHUF_MASK,$T0,$B00_03
4226 vpshufb $SHUF_MASK,$T1,$B04_07
4227 vpshufb $SHUF_MASK,$T2,$B08_11
4228 vpshufb $SHUF_MASK,$T3,$B12_15
# For ENC the GHASH input is the just-produced ciphertext, byte-reflected.
4233 vpshufb $SHUF_MASK,$B00_03,$B00_03
4234 vpshufb $SHUF_MASK,$B04_07,$B04_07
4235 vpshufb $SHUF_MASK,$B08_11,$B08_11
4236 vpshufb $SHUF_MASK,$B12_15,$B12_15
4240 if ($GHASH ne "no_ghash") {
4242 # ;; === xor cipher block 0 with GHASH for the next GHASH round
4243 vpxorq $GHASH,$B00_03,$B00_03
# Park the GHASH-ready blocks on the stack frame for the hashing stages.
4247 vmovdqa64 $B00_03,`$stack_offset + (0 * 64)`(%rsp)
4248 vmovdqa64 $B04_07,`$stack_offset + (1 * 64)`(%rsp)
4249 vmovdqa64 $B08_11,`$stack_offset + (2 * 64)`(%rsp)
4250 vmovdqa64 $B12_15,`$stack_offset + (3 * 64)`(%rsp)
4254 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4255 # ; GCM_COMPLETE Finishes ghash calculation
4256 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
# Finalize GHASH: fold in any outstanding partial block, hash the
# len(A)||len(C) block, and xor with E(K,Y0) to produce the tag state.
4258 my $GCM128_CTX = $_[0];
4259 my $PBLOCK_LEN = $_[1];
4261 my $label_suffix = $label_count++;
# xmm2 = HashKey (H<<1 mod poly), xmm3 = E(K,Y0), xmm4 = current GHASH.
4264 vmovdqu @{[HashKeyByIdx(1,$GCM128_CTX)]},%xmm2
4265 vmovdqu $CTX_OFFSET_EK0($GCM128_CTX),%xmm3 # ; xmm3 = E(K,Y0)
4269 vmovdqu `$CTX_OFFSET_AadHash`($GCM128_CTX),%xmm4
4271 # ;; Process the final partial block.
4273 je .L_partial_done_${label_suffix}
4276 # ;GHASH computation for the last <16 Byte block
4277 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4280 .L_partial_done_${label_suffix}:
# Build the length block: low qword = bit length of ciphertext, high qword =
# bit length of AAD (vpsllq by 3 converts bytes to bits).
4281 vmovq `$CTX_OFFSET_InLen`($GCM128_CTX), %xmm5
4282 vpinsrq \$1, `$CTX_OFFSET_AadLen`($GCM128_CTX), %xmm5, %xmm5 # ; xmm5 = len(A)||len(C)
4283 vpsllq \$3, %xmm5, %xmm5 # ; convert bytes into bits
4285 vpxor %xmm5,%xmm4,%xmm4
4288 &GHASH_MUL("%xmm4", "%xmm2", "%xmm0", "%xmm16", "%xmm17");
4291 vpshufb SHUF_MASK(%rip),%xmm4,%xmm4 # ; perform a 16Byte swap
4292 vpxor %xmm4,%xmm3,%xmm3
4294 .L_return_T_${label_suffix}:
4295 vmovdqu %xmm3,`$CTX_OFFSET_AadHash`($GCM128_CTX)
4299 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4300 # ;;; Functions definitions
4301 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4305 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4306 # ;void ossl_aes_gcm_init_avx512 /
4307 # ; (const void *aes_keys,
4308 # ; void *gcm128ctx)
4310 # ; Precomputes hashkey table for GHASH optimization.
4311 # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4312 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4314 .globl ossl_aes_gcm_init_avx512
4315 .type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4317 ossl_aes_gcm_init_avx512:
# Optional NULL-pointer argument checks (guarded by build-time flag).
4321 if ($CHECK_FUNCTION_ARGUMENTS) {
4323 # ;; Check aes_keys != NULL
4327 # ;; Check gcm128ctx != NULL
# H = E(K, 0^128): encrypt the all-zero block to obtain the hash key.
4332 $code .= "vpxorq %xmm16,%xmm16,%xmm16\n";
4333 &ENCRYPT_SINGLE_BLOCK("$arg1", "%xmm16", "%rax"); # ; xmm16 = HashKey
4335 vpshufb SHUF_MASK(%rip),%xmm16,%xmm16
4336 # ;;; PRECOMPUTATION of HashKey<<1 mod poly from the HashKey ;;;
# Left-shift H by one bit across the 128-bit lane (shift each qword, carry the
# bit that crossed the qword boundary via xmm2), then reduce modulo the GCM
# polynomial if the top bit was set.
4337 vmovdqa64 %xmm16,%xmm2
4338 vpsllq \$1,%xmm16,%xmm16
4339 vpsrlq \$63,%xmm2,%xmm2
4341 vpslldq \$8,%xmm2,%xmm2
4342 vpsrldq \$8,%xmm1,%xmm1
4343 vporq %xmm2,%xmm16,%xmm16
4345 vpshufd \$0b00100100,%xmm1,%xmm2
4346 vpcmpeqd TWOONE(%rip),%xmm2,%xmm2
4347 vpand POLY(%rip),%xmm2,%xmm2
4348 vpxorq %xmm2,%xmm16,%xmm16 # ; xmm16 holds the HashKey<<1 mod poly
4349 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4350 vmovdqu64 %xmm16,@{[HashKeyByIdx(1,$arg2)]} # ; store HashKey<<1 mod poly
# Precompute the remaining hash-key powers into the context.
4352 &PRECOMPUTE("$arg2", "%xmm16", "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4353 if ($CLEAR_SCRATCH_REGISTERS) {
4354 &clear_scratch_gps_asm();
4355 &clear_scratch_zmms_asm();
4357 $code .= "vzeroupper\n";
4363 .size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
4367 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4368 # ;void ossl_aes_gcm_setiv_avx512
4369 # ; (const void *aes_keys,
4370 # ; void *gcm128ctx,
4371 # ; const unsigned char *iv,
4374 # ; Computes E(K,Y0) for finalization, updates current counter Yi in gcm128_context structure.
4375 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4377 .globl ossl_aes_gcm_setiv_avx512
4378 .type ossl_aes_gcm_setiv_avx512,\@abi-omnipotent
4380 ossl_aes_gcm_setiv_avx512:
# Optional NULL/zero argument checks; performed before the prologue so they
# must not clobber any register the prologue saves.
4385 if ($CHECK_FUNCTION_ARGUMENTS) {
4387 # ;; Check aes_keys != NULL
4391 # ;; Check gcm128ctx != NULL
4395 # ;; Check iv != NULL
4399 # ;; Check ivlen != 0
4405 # ; NOTE: code before PROLOG() must not modify any registers
# Prologue: hash-key stack space only (no AES block staging area needed here).
4407 1, # allocate stack space for hkeys
4408 0, # do not allocate stack space for AES blocks
4411 "$arg1", "$arg2", "$arg3", "$arg4", "%r10", "%r11", "%r12", "%k1", "%xmm2", "%zmm1",
4412 "%zmm11", "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12",
4413 "%zmm13", "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4415 1, # hkeys were allocated
4422 .size ossl_aes_gcm_setiv_avx512, .-ossl_aes_gcm_setiv_avx512
4425 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4426 # ;void ossl_aes_gcm_update_aad_avx512
4427 # ; (unsigned char *gcm128ctx,
4428 # ; const unsigned char *aad,
4431 # ; Updates AAD hash in gcm128_context structure.
4432 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4434 .globl ossl_aes_gcm_update_aad_avx512
4435 .type ossl_aes_gcm_update_aad_avx512,\@abi-omnipotent
4437 ossl_aes_gcm_update_aad_avx512:
# Optional NULL/zero argument checks; bail out without touching state.
4442 if ($CHECK_FUNCTION_ARGUMENTS) {
4444 # ;; Check gcm128ctx != NULL
4446 jz .Lexit_update_aad
4448 # ;; Check aad != NULL
4450 jz .Lexit_update_aad
4452 # ;; Check aadlen != 0
4454 jz .Lexit_update_aad
4458 # ; NOTE: code before PROLOG() must not modify any registers
# Prologue: hash-key stack space only (AAD hashing does no block ciphering).
4460 1, # allocate stack space for hkeys,
4461 0, # do not allocate stack space for AES blocks
4464 "$arg1", "$arg2", "$arg3", "%r10", "%r11", "%r12", "%k1", "%xmm14", "%zmm1", "%zmm11",
4465 "%zmm3", "%zmm4", "%zmm5", "%zmm6", "%zmm7", "%zmm8", "%zmm9", "%zmm10", "%zmm12", "%zmm13",
4466 "%zmm15", "%zmm16", "%zmm17", "%zmm18", "%zmm19");
4468 1, # hkeys were allocated
4475 .size ossl_aes_gcm_update_aad_avx512, .-ossl_aes_gcm_update_aad_avx512
4478 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4479 # ;void ossl_aes_gcm_encrypt_avx512
4480 # ; (const void* aes_keys,
4481 # ; void *gcm128ctx,
4482 # ; unsigned int *pblocklen,
4483 # ; const unsigned char *in,
4485 # ; unsigned char *out);
4487 # ; Performs encryption of data |in| of len |len|, and stores the output in |out|.
4488 # ; Stores encrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4489 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4491 .globl ossl_aes_gcm_encrypt_avx512
4492 .type ossl_aes_gcm_encrypt_avx512,\@abi-omnipotent
4494 ossl_aes_gcm_encrypt_avx512:
4496 .Lencrypt_seh_begin:
4500 # ; NOTE: code before PROLOG() must not modify any registers
# Prologue: stack space for both the hash-key table and AES block staging.
4502 1, # allocate stack space for hkeys
4503 1, # allocate stack space for AES blocks
4505 if ($CHECK_FUNCTION_ARGUMENTS) {
4507 # ;; Check aes_keys != NULL
4509 jz .Lexit_gcm_encrypt
4511 # ;; Check gcm128ctx != NULL
4513 jz .Lexit_gcm_encrypt
4515 # ;; Check pblocklen != NULL
4517 jz .Lexit_gcm_encrypt
4519 # ;; Check in != NULL
4521 jz .Lexit_gcm_encrypt
4523 # ;; Check if len != 0
4525 jz .Lexit_gcm_encrypt
4527 # ;; Check out != NULL
4529 jz .Lexit_gcm_encrypt
4533 # ; load number of rounds from AES_KEY structure (offset in bytes is
4534 # ; size of the |rd_key| buffer)
# Dispatch to the key-size-specific body generated below (128/192/256).
4535 mov `4*15*4`($arg1),%eax
4537 je .Laes_gcm_encrypt_128_avx512
4539 je .Laes_gcm_encrypt_192_avx512
4541 je .Laes_gcm_encrypt_256_avx512
4543 jmp .Lexit_gcm_encrypt
# Generate one full GCM_ENC_DEC expansion per supported key length; $NROUNDS
# parameterizes the AES round unrolling inside the expansion.
4545 for my $keylen (sort keys %aes_rounds) {
4546 $NROUNDS = $aes_rounds{$keylen};
4549 .Laes_gcm_encrypt_${keylen}_avx512:
4551 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "ENC");
4552 $code .= "jmp .Lexit_gcm_encrypt\n";
4554 $code .= ".Lexit_gcm_encrypt:\n";
4560 .size ossl_aes_gcm_encrypt_avx512, .-ossl_aes_gcm_encrypt_avx512
4563 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4564 # ;void ossl_aes_gcm_decrypt_avx512
4565 # ; (const void* keys,
4566 # ; void *gcm128ctx,
4567 # ; unsigned int *pblocklen,
4568 # ; const unsigned char *in,
4570 # ; unsigned char *out);
4572 # ; Performs decryption of data |in| of len |len|, and stores the output in |out|.
4573 # ; Stores decrypted partial block (if any) in gcm128ctx and its length in |pblocklen|.
4574 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4576 .globl ossl_aes_gcm_decrypt_avx512
4577 .type ossl_aes_gcm_decrypt_avx512,\@abi-omnipotent
4579 ossl_aes_gcm_decrypt_avx512:
4581 .Ldecrypt_seh_begin:
4585 # ; NOTE: code before PROLOG() must not modify any registers
# Prologue: stack space for both the hash-key table and AES block staging.
4587 1, # allocate stack space for hkeys
4588 1, # allocate stack space for AES blocks
4590 if ($CHECK_FUNCTION_ARGUMENTS) {
4592 # ;; Check keys != NULL
4594 jz .Lexit_gcm_decrypt
4596 # ;; Check gcm128ctx != NULL
4598 jz .Lexit_gcm_decrypt
4600 # ;; Check pblocklen != NULL
4602 jz .Lexit_gcm_decrypt
4604 # ;; Check in != NULL
4606 jz .Lexit_gcm_decrypt
4608 # ;; Check if len != 0
4610 jz .Lexit_gcm_decrypt
4612 # ;; Check out != NULL
4614 jz .Lexit_gcm_decrypt
4618 # ; load number of rounds from AES_KEY structure (offset in bytes is
4619 # ; size of the |rd_key| buffer)
# Dispatch to the key-size-specific body generated below (128/192/256).
4620 mov `4*15*4`($arg1),%eax
4622 je .Laes_gcm_decrypt_128_avx512
4624 je .Laes_gcm_decrypt_192_avx512
4626 je .Laes_gcm_decrypt_256_avx512
4628 jmp .Lexit_gcm_decrypt
# Mirror of the encrypt dispatcher: one GCM_ENC_DEC expansion per key length,
# with the direction selector set to "DEC".
4630 for my $keylen (sort keys %aes_rounds) {
4631 $NROUNDS = $aes_rounds{$keylen};
4634 .Laes_gcm_decrypt_${keylen}_avx512:
4636 &GCM_ENC_DEC("$arg1", "$arg2", "$arg3", "$arg4", "$arg5", "$arg6", "DEC");
4637 $code .= "jmp .Lexit_gcm_decrypt\n";
4639 $code .= ".Lexit_gcm_decrypt:\n";
4645 .size ossl_aes_gcm_decrypt_avx512, .-ossl_aes_gcm_decrypt_avx512
4648 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4649 # ;void ossl_aes_gcm_finalize_avx512
4650 # ; (void *gcm128ctx,
4651 # ; unsigned int pblocklen);
4653 # ; Finalizes encryption / decryption
4654 # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4655 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4657 .globl ossl_aes_gcm_finalize_avx512
4658 .type ossl_aes_gcm_finalize_avx512,\@abi-omnipotent
4660 ossl_aes_gcm_finalize_avx512:
4664 if ($CHECK_FUNCTION_ARGUMENTS) {
4666 # ;; Check gcm128ctx != NULL
4672 &GCM_COMPLETE("$arg1", "$arg2");
4678 .size ossl_aes_gcm_finalize_avx512, .-ossl_aes_gcm_finalize_avx512
4681 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4682 # ;void ossl_gcm_gmult_avx512(u64 Xi[2],
4683 # ; const void* gcm128ctx)
4685 # ; Leaf function (does not allocate stack space, does not use non-volatile registers).
4686 # ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
4688 .globl ossl_gcm_gmult_avx512
4689 .hidden ossl_gcm_gmult_avx512
4690 .type ossl_gcm_gmult_avx512,\@abi-omnipotent
4692 ossl_gcm_gmult_avx512:
4696 if ($CHECK_FUNCTION_ARGUMENTS) {
4698 # ;; Check Xi != NULL
4702 # ;; Check gcm128ctx != NULL
4707 $code .= "vmovdqu64 ($arg1),%xmm1\n";
4708 $code .= "vmovdqu64 @{[HashKeyByIdx(1,$arg2)]},%xmm2\n";
4710 &GHASH_MUL("%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5");
4712 $code .= "vmovdqu64 %xmm1,($arg1)\n";
4713 if ($CLEAR_SCRATCH_REGISTERS) {
4714 &clear_scratch_gps_asm();
4715 &clear_scratch_zmms_asm();
4717 $code .= "vzeroupper\n";
4723 .size ossl_gcm_gmult_avx512, .-ossl_gcm_gmult_avx512
4728 # Add unwind metadata for SEH.
4730 # See https://docs.microsoft.com/en-us/cpp/build/exception-handling-x64?view=msvc-160
4731 my $UWOP_PUSH_NONVOL = 0;
4732 my $UWOP_ALLOC_LARGE = 1;
4733 my $UWOP_SET_FPREG = 3;
4734 my $UWOP_SAVE_XMM128 = 8;
4735 my %UWOP_REG_NUMBER = (
4744 map(("r$_" => $_), (8 .. 15)));
4749 .rva .Lsetiv_seh_begin
4750 .rva .Lsetiv_seh_end
4751 .rva .Lsetiv_seh_info
4753 .rva .Lghash_seh_begin
4754 .rva .Lghash_seh_end
4755 .rva .Lghash_seh_info
4757 .rva .Lencrypt_seh_begin
4758 .rva .Lencrypt_seh_end
4759 .rva .Lencrypt_seh_info
4761 .rva .Ldecrypt_seh_begin
4762 .rva .Ldecrypt_seh_end
4763 .rva .Ldecrypt_seh_info
4768 foreach my $func_name ("setiv", "ghash", "encrypt", "decrypt") {
4771 .L${func_name}_seh_info:
4772 .byte 1 # version 1, no flags
4773 .byte .L${func_name}_seh_prolog_end-.L${func_name}_seh_begin
4774 .byte 31 # num_slots = 1*8 + 2 + 1 + 2*10
4775 # FR = rbp; offset from RSP = $XMM_STORAGE, scaled by 16
4776 .byte @{[$UWOP_REG_NUMBER{rbp} | (($XMM_STORAGE / 16 ) << 4)]}
4779 # Metadata for %xmm15-%xmm6
4780 # Occupy 2 slots each
4781 for (my $reg_idx = 15; $reg_idx >= 6; $reg_idx--) {
4783 # Scaled-by-16 stack offset
4784 my $xmm_reg_offset = ($reg_idx - 6);
4786 .byte .L${func_name}_seh_save_xmm${reg_idx}-.L${func_name}_seh_begin
4787 .byte @{[$UWOP_SAVE_XMM128 | (${reg_idx} << 4)]}
4788 .value $xmm_reg_offset
4793 # Frame pointer (occupy 1 slot)
4794 .byte .L${func_name}_seh_setfp-.L${func_name}_seh_begin
4795 .byte $UWOP_SET_FPREG
4797 # Occupy 2 slots, as stack allocation < 512K, but > 128 bytes
4798 .byte .L${func_name}_seh_allocstack_xmm-.L${func_name}_seh_begin
4799 .byte $UWOP_ALLOC_LARGE
4800 .value `($XMM_STORAGE + 8) / 8`
4803 # Metadata for GPR regs
4804 # Occupy 1 slot each
4805 foreach my $reg ("rsi", "rdi", "r15", "r14", "r13", "r12", "rbp", "rbx") {
4807 .byte .L${func_name}_seh_push_${reg}-.L${func_name}_seh_begin
4808 .byte @{[$UWOP_PUSH_NONVOL | ($UWOP_REG_NUMBER{$reg} << 4)]}
4817 POLY: .quad 0x0000000000000001, 0xC200000000000000
4821 .quad 0x00000001C2000000, 0xC200000000000000
4822 .quad 0x00000001C2000000, 0xC200000000000000
4823 .quad 0x00000001C2000000, 0xC200000000000000
4824 .quad 0x00000001C2000000, 0xC200000000000000
4827 TWOONE: .quad 0x0000000000000001, 0x0000000100000000
4829 # ;;; Order of these constants should not change.
4830 # ;;; More specifically, ALL_F should follow SHIFT_MASK, and ZERO should follow ALL_F
4833 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4834 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4835 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4836 .quad 0x08090A0B0C0D0E0F, 0x0001020304050607
4840 .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908
4843 .quad 0xffffffffffffffff, 0xffffffffffffffff
4846 .quad 0x0000000000000000, 0x0000000000000000
4850 .quad 0x0000000000000001, 0x0000000000000000
4854 .quad 0x0000000000000000, 0x0100000000000000
4858 .quad 0x0000000000000001, 0x0000000000000000
4859 .quad 0x0000000000000002, 0x0000000000000000
4860 .quad 0x0000000000000003, 0x0000000000000000
4861 .quad 0x0000000000000004, 0x0000000000000000
4865 .quad 0x0000000000000005, 0x0000000000000000
4866 .quad 0x0000000000000006, 0x0000000000000000
4867 .quad 0x0000000000000007, 0x0000000000000000
4868 .quad 0x0000000000000008, 0x0000000000000000
4872 .quad 0x0000000000000004, 0x0000000000000000
4873 .quad 0x0000000000000004, 0x0000000000000000
4874 .quad 0x0000000000000004, 0x0000000000000000
4875 .quad 0x0000000000000004, 0x0000000000000000
4879 .quad 0x0000000000000008, 0x0000000000000000
4880 .quad 0x0000000000000008, 0x0000000000000000
4881 .quad 0x0000000000000008, 0x0000000000000000
4882 .quad 0x0000000000000008, 0x0000000000000000
4886 .quad 0x0000000000000000, 0x0100000000000000
4887 .quad 0x0000000000000000, 0x0200000000000000
4888 .quad 0x0000000000000000, 0x0300000000000000
4889 .quad 0x0000000000000000, 0x0400000000000000
4893 .quad 0x0000000000000000, 0x0400000000000000
4894 .quad 0x0000000000000000, 0x0400000000000000
4895 .quad 0x0000000000000000, 0x0400000000000000
4896 .quad 0x0000000000000000, 0x0400000000000000
4899 byte_len_to_mask_table:
4900 .value 0x0000, 0x0001, 0x0003, 0x0007
4901 .value 0x000f, 0x001f, 0x003f, 0x007f
4902 .value 0x00ff, 0x01ff, 0x03ff, 0x07ff
4903 .value 0x0fff, 0x1fff, 0x3fff, 0x7fff
4907 byte64_len_to_mask_table:
# ;; Entry n (for n = 0 .. 64) is a 64-bit value with the n least-significant
# ;; bits set, i.e. (1 << n) - 1. Indexed by a byte count to build a k-mask
# ;; for masked ZMM loads/stores of partial 64-byte blocks.
4908 .quad 0x0000000000000000, 0x0000000000000001
4909 .quad 0x0000000000000003, 0x0000000000000007
4910 .quad 0x000000000000000f, 0x000000000000001f
4911 .quad 0x000000000000003f, 0x000000000000007f
4912 .quad 0x00000000000000ff, 0x00000000000001ff
4913 .quad 0x00000000000003ff, 0x00000000000007ff
4914 .quad 0x0000000000000fff, 0x0000000000001fff
4915 .quad 0x0000000000003fff, 0x0000000000007fff
4916 .quad 0x000000000000ffff, 0x000000000001ffff
4917 .quad 0x000000000003ffff, 0x000000000007ffff
4918 .quad 0x00000000000fffff, 0x00000000001fffff
4919 .quad 0x00000000003fffff, 0x00000000007fffff
4920 .quad 0x0000000000ffffff, 0x0000000001ffffff
4921 .quad 0x0000000003ffffff, 0x0000000007ffffff
4922 .quad 0x000000000fffffff, 0x000000001fffffff
4923 .quad 0x000000003fffffff, 0x000000007fffffff
4924 .quad 0x00000000ffffffff, 0x00000001ffffffff
4925 .quad 0x00000003ffffffff, 0x00000007ffffffff
4926 .quad 0x0000000fffffffff, 0x0000001fffffffff
4927 .quad 0x0000003fffffffff, 0x0000007fffffffff
4928 .quad 0x000000ffffffffff, 0x000001ffffffffff
4929 .quad 0x000003ffffffffff, 0x000007ffffffffff
4930 .quad 0x00000fffffffffff, 0x00001fffffffffff
4931 .quad 0x00003fffffffffff, 0x00007fffffffffff
4932 .quad 0x0000ffffffffffff, 0x0001ffffffffffff
4933 .quad 0x0003ffffffffffff, 0x0007ffffffffffff
4934 .quad 0x000fffffffffffff, 0x001fffffffffffff
4935 .quad 0x003fffffffffffff, 0x007fffffffffffff
4936 .quad 0x00ffffffffffffff, 0x01ffffffffffffff
4937 .quad 0x03ffffffffffffff, 0x07ffffffffffffff
4938 .quad 0x0fffffffffffffff, 0x1fffffffffffffff
4939 .quad 0x3fffffffffffffff, 0x7fffffffffffffff
4940 .quad 0xffffffffffffffff
4944 # Fallback for old assembler
4947 .globl ossl_vaes_vpclmulqdq_capable
4948 .type ossl_vaes_vpclmulqdq_capable,\@abi-omnipotent
4949 ossl_vaes_vpclmulqdq_capable:
4952 .size ossl_vaes_vpclmulqdq_capable, .-ossl_vaes_vpclmulqdq_capable
4954 .globl ossl_aes_gcm_init_avx512
4955 .globl ossl_aes_gcm_setiv_avx512
4956 .globl ossl_aes_gcm_update_aad_avx512
4957 .globl ossl_aes_gcm_encrypt_avx512
4958 .globl ossl_aes_gcm_decrypt_avx512
4959 .globl ossl_aes_gcm_finalize_avx512
4960 .globl ossl_gcm_gmult_avx512
4962 .type ossl_aes_gcm_init_avx512,\@abi-omnipotent
4963 ossl_aes_gcm_init_avx512:
4964 ossl_aes_gcm_setiv_avx512:
4965 ossl_aes_gcm_update_aad_avx512:
4966 ossl_aes_gcm_encrypt_avx512:
4967 ossl_aes_gcm_decrypt_avx512:
4968 ossl_aes_gcm_finalize_avx512:
4969 ossl_gcm_gmult_avx512:
4970 .byte 0x0f,0x0b # ud2
4972 .size ossl_aes_gcm_init_avx512, .-ossl_aes_gcm_init_avx512
4976 $code =~ s/\`([^\`]*)\`/eval $1/gem;
4978 close STDOUT or die "error closing STDOUT: $!";