Optimize AES-GCM for uarchs with unroll and new instructions
authorXiaokangQian <xiaokang.qian@arm.com>
Wed, 9 Jun 2021 06:35:46 +0000 (06:35 +0000)
committerPauli <pauli@openssl.org>
Tue, 25 Jan 2022 03:30:00 +0000 (14:30 +1100)
Increase the block numbers to 8 for every iteration.  Increase the hash
table capacity.  Make use of EOR3 instruction to improve the performance.

This can improve performance 25-40% on out-of-order microarchitectures
with a large number of fast execution units, such as Neoverse V1.  We also
see 20-30% performance improvements on other architectures such as the M1.

Assembly code reviewed by Tom Cosgrove (ARM).

Reviewed-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
Reviewed-by: Paul Dale <pauli@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/15916)

crypto/arm64cpuid.pl
crypto/arm_arch.h
crypto/armcap.c
crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl [new file with mode: 0644]
crypto/modes/asm/ghashv8-armx.pl
crypto/modes/build.info
include/crypto/aes_platform.h
providers/implementations/ciphers/cipher_aes_gcm_hw_armv8.inc

index 1841c0cc04685849ff79338a6a2e06f77c13f353..ebea4be59c3a939356f19793ebae914e2c4b38af 100755 (executable)
@@ -96,6 +96,14 @@ _armv8_sha512_probe:
        ret
 .size  _armv8_sha512_probe,.-_armv8_sha512_probe
 
+.globl _armv8_eor3_probe
+.type  _armv8_eor3_probe,%function
+_armv8_eor3_probe:
+       AARCH64_VALID_CALL_TARGET
+       .long   0xce010800      // eor3 v0.16b, v0.16b, v1.16b, v2.16b
+       ret
+.size  _armv8_eor3_probe,.-_armv8_eor3_probe
+
 .globl _armv8_cpuid_probe
 .type  _armv8_cpuid_probe,%function
 _armv8_cpuid_probe:
index 291620ebc92b6f33f2f655c898a30b23589971a3..33acbd99c0b34558b9ff596a86dcced6ec7f126f 100644 (file)
@@ -81,6 +81,8 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 # define ARMV8_RNG       (1<<8)
 # define ARMV8_SM3       (1<<9)
 # define ARMV8_SM4       (1<<10)
+# define ARMV8_SHA3      (1<<11)
+# define ARMV8_UNROLL8_EOR3      (1<<12)
 
 /*
  * MIDR_EL1 system register
@@ -96,6 +98,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 
 # define ARM_CPU_PART_CORTEX_A72   0xD08
 # define ARM_CPU_PART_N1           0xD0C
+# define ARM_CPU_PART_V1           0xD40
 
 # define MIDR_PARTNUM_SHIFT       4
 # define MIDR_PARTNUM_MASK        (0xfff << MIDR_PARTNUM_SHIFT)
@@ -182,4 +185,7 @@ extern unsigned int OPENSSL_armv8_rsa_neonized;
 
 # endif  /* defined __ASSEMBLER__ */
 
+# define IS_CPU_SUPPORT_UNROLL8_EOR3() \
+           (OPENSSL_armcap_P & ARMV8_UNROLL8_EOR3)
+
 #endif
index 5016987eeb8e703360f8060d25bcc4fcf1dda231..c50322f504bdff13c778f129cbe4fbe3f79b5916 100644 (file)
@@ -171,6 +171,7 @@ static unsigned long getauxval(unsigned long key)
 #  define HWCAP_CE_SHA1          (1 << 5)
 #  define HWCAP_CE_SHA256        (1 << 6)
 #  define HWCAP_CPUID            (1 << 11)
+#  define HWCAP_SHA3             (1 << 17)
 #  define HWCAP_CE_SM3           (1 << 18)
 #  define HWCAP_CE_SM4           (1 << 19)
 #  define HWCAP_CE_SHA512        (1 << 21)
@@ -216,11 +217,20 @@ void OPENSSL_cpuid_setup(void)
      */
 #   else
     {
-        unsigned int sha512;
-        size_t len = sizeof(sha512);
+        unsigned int feature;
+        size_t len = sizeof(feature);
+        char uarch[64];
 
-        if (sysctlbyname("hw.optional.armv8_2_sha512", &sha512, &len, NULL, 0) == 0 && sha512 == 1)
+        if (sysctlbyname("hw.optional.armv8_2_sha512", &feature, &len, NULL, 0) == 0 && feature == 1)
             OPENSSL_armcap_P |= ARMV8_SHA512;
+        feature = 0;
+        if (sysctlbyname("hw.optional.armv8_2_sha3", &feature, &len, NULL, 0) == 0 && feature == 1) {
+            OPENSSL_armcap_P |= ARMV8_SHA3;
+            len = sizeof(uarch);
+            if ((sysctlbyname("machdep.cpu.brand_string", uarch, &len, NULL, 0) == 0) &&
+                (strncmp(uarch, "Apple M1", 8) == 0))
+                OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3;
+        }
     }
 #   endif
 # endif
@@ -255,6 +265,8 @@ void OPENSSL_cpuid_setup(void)
 
         if (hwcap & HWCAP_CE_SM3)
             OPENSSL_armcap_P |= ARMV8_SM3;
+        if (hwcap & HWCAP_SHA3)
+            OPENSSL_armcap_P |= ARMV8_SHA3;
 #  endif
     }
 #  ifdef __aarch64__
@@ -311,6 +323,9 @@ void OPENSSL_cpuid_setup(void)
         if (sigsetjmp(ill_jmp, 1) == 0) {
             _armv8_sm3_probe();
             OPENSSL_armcap_P |= ARMV8_SM3;
+        if (sigsetjmp(ill_jmp, 1) == 0) {
+            _armv8_eor3_probe();
+            OPENSSL_armcap_P |= ARMV8_SHA3;
         }
 #  endif
     }
@@ -340,6 +355,9 @@ void OPENSSL_cpuid_setup(void)
         (OPENSSL_armcap_P & ARMV7_NEON)) {
             OPENSSL_armv8_rsa_neonized = 1;
     }
+    if ((MIDR_IS_CPU_MODEL(OPENSSL_arm_midr, ARM_CPU_IMP_ARM, ARM_CPU_PART_V1)) &&
+        (OPENSSL_armcap_P & ARMV8_SHA3))
+        OPENSSL_armcap_P |= ARMV8_UNROLL8_EOR3;
 # endif
 }
 #endif
diff --git a/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl b/crypto/modes/asm/aes-gcm-armv8-unroll8_64.pl
new file mode 100644 (file)
index 0000000..1aaad66
--- /dev/null
@@ -0,0 +1,7369 @@
+#! /usr/bin/env perl
+# Copyright 2020-2021 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+#
+#========================================================================
+# Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
+# derived from https://github.com/ARM-software/AArch64cryptolib, original
+# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
+# licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
+# obtain it.
+#========================================================================
+#
+# Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
+# Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
+# intermediate hashes from the 8 blocks.
+#
+#  ____________________________________________________
+# |                                                    |
+# | PRE                                                |
+# |____________________________________________________|
+# |                |                |                  |
+# | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
+# |________________|________________|__________________|
+# |                |                |                  |
+# | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
+# |________________|____(mostly)____|__________________|
+# |                                                    |
+# | MODULO                                             |
+# |____________________________________________________|
+#
+# PRE:
+#     Ensure previously generated intermediate hash is aligned and merged with result for GHASH 8k+0
+# EXT low_acc, low_acc, low_acc, #8
+# EOR res_curr (8k+0), res_curr (8k+0), low_acc
+#
+# CTR block:
+#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
+# REV     ctr32, rev_ctr32
+# ORR     ctr64, constctr96_top32, ctr32, LSL #32
+# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
+# INS     ctr_next.d[1], ctr64X
+# ADD     rev_ctr32, #1
+#
+# AES block:
+#      Do AES encryption/decryption on CTR block X and EOR it with input block X. The example below uses a 256-bit key.
+#      We use a small trick here: load the input into scalar registers, EOR it with the last round key, and then
+#      transfer it to the ASIMD registers. Given we are very constrained in our ASIMD registers this is quite important.
+#
+#      Encrypt:
+# LDR     input_low, [ input_ptr  ], #8
+# LDR     input_high, [ input_ptr  ], #8
+# EOR     input_low, k14_low
+# EOR     input_high, k14_high
+# INS     res_curr.d[0], input_low
+# INS     res_curr.d[1], input_high
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# EOR     res_curr, res_curr, ctr_curr
+# ST1     { res_curr.16b  }, [ output_ptr  ], #16
+#
+#     Decrypt:
+# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
+# AESE    ctr_curr, k13
+# LDR     res_curr, [ input_ptr  ], #16
+# EOR     res_curr, res_curr, ctr_curr
+# MOV     output_low, res_curr.d[0]
+# MOV     output_high, res_curr.d[1]
+# EOR     output_low, k14_low
+# EOR     output_high, k14_high
+# STP     output_low, output_high, [ output_ptr  ], #16
+
+# GHASH block X:
+#     Do 128b karatsuba polynomial multiplication on block
+#     We only have 64b->128b polynomial multipliers, so naively we need to do 4 64b multiplies to generate a 128b
+#     multiplication:
+#
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
+#
+#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
+#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
+#
+#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
+#     multiplying with "twisted" powers of H
+#
+# Note: We can PMULL directly into the acc_x in first GHASH of the loop
+# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
+#       path latency dominates the performance
+#
+#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
+#       than indicated here
+# REV64   res_curr, res_curr
+# INS     t_m.d[0], res_curr.d[1]
+# EOR     t_m.8B, t_m.8B, res_curr.8B
+# PMULL2  t_h, res_curr, HX
+# PMULL   t_l, res_curr, HX
+# PMULL   t_m, t_m, HX_k
+# EOR     acc_h, acc_h, t_h
+# EOR     acc_l, acc_l, t_l
+# EOR     acc_m, acc_m, t_m
+#
+# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
+#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
+#         with a reversed constant
+# EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
+# PMULL   t_mod, acc_h, mod_constant
+# EXT     acc_h, acc_h, acc_h, #8
+# EOR3     acc_m, acc_m, t_mod, acc_h
+# PMULL   acc_h, acc_m, mod_constant
+# EXT     acc_m, acc_m, acc_m, #8
+# EOR3    acc_l, acc_l, acc_m, acc_h
+
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
+( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
+die "can't locate arm-xlate.pl";
+
+die "only for 64 bit" if $flavour !~ /64/;
+
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_MAX_ARCH__>=8
+___
+$code.=".arch   armv8.2-a+crypto\n.arch_extension sha3\n.text\n";
+
+$input_ptr="x0";  #argument block
+$bit_length="x1";
+$output_ptr="x2";
+$current_tag="x3";
+$counter="x16";
+$constant_temp="x15";
+$modulo_constant="x10";
+$cc="x8";
+{
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
+my ($temp2_x,$temp3_x)=map("x$_",(13..14));
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
+
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
+
+my $t0="v16";
+my $t0d="d16";
+
+my $t1="v29";
+my $t2=$res1;
+my $t3=$t1;
+
+my $t4=$res0;
+my $t5=$res2;
+my $t6=$t0;
+
+my $t7=$res3;
+my $t8=$res4;
+my $t9=$res5;
+
+my $t10=$res6;
+my $t11="v21";
+my $t12=$t1;
+
+my $rtmp_ctr="v30";
+my $rtmp_ctrq="q30";
+my $rctr_inc="v31";
+my $rctr_incd="d31";
+
+my $mod_constantd=$t0d;
+my $mod_constant=$t0;
+
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
+my $rk2q1="v28.1q";
+my $rk3q1="v26.1q";
+my $rk4v="v27";
+
+
+#########################################################################################
+# size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
+#                               size_t len,
+#                               unsigned char *out,
+#                               const void *key,
+#                               unsigned char ivec[16],
+#                               u64 *Xi);
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_enc_128_kernel
+.type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_enc_128_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L128_enc_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+       mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       mov     $constant_temp, #0x100000000                            @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80           @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
+
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+
+       ld1     { $acc_lb}, [$current_tag]
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+
+       aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
+
+       aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+       aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
+
+       aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
+       aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
+
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       b.ge    .L128_enc_tail                                          @ handle tail
+
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
+
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
+
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 0 - result
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 1 - result
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 5 - result
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 2 - result
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 6 - result
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 4 - result
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 3 - result
+       eor3    $res7b, $ctr_t7b, $ctr7b,$rk10                          @ AES block 7 - result
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+       b.ge    .L128_enc_prepretail                                    @ do prepretail
+
+.L128_enc_main_loop:                                                   @ main loop start: GHASH the 8 blocks held in res0-res7 while AES-CTR encrypting the next 8 blocks
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h (hash powers are reloaded every pass: the low pmulls below overwrite these registers)
+       ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap the two 64-bit halves of h5 (same for the other ext-after-ldr pairs)
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap the 64-bit halves of acc_l
+
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - fold acc_l into GHASH block 8k
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       eor3    $acc_hb, $acc_hb, $t1.16b,$t2.16b                       @ GHASH block 8k+2, 8k+3 - high
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16 (h1 already folded into acc_l; reused as a counter temp, moved to ctr0 below)
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17 (h2 reused as a counter temp, moved to ctr1 below)
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+
+       ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18 (h3 reused as a counter temp, moved to ctr2 below)
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+
+       aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9 (no aesmc; rk10 is xored in by the result eor3)
+       aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19 (h4 reused as a counter temp, moved to ctr3 below)
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL - flags are consumed by the b.lt at the loop bottom
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 8k+12 - result
+       aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
+
+       aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
+       aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
+
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 8k+10 - result
+
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+       aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+
+       eor3    $res7b, $ctr_t7b, $ctr7b, $rk10                         @ AES block 8k+15 - result
+       aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 8k+9 - result
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 8k+11 - result
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 8k+13 - result
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 8k+8 - result
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 8k+14 - result
+
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
+       b.lt    .L128_enc_main_loop                                     @ repeat while input_ptr is (signed) less than main_end_input_ptr
+
+.L128_enc_prepretail:                                                  @ PREPRETAIL
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h6k | h5k
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
+
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h1l | h1h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       eor3    $acc_lb, $acc_lb, $acc_hb, $acc_mb                      @ MODULO - fold into low
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+       aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
+       aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
+
+       aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
+
+       aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
+       aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
+
+       aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
+       aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
+.L128_enc_tail:                                                                @ TAIL
+
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+       ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
+
+       mov     $t1.16b, $rk10
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h, h6k | h5k
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+
+       eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h, h7l | h7h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k, h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       cmp     $main_end_input_ptr, #112
+       b.gt    .L128_enc_blocks_more_than_7
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+       movi    $acc_h.8b, #0
+
+       cmp     $main_end_input_ptr, #96
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr2b
+       mov     $ctr2b, $ctr1b
+
+       movi    $acc_l.8b, #0
+       movi    $acc_m.8b, #0
+       b.gt    .L128_enc_blocks_more_than_6
+
+       mov     $ctr7b, $ctr6b
+       cmp     $main_end_input_ptr, #80
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr1b
+       b.gt    .L128_enc_blocks_more_than_5
+
+       cmp     $main_end_input_ptr, #64
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr1b
+       b.gt    .L128_enc_blocks_more_than_4
+
+       mov     $ctr7b, $ctr6b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr1b
+       cmp     $main_end_input_ptr, #48
+       b.gt    .L128_enc_blocks_more_than_3
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr1b
+
+       cmp     $main_end_input_ptr, #32
+       ldr     $h34kq, [$current_tag, #96]                                     @ load h4k | h3k
+       b.gt    .L128_enc_blocks_more_than_2
+
+       cmp     $main_end_input_ptr, #16
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr1b
+       b.gt    .L128_enc_blocks_more_than_1
+
+       ldr     $h12kq, [$current_tag, #48]                                     @ load h2k | h1k
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b        .L128_enc_blocks_less_than_1
+.L128_enc_blocks_more_than_7:                                          @ blocks left >  7
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
+
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+.L128_enc_blocks_more_than_6:                                          @ blocks left >  6
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+
+       eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+.L128_enc_blocks_more_than_5:                                          @ blocks left >  5
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
+
+       eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
+       pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+.L128_enc_blocks_more_than_4:                                          @ blocks left >  4
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+
+       eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+.L128_enc_blocks_more_than_3:                                          @ blocks left >  3
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
+
+       ldr     $h4q, [$current_tag, #112]                                      @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+       ldr     $h34kq, [$current_tag, #96]                                     @ load h4k | h3k
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+
+       eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+.L128_enc_blocks_more_than_2:                                          @ blocks left >  2
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+       ldr     $h3q, [$current_tag, #80]                                       @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
+
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+.L128_enc_blocks_more_than_1:                                          @ blocks left >  1
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
+
+       ldr     $h2q, [$current_tag, #64]                                       @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
+
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+
+       ldr     $h12kq, [$current_tag, #48]                                     @ load h2k | h1k
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+.L128_enc_blocks_less_than_1:                                          @ blocks left <= 1
+
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+       cmp     $bit_length, #64
+
+       csel    $temp2_x, $temp1_x, $temp0_x, lt
+       csel    $temp3_x, $temp0_x, xzr, lt
+
+       mov     $ctr0.d[1], $temp3_x
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+
+       bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+       st1     { $res1b}, [$output_ptr]                                @ store all 16B
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+       ldr     $h1q, [$current_tag, #32]                                       @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       st1     { $acc_l.16b }, [$current_tag]
+       lsr     x0, $bit_length, #3                                     @ return sizes
+
+       ldp     d10, d11, [sp, #16]
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80
+       ret
+
+.L128_enc_ret:
+       mov w0, #0x0
+       ret
+.size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
+___
+
+#########################################################################################
+# size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
+#                               size_t len,             /* length in bits; the kernel shifts it right by 3 to get bytes */
+#                               unsigned char *out,
+#                               u64 *Xi,
+#                               unsigned char ivec[16],
+#                               const void *key);
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_dec_128_kernel
+.type   unroll8_eor3_aes_gcm_dec_128_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_dec_128_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L128_dec_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+       mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       mov     $constant_temp, #0x100000000                    @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+       ld1     { $acc_lb}, [$current_tag]
+         ext   $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
+
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
+
+       aese    $ctr0b, $rk9                                            @ AES block 0 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 1 - round 9
+       aese    $ctr6b, $rk9                                            @ AES block 6 - round 9
+
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+       aese    $ctr4b, $rk9                                            @ AES block 4 - round 9
+       aese    $ctr3b, $rk9                                            @ AES block 3 - round 9
+
+       aese    $ctr2b, $rk9                                            @ AES block 2 - round 9
+       aese    $ctr5b, $rk9                                            @ AES block 5 - round 9
+       aese    $ctr7b, $rk9                                            @ AES block 7 - round 9
+
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       b.ge    .L128_dec_tail                                          @ handle tail
+
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
+
+       eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 0 - result
+       eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 1 - result
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
+
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
+
+       eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 3 - result
+       eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 2 - result
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+
+       eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 6 - result
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+
+       eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 4 - result
+       eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 5 - result
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+
+       eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 7 - result
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+       b.ge    .L128_dec_prepretail                                    @ do prepretail
+
+.L128_dec_main_loop:                                                   @ main loop start
+       ldr     $h7q, [$current_tag, #176]                                      @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                                      @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                        @ GHASH block 8k+2 - low
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
+
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
+
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+
+       aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+
+       aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
+
+       aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
+       aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+
+       aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
+       aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
+       eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 8k+9 - result
+
+       eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 8k+8 - result
+       eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 8k+15 - result
+       eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 8k+14 - result
+
+       eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 8k+10 - result
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+
+       eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 8k+12 - result
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+
+       eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 8k+11 - result
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+
+       eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 8k+13 - result
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+       b.lt    .L128_dec_main_loop
+
+.L128_dec_prepretail:                                                  @ PREPRETAIL
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                        @ CTR block 8k+13
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       ldr     $rk10q, [$cc, #160]                                     @ load rk10
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+
+       aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+       aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
+
+       aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
+       aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
+       aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
+
+       aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
+       aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
+       aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
+
+.L128_dec_tail:                                                                @ TAIL
+       @ Tail dispatch. The number of bytes left (end_input_ptr - input_ptr) selects one of the
+       @ .L128_dec_blocks_more_than_N entry points below, or .L128_dec_blocks_less_than_1.
+       @ The first remaining ciphertext block is combined with ctr0 here. For every block
+       @ short of eight, the keystream registers are moved up one slot and rtmp_ctr is
+       @ stepped back once by rctr_inc, so each entry point finds its keystream in the
+       @ register it reads. acc_l, acc_h and acc_m are zeroed on the path with at most 7
+       @ blocks because only .L128_dec_blocks_more_than_7 writes them without accumulating.
+
+       mov     $t1.16b, $rk10                                          @ t1 = rk10, xored into every tail block result
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+
+       cmp     $main_end_input_ptr, #112                               @ flags are consumed by the b.gt after the loads below
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h78k and h8
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
+
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5 and h56k
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6 and h7
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+
+       eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
+       b.gt    .L128_dec_blocks_more_than_7
+
+       cmp     $main_end_input_ptr, #96
+       mov     $ctr7b, $ctr6b                                          @ at most 7 blocks left: shift keystream ctr1..ctr6 up one slot
+       movi    $acc_l.8b, #0                                           @ start GHASH accumulation from zero
+
+       movi    $acc_h.8b, #0
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr2b
+       mov     $ctr2b, $ctr1b
+
+       movi    $acc_m.8b, #0
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L128_dec_blocks_more_than_6
+
+       cmp     $main_end_input_ptr, #80
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr7b, $ctr6b                                          @ at most 6 blocks left: shift again, ctr1 feeds the lowest slot
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr1b
+       b.gt    .L128_dec_blocks_more_than_5
+
+       cmp     $main_end_input_ptr, #64
+
+       mov     $ctr7b, $ctr6b                                          @ at most 5 blocks left
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr1b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L128_dec_blocks_more_than_4
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b                                          @ at most 4 blocks left
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr1b
+       cmp     $main_end_input_ptr, #48
+       b.gt    .L128_dec_blocks_more_than_3
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b                                          @ at most 3 blocks left
+       cmp     $main_end_input_ptr, #32
+
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k (the more_than_3 step that normally loads it is skipped)
+       mov     $ctr6b, $ctr1b
+       b.gt    .L128_dec_blocks_more_than_2
+
+       cmp     $main_end_input_ptr, #16
+
+       mov     $ctr7b, $ctr1b                                          @ at most 2 blocks left
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L128_dec_blocks_more_than_1                            @ label needs the leading dot to match its definition
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k (the more_than_1 step that normally loads it is skipped)
+       b        .L128_dec_blocks_less_than_1
+.L128_dec_blocks_more_than_7:                                          @ blocks left >  7
+       @ First tail step, taken when more than 7 blocks remain. The ciphertext block in res1
+       @ is byte-reversed, xored with the running tag (t0) and multiplied by h8 (low, high)
+       @ and by the upper half of h78k (mid). The products are written straight into
+       @ acc_l, acc_h and acc_m, so this step initialises the accumulators. The result block
+       @ in res4 is stored, the next ciphertext block is loaded and combined with ctr1 and
+       @ t1, and execution falls through into the next step.
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid (stage upper half of h78k as multiplier)
+
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid (upper xor lower half of the block)
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+       eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
+
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+.L128_dec_blocks_more_than_6:                                          @ blocks left >  6
+       @ Tail step reached by fall-through or directly from the dispatch when more than 6
+       @ blocks remain. The ciphertext block in res1 is folded into the GHASH accumulators
+       @ using h7 (low, high) and the lower half of h78k (mid), with rk2, rk3 and rk4v used
+       @ as temporaries. The result block in res4 is stored and the next ciphertext block
+       @ is combined with ctr2 and t1. t0 is non-zero only for the first block a tail run
+       @ processes; every step clears it afterwards.
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+       eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
+.L128_dec_blocks_more_than_5:                                          @ blocks left >  5
+       @ Tail step for more than 5 remaining blocks. The ciphertext block in res1 is folded
+       @ into the GHASH accumulators using h6 (low, high) and the upper half of h56k (mid).
+       @ The mid operand is duplicated into the upper lane so that pmull2 can be used.
+       @ The result block in res4 is stored and the next ciphertext block is combined with
+       @ ctr3 and t1.
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+
+       eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid (copy to upper lane for pmull2)
+       pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+.L128_dec_blocks_more_than_4:                                          @ blocks left >  4
+       @ Tail step for more than 4 remaining blocks. The ciphertext block in res1 is folded
+       @ into the GHASH accumulators using h5 (low, high) and the lower half of h56k (mid).
+       @ The result block in res4 is stored and the next ciphertext block is combined with
+       @ ctr4 and t1.
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+
+       eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+.L128_dec_blocks_more_than_3:                                          @ blocks left >  3
+       @ Tail step for more than 3 remaining blocks. h4 and h34k are loaded here. The
+       @ ciphertext block in res1 is folded into the GHASH accumulators using h4 (low, high)
+       @ and the upper half of h34k (mid, via pmull2 on a duplicated operand). The result
+       @ block in res4 is stored and the next ciphertext block is combined with ctr5 and t1.
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid (copy to upper lane for pmull2)
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+.L128_dec_blocks_more_than_2:                                          @ blocks left >  2
+       @ Tail step for more than 2 remaining blocks. h3 is loaded here; h34k must already be
+       @ loaded, either by the previous step or by the dispatch code when that step is
+       @ skipped. The ciphertext block in res1 is folded into the GHASH accumulators using
+       @ h3 (low, high) and the lower half of h34k (mid). The result block in res4 is stored
+       @ and the next ciphertext block is combined with ctr6 and t1.
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+
+       eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+.L128_dec_blocks_more_than_1:                                          @ blocks left >  1
+       @ Tail step for more than 1 remaining block. h2 and h12k are loaded here. The
+       @ ciphertext block in res1 is folded into the GHASH accumulators using h2 (low, high)
+       @ and the upper half of h12k (mid, via pmull2 on a duplicated operand). The result
+       @ block in res4 is stored, then the final ciphertext block is loaded and combined
+       @ with ctr7 and t1. That final result is not stored here; the code that follows
+       @ merges it with the existing output bytes first.
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid (copy to upper lane for pmull2)
+       eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
+
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+.L128_dec_blocks_less_than_1:                                          @ blocks left <= 1
+       @ Last, possibly partial, block followed by the function epilogue.
+       @  1. Build a byte mask in ctr0 that covers the valid bytes of the last block.
+       @  2. Mask the ciphertext in res1, fold it into GHASH with h1 (low, high) and the
+       @     lower half of h12k (mid).
+       @  3. Merge the result block in res4 with the bytes already present at output_ptr
+       @     (bif keeps the existing bytes where the mask is zero) and store 16 bytes.
+       @  4. Reduce acc_h:acc_m:acc_l with the modulo constant, store the tag at current_tag
+       @     and the updated counter at counter.
+       @  5. Restore d8-d15, release the 80 byte frame and return.
+
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       cmp     $bit_length, #64
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+
+       csel    $temp2_x, $temp1_x, $temp0_x, lt                        @ lower 64b mask: all ones if fewer than 64 bits are unused
+       csel    $temp3_x, $temp0_x, xzr, lt                             @ upper 64b mask: zero if 64 or more bits are unused
+
+       mov     $ctr0.d[1], $temp3_x
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+
+       bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+       st1     { $res4b}, [$output_ptr]                                @ store all 16B
+
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+
+       eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
+
+       eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
+
+       eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ swap the 64b halves of the reduced tag
+       rev64   $acc_lb, $acc_lb                                        @ byte-reverse each half before storing
+       st1     { $acc_l.16b }, [$current_tag]                          @ store the updated tag
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b                            @ byte-reverse each 32b counter word back to storage order
+
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+
+       lsr     x0, $bit_length, #3                                     @ NOTE(review): bit_length was overwritten above with the unused bit count of the last block, so x0 is not the input length in bytes - confirm callers ignore the return value
+
+       ldp     d10, d11, [sp, #16]                                     @ restore d8-d15 and release the 80 byte frame
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80
+       ret
+.L128_dec_ret:                                                         @ exit that returns 0 without touching the stack; presumably the target of a zero length check at function entry (outside this chunk)
+       mov w0, #0x0
+       ret
+.size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
+___
+}
+
+{
+# Register assignments for the kernel code generated in this scope, starting
+# with unroll8_eor3_aes_gcm_enc_192_kernel.  Many of the names below are
+# aliases of one physical register, so two aliases of the same register
+# cannot hold independent live values at the same time.
+
+# General-purpose registers x4-x7 and x13-x14.
+my ($end_input_ptr, $main_end_input_ptr, $temp0_x, $temp1_x) = map { "x$_" } 4 .. 7;
+my ($temp2_x, $temp3_x) = map { "x$_" } 13 .. 14;
+
+# Counter blocks use v0-v7 and result blocks use v8-v15.  The same registers
+# are named once per operand spelling (vN.16b, vN, dN, qN).
+my ($ctr0b, $ctr1b, $ctr2b, $ctr3b, $ctr4b, $ctr5b, $ctr6b, $ctr7b,
+    $res0b, $res1b, $res2b, $res3b, $res4b, $res5b, $res6b, $res7b)
+    = map { "v$_.16b" } 0 .. 15;
+my ($ctr0, $ctr1, $ctr2, $ctr3, $ctr4, $ctr5, $ctr6, $ctr7,
+    $res0, $res1, $res2, $res3, $res4, $res5, $res6, $res7)
+    = map { "v$_" } 0 .. 15;
+my ($ctr0d, $ctr1d, $ctr2d, $ctr3d, $ctr4d, $ctr5d, $ctr6d, $ctr7d) = map { "d$_" } 0 .. 7;
+my ($ctr0q, $ctr1q, $ctr2q, $ctr3q, $ctr4q, $ctr5q, $ctr6q, $ctr7q) = map { "q$_" } 0 .. 7;
+my ($res0q, $res1q, $res2q, $res3q, $res4q, $res5q, $res6q, $res7q) = map { "q$_" } 8 .. 15;
+
+# ctr_t0..ctr_t7 are further names for v8-v15, i.e. the res0..res7 registers.
+my ($ctr_t0, $ctr_t1, $ctr_t2, $ctr_t3, $ctr_t4, $ctr_t5, $ctr_t6, $ctr_t7)
+    = map { "v$_" } 8 .. 15;
+my ($ctr_t0b, $ctr_t1b, $ctr_t2b, $ctr_t3b, $ctr_t4b, $ctr_t5b, $ctr_t6b, $ctr_t7b)
+    = map { "v$_.16b" } 8 .. 15;
+my ($ctr_t0q, $ctr_t1q, $ctr_t2q, $ctr_t3q, $ctr_t4q, $ctr_t5q, $ctr_t6q, $ctr_t7q)
+    = map { "q$_" } 8 .. 15;
+
+# GHASH accumulators: high, mid and low parts in v17, v18 and v19.
+my ($acc_hb, $acc_mb, $acc_lb) = map { "v$_.16b" } 17 .. 19;
+my ($acc_h, $acc_m, $acc_l) = map { "v$_" } 17 .. 19;
+
+# Hash key powers.  The h1..h4 group and the h5..h8 group both map onto
+# v20-v25, so only one group can be loaded at a time.
+my ($h1, $h12k, $h2, $h3, $h34k, $h4) = map { "v$_" } 20 .. 25;
+my ($h5, $h56k, $h6, $h7, $h78k, $h8) = map { "v$_" } 20 .. 25;
+my ($h1q, $h12kq, $h2q, $h3q, $h34kq, $h4q) = map { "q$_" } 20 .. 25;
+my ($h5q, $h56kq, $h6q, $h7q, $h78kq, $h8q) = map { "q$_" } 20 .. 25;
+
+# Temporaries.  t0 is v16 and t1 is v29; the remaining tN names reuse t0, t1,
+# one of the result registers, or v21 (which is also h12k and h56k).
+my $t0 = 'v16';
+my $t0d = 'd16';
+
+my $t1 = 'v29';
+my $t2 = $res1;
+my $t3 = $t1;
+
+my $t4 = $res0;
+my $t5 = $res2;
+my $t6 = $t0;
+
+my $t7 = $res3;
+my $t8 = $res4;
+my $t9 = $res5;
+
+my $t10 = $res6;
+my $t11 = 'v21';
+my $t12 = $t1;
+
+# Running counter (v30) and counter increment (v31).
+my $rtmp_ctr = 'v30';
+my $rtmp_ctrq = 'q30';
+my $rctr_inc = 'v31';
+my $rctr_incd = 'd31';
+
+# The modulo constant shares v16 with t0.
+my $mod_constantd = $t0d;
+my $mod_constant = $t0;
+
+# Round keys rotate through v26-v28: rkN lives in register 26 + (N mod 3).
+my ($rk0, $rk1, $rk2) = map { "v$_.16b" } 26 .. 28;
+my ($rk3, $rk4, $rk5) = map { "v$_.16b" } 26 .. 28;
+my ($rk6, $rk7, $rk8) = map { "v$_.16b" } 26 .. 28;
+my ($rk9, $rk10, $rk11) = map { "v$_.16b" } 26 .. 28;
+my ($rk12, $rk13, $rk14) = map { "v$_.16b" } 26 .. 28;
+my ($rk0q, $rk1q, $rk2q) = map { "q$_" } 26 .. 28;
+my ($rk3q, $rk4q, $rk5q) = map { "q$_" } 26 .. 28;
+my ($rk6q, $rk7q, $rk8q) = map { "q$_" } 26 .. 28;
+my ($rk9q, $rk10q, $rk11q) = map { "q$_" } 26 .. 28;
+my ($rk12q, $rk13q, $rk14q) = map { "q$_" } 26 .. 28;
+
+# Extra spellings of the rk2, rk3 and rk4 registers: a .1q form for v28 and
+# v26, and a bare form for v27 that can take any arrangement suffix.
+my $rk2q1 = 'v28.1q';
+my $rk3q1 = 'v26.1q';
+my $rk4v = 'v27';
+
+#########################################################################################
+# size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
+#                               size_t len,
+#                               unsigned char *out,
+#                               const void *key,
+#                               unsigned char ivec[16],
+#                               u64 *Xi);
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_enc_192_kernel
+.type   unroll8_eor3_aes_gcm_enc_192_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_enc_192_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L192_enc_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+       mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+
+       mov     $constant_temp, #0x100000000                            @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                                @ load rk6, rk7
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
+
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
+
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
+
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
+
+        ld1     { $acc_lb}, [$current_tag]
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
+
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
+
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
+
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 14 - round 10
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 11 - round 10
+
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 9 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 13 - round 10
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 12 - round 10
+
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 10 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 15 - round 10
+
+       aese    $ctr6b, $rk11                                           @ AES block 14 - round 11
+       aese    $ctr3b, $rk11                                           @ AES block 11 - round 11
+
+       aese    $ctr4b, $rk11                                           @ AES block 12 - round 11
+       aese    $ctr7b, $rk11                                           @ AES block 15 - round 11
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+
+       aese    $ctr1b, $rk11                                           @ AES block 9 - round 11
+       aese    $ctr5b, $rk11                                           @ AES block 13 - round 11
+
+       aese    $ctr2b, $rk11                                           @ AES block 10 - round 11
+       aese    $ctr0b, $rk11                                           @ AES block 8 - round 11
+       b.ge    .L192_enc_tail                                          @ handle tail
+
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
+
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
+
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
+
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 0 - result
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 3 - result
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 1 - result
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 4 - result
+
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 5 - result
+       eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 7 - result
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 2 - result
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 6 - result
+
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+
+       b.ge    .L192_enc_prepretail                                    @ do prepretail
+
+.L192_enc_main_loop:                                                   @ main loop start
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+       ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
+
+       aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
+       aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+       aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+
+       aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
+       aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 4 - result
+
+       aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
+       aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
+       aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
+
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+       eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 7 - result
+
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 8k+10 - result
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 8k+8 - result
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 8k+9 - result
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 6 - result
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 5 - result
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 8k+11 - result
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
+       b.lt    .L192_enc_main_loop
+
+.L192_enc_prepretail:                                                  @ PREPRETAIL
+       @ GHASH the eight blocks held in res0-res7 and reduce the result into acc_l,
+       @ interleaved with AES rounds 0-11 on the counter blocks ctr0-ctr7.
+       @ Nothing is read from input_ptr or written to output_ptr in this section;
+       @ rk12 is loaded but its XOR with the plaintext is left to the tail.
+       @ ctr5-ctr7 are generated here from rtmp_ctr, which is advanced three times.
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       @ Each hash power is loaded and then has its two 64-bit halves swapped (ext #8).
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+
+       @ Round 11 is aese only (no aesmc).
+       aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
+       aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
+
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+       aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
+       aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
+
+       aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
+       aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
+       aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
+
+.L192_enc_tail:                                                                @ TAIL
+       @ Encrypt the first remaining block with ctr0 and pick an entry point in the
+       @ blocks_more_than_N chain from the number of bytes left (compared against
+       @ 112, 96, 80, 64, 48, 32 and 16). t1 keeps a copy of rk12 for the chain and
+       @ t0 carries the running tag (acc_l with its 64-bit halves swapped).
+
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
+        ext     $h5.16b, $h5.16b, $h5.16b, #8
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+
+       ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8l | h8h
+        ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       mov     $t1.16b, $rk12
+
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
+        ext     $h6.16b, $h6.16b, $h6.16b, #8
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       cmp     $main_end_input_ptr, #112
+
+       eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+       b.gt    .L192_enc_blocks_more_than_7
+
+       @ Not taking the more_than_7 entry: the later entries accumulate into
+       @ acc_h/acc_l/acc_m with eor, so the accumulators are zeroed here.
+       @ On every fall-through the pending counter blocks are moved up one register
+       @ (ctr1 itself is left in place) and rtmp_ctr is wound back by rctr_inc.
+       cmp     $main_end_input_ptr, #96
+       mov     $ctr7b, $ctr6b
+       movi    $acc_h.8b, #0
+
+       mov     $ctr6b, $ctr5b
+       movi    $acc_l.8b, #0
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr2b
+
+       mov     $ctr2b, $ctr1b
+       movi    $acc_m.8b, #0
+       b.gt    .L192_enc_blocks_more_than_6
+
+       mov     $ctr7b, $ctr6b
+       cmp     $main_end_input_ptr, #80
+
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+
+       mov     $ctr3b, $ctr1b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L192_enc_blocks_more_than_5
+
+       cmp     $main_end_input_ptr, #64
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr1b
+       b.gt    .L192_enc_blocks_more_than_4
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr1b
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       cmp     $main_end_input_ptr, #48
+       b.gt    .L192_enc_blocks_more_than_3
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr1b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       cmp     $main_end_input_ptr, #32
+       b.gt    .L192_enc_blocks_more_than_2
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       cmp     $main_end_input_ptr, #16
+       mov     $ctr7b, $ctr1b
+       b.gt    .L192_enc_blocks_more_than_1
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       b        .L192_enc_blocks_less_than_1
+.L192_enc_blocks_more_than_7:                                          @ blocks left >  7
+       @ Store the encrypted final-7 block (res1), GHASH it with h8 and the upper
+       @ half of h78k after folding in the running tag t0, and write the products
+       @ straight into acc_l/acc_h/acc_m. Then load the next plaintext block,
+       @ encrypt it with ctr1 and fall through. rk4v is used as scratch.
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
+.L192_enc_blocks_more_than_6:                                          @ blocks left >  6
+       @ Store the encrypted final-6 block (res1), GHASH it with h7 and the lower
+       @ half of h78k after folding in t0, and eor the products into
+       @ acc_h/acc_l/acc_m. The next plaintext block is loaded and encrypted with
+       @ ctr2 before falling through. rk2, rk3 and rk4v are used as scratch.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+       eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+.L192_enc_blocks_more_than_5:                                          @ blocks left >  5
+       @ Store the encrypted final-5 block (res1), GHASH it with h6 and the upper
+       @ half of h56k after folding in t0, and eor the products into
+       @ acc_h/acc_l/acc_m. The next plaintext block is loaded and encrypted with
+       @ ctr3 before falling through. rk2, rk3 and rk4v are used as scratch.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+
+       @ Copy the mid term to the upper lane so pmull2 can pair it with the upper half of h56k.
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
+       pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+
+       eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+.L192_enc_blocks_more_than_4:                                          @ blocks left >  4
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
+.L192_enc_blocks_more_than_3:                                          @ blocks left >  3
+
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+
+       eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+.L192_enc_blocks_more_than_2:                                          @ blocks left >  2
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
+.L192_enc_blocks_more_than_1:                                          @ blocks left >  1
+
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
+
+       eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+.L192_enc_blocks_less_than_1:                                          @ blocks left <= 1
+
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       cmp     $bit_length, #64
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+
+       csel    $temp2_x, $temp1_x, $temp0_x, lt
+       csel    $temp3_x, $temp0_x, xzr, lt
+
+       mov     $ctr0.d[1], $temp3_x
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+       bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+
+       st1     { $res1b}, [$output_ptr]                                @ store all 16B
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
+
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+               ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       st1     { $acc_l.16b }, [$current_tag]
+
+       lsr     x0, $bit_length, #3                                     @ return sizes
+
+       ldp     d10, d11, [sp, #16]
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80
+       ret
+
+.L192_enc_ret:
+       mov w0, #0x0
+       ret
+.size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
+___
+
+#########################################################################################
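+# size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
+#                               size_t len,
+#                               unsigned char *out,
+#                               u64 *Xi,
+#                               unsigned char ivec[16],
+#                               const void *key);
+#
+# Note: len is a bit count (the code derives byte_len as len >> 3).
+# The fifth argument (x4) is the counter block and the sixth (x5) is the
+# expanded key schedule, matching the register moves in the prologue.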
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_dec_192_kernel
+.type   unroll8_eor3_aes_gcm_dec_192_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_dec_192_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L192_dec_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+        mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+       ld1     { $acc_lb}, [$current_tag]
+
+               mov     $constant_temp, #0x100000000                    @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
+
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
+
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
+
+       ld1     { $acc_lb}, [$current_tag]
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
+
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
+
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
+
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+
+       aese    $ctr0b, $rk11                                           @ AES block 0 - round 11
+       aese    $ctr1b, $rk11                                           @ AES block 1 - round 11
+       aese    $ctr4b, $rk11                                           @ AES block 4 - round 11
+
+       aese    $ctr6b, $rk11                                           @ AES block 6 - round 11
+       aese    $ctr5b, $rk11                                           @ AES block 5 - round 11
+       aese    $ctr7b, $rk11                                           @ AES block 7 - round 11
+
+       aese    $ctr2b, $rk11                                           @ AES block 2 - round 11
+       aese    $ctr3b, $rk11                                           @ AES block 3 - round 11
+       b.ge    .L192_dec_tail                                          @ handle tail
+
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
+
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
+
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
+
+       eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 1 - result
+       eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 0 - result
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+       eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 3 - result
+
+       eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 2 - result
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+
+       eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 4 - result
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+
+       eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 5 - result
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+
+       eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 6 - result
+       eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 7 - result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+       b.ge    .L192_dec_prepretail                                    @ do prepretail
+
+.L192_dec_main_loop:                                                   @ main loop start
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
+
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
+
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+
+       aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
+
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
+       aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
+
+       eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 8k+8 - result
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+
+       aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
+       aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+
+       aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
+       aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+
+       eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 8k+9 - result
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+       eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 8k+11 - result
+
+       eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 8k+10 - result
+       eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 8k+15 - result
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+
+       eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 8k+13 - result
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+
+       eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 8k+12 - result
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
+
+       eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 8k+14 - result
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+       b.lt    .L192_dec_main_loop
+
+.L192_dec_prepretail:                                                  @ PREPRETAIL
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ldr     $rk12q, [$cc, #192]                                     @ load rk12
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+
+       aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+       aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
+
+       aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+
+       aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
+       aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
+       aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
+       aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
+
+.L192_dec_tail:                                                                @ TAIL
+
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
+        ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8l | h8h
+        ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       mov     $t1.16b, $rk12
+
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
+        ext     $h6.16b, $h6.16b, $h6.16b, #8
+        ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+
+       eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
+       cmp     $main_end_input_ptr, #112
+       b.gt    .L192_dec_blocks_more_than_7
+
+       mov     $ctr7b, $ctr6b
+       movi    $acc_h.8b, #0
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+
+       cmp     $main_end_input_ptr, #96
+       movi    $acc_l.8b, #0
+       mov     $ctr3b, $ctr2b
+
+       mov     $ctr2b, $ctr1b
+       movi    $acc_m.8b, #0
+       b.gt    .L192_dec_blocks_more_than_6
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr1b
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       cmp     $main_end_input_ptr, #80
+       b.gt    .L192_dec_blocks_more_than_5
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr1b
+       cmp     $main_end_input_ptr, #64
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L192_dec_blocks_more_than_4
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr1b
+       cmp     $main_end_input_ptr, #48
+       b.gt    .L192_dec_blocks_more_than_3
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b
+       cmp     $main_end_input_ptr, #32
+
+       mov     $ctr6b, $ctr1b
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       b.gt    .L192_dec_blocks_more_than_2
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr7b, $ctr1b
+       cmp     $main_end_input_ptr, #16
+       b.gt    .L192_dec_blocks_more_than_1
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       b        .L192_dec_blocks_less_than_1
+.L192_dec_blocks_more_than_7:                                          @ blocks left >  7
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
+
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+
+       eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
+
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+.L192_dec_blocks_more_than_6:                                          @ blocks left >  6
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
+       eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+.L192_dec_blocks_more_than_5:                                          @ blocks left >  5
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+       pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+       eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
+.L192_dec_blocks_more_than_4:                                          @ blocks left >  4
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+
+       eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+.L192_dec_blocks_more_than_3:                                          @ blocks left >  3
+
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+       eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+.L192_dec_blocks_more_than_2:                                          @ blocks left >  2
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
+
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+       eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
+.L192_dec_blocks_more_than_1:                                          @ blocks left >  1
+
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+       ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
+
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+
+       eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+.L192_dec_blocks_less_than_1:                                          @ blocks left <= 1
+
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       cmp     $bit_length, #64
+
+       csel    $temp2_x, $temp1_x, $temp0_x, lt
+       csel    $temp3_x, $temp0_x, xzr, lt
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+
+       mov     $ctr0.d[1], $temp3_x
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+       bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+
+       st1     { $res4b}, [$output_ptr]                                @ store all 16B
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+
+       eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
+
+       eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
+
+       eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       st1     { $acc_l.16b }, [$current_tag]
+
+       ldp     d10, d11, [sp, #16]
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80
+       ret
+
+.L192_dec_ret:
+       mov w0, #0x0
+       ret
+.size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
+___
+}
+
+{
+
+my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7)); # Register-alias table for this scope: symbolic names used in the heredoc below map to fixed AArch64 registers. These four are x4..x7.
+my ($temp2_x,$temp3_x)=map("x$_",(13..14)); # x13, x14
+my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15)); # .16b views: ctr0..7 = v0..v7 (the eight counter blocks), res0..7 = v8..v15
+my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15)); # same v0..v15 without an arrangement suffix, so the assembly can append its own (.16b, .4s, .d[1], ...)
+my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7)); # 64-bit scalar views of v0..v7
+my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7)); # 128-bit scalar views of v0..v7
+my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15)); # 128-bit scalar views of v8..v15
+
+my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15)); # ctr_t0..7 are v8..v15, i.e. the SAME registers as res0..res7
+my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15)); # .16b views of the same v8..v15
+my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15)); # q views of the same v8..v15
+
+my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19)); # accumulator high/mid/low = v17/v18/v19 (.16b views)
+my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19)); # same v17..v19 without an arrangement suffix
+
+my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25)); # h1..h4 set occupies v20..v25
+my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25)); # h5..h8 set occupies the SAME v20..v25, so the two sets cannot be live at the same time
+my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25)); # q views of v20..v25 for the h1..h4 set
+my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25)); # q views of v20..v25 for the h5..h8 set
+
+my $t0="v16"; # temporary t0 = v16
+my $t0d="d16"; # 64-bit scalar view of t0
+
+my $t1="v29"; # temporary t1 = v29
+my $t2=$res1; # t2 aliases res1 (v9)
+my $t3=$t1; # t3 aliases t1 (v29)
+
+my $t4=$res0; # t4 aliases res0 (v8)
+my $t5=$res2; # t5 aliases res2 (v10)
+my $t6=$t0; # t6 aliases t0 (v16)
+
+my $t7=$res3; # t7 aliases res3 (v11)
+my $t8=$res4; # t8 aliases res4 (v12)
+my $t9=$res5; # t9 aliases res5 (v13)
+
+my $t10=$res6; # t10 aliases res6 (v14)
+my $t11="v21"; # t11 = v21, which is also $h12k / $h56k: writing t11 destroys that value
+my $t12=$t1; # t12 aliases t1 (v29)
+
+my $rtmp_ctr="v30"; # v30
+my $rtmp_ctrq="q30"; # q view of v30
+my $rctr_inc="v31"; # v31
+my $rctr_incd="d31"; # d view of v31
+
+my $mod_constantd=$t0d; # mod_constant shares v16 with t0/t6 (d view)
+my $mod_constant=$t0; # mod_constant shares v16 with t0/t6
+
+my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28)); # the fifteen rkN names rotate through only three registers: rkN lives in v(26 + N mod 3)
+my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28)); # rk3/rk4/rk5 = v26/v27/v28 (overwrites rk0..rk2 when loaded)
+my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28)); # rk6/rk7/rk8 = v26/v27/v28
+my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28)); # rk9/rk10/rk11 = v26/v27/v28
+my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28)); # rk12/rk13/rk14 = v26/v27/v28
+my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28)); # q views of the same three registers, same rotation
+my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28)); # q26/q27/q28
+my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28)); # q26/q27/q28
+my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28)); # q26/q27/q28
+my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28)); # q26/q27/q28
+my $rk2q1="v28.1q"; # .1q arrangement of v28 (the register $rk2 names)
+my $rk3q1="v26.1q"; # .1q arrangement of v26 (the register $rk3 names)
+my $rk4v="v27"; # v27 (the register $rk4 names) without an arrangement suffix
+#########################################################################################
+# size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
+#                               size_t len,
+#                               unsigned char *out,
+#                               const void *key,
+#                               unsigned char ivec[16],
+#                               u64 *Xi);
+#
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_enc_256_kernel
+.type   unroll8_eor3_aes_gcm_enc_256_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_enc_256_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L256_enc_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+       mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+
+       mov     $constant_temp, #0x100000000                    @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
+
+       ld1     { $acc_lb}, [$current_tag]
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
+
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
+
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
+
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
+
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
+
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
+
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
+
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
+
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
+
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
+
+       aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
+       aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
+       aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
+
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
+
+       aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
+       aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
+
+       aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
+       aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
+       aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
+
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       b.ge    .L256_enc_tail                                          @ handle tail
+
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
+
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
+
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 0 - result
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 1 - result
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 3 - result
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 2 - result
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 4 - result
+
+       eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 7 - result
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 6 - result
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 5 - result
+
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+       b.ge    .L256_enc_prepretail                                    @ do prepretail
+
+.L256_enc_main_loop:                                                   @ main loop start
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
+
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
+
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
+
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
+
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
+
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
+
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+       aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
+
+       ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
+       ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
+
+       ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
+       aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
+       aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
+
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+       aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
+
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
+       aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
+
+       eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 8k+10 - result
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+
+       aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
+       aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
+       eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 8k+13 - result
+
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
+
+       eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 8k+12 - result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+       eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 8k+11 - result
+
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+       eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 8k+9 - result
+       eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 8k+8 - result
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+       stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+
+       eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 7 - result
+       eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
+       stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+
+       eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 6 - result
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+       stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+
+       stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+       b.lt    .L256_enc_main_loop
+
+.L256_enc_prepretail:                                                  @ PREPRETAIL
+       @ Generates counter blocks 8k+13..8k+15 into $ctr5..$ctr7, then runs AES
+       @ rounds 0-13 over $ctr0b..$ctr7b with round keys reloaded from [$cc].
+       @ Interleaved with the AES rounds is the GHASH of $res0..$res7 (each
+       @ byte-reversed with rev64) and the modulo reduction into $acc_l.
+       @ Round 13 is a bare aese (no aesmc); $rk14 is loaded here but is not
+       @ xored in by this section. The low-half pmulls write their products back
+       @ into $h1..$h7 and the $hNNk registers, clobbering those key powers.
+       @ Execution falls through into .L256_enc_tail.
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap 64-bit halves of the GHASH accumulator
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - xor the accumulator into GHASH block 8k
+
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
+
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
+
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
+
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
+       aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
+
+       eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
+       aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
+       aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
+
+       aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
+       aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
+       aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
+
+       aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
+       aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
+.L256_enc_tail:                                                                @ TAIL
+       @ Entry to the tail: computes the byte count still to process, reloads
+       @ the hash key powers h5..h8 (plus h56k/h78k), saves the half-swapped
+       @ GHASH accumulator in $t0 as the partial tag and copies $rk14 to $t1.
+       @ The first remaining block is encrypted up front into $res1b from
+       @ $ctr0b. The byte count then selects the entry point:
+       @   > 112 -> .L256_enc_blocks_more_than_7, > 96 -> ..._more_than_6,
+       @   > 80 -> ..._more_than_5, > 64 -> ..._more_than_4, > 48 -> ..._more_than_3,
+       @   > 32 -> ..._more_than_2, > 16 -> ..._more_than_1, else ..._less_than_1.
+       @ On every level that is not taken, the keystream registers are moved up
+       @ by one so that $ctr1b's value lands in the register the chosen section
+       @ reads ($ctr2b for more_than_6, $ctr3b for more_than_5, ... $ctr6b for
+       @ more_than_2), and $rtmp_ctr is stepped back by one $rctr_inc.
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k (#192) and h8l | h8h (#208)
+        ext     $h8.16b, $h8.16b, $h8.16b, #8
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+
+       ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
+
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h (#128) and h6k | h5k (#144)
+        ext     $h5.16b, $h5.16b, $h5.16b, #8
+
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h (#160) and h7l | h7h (#176)
+        ext     $h6.16b, $h6.16b, $h6.16b, #8
+        ext     $h7.16b, $h7.16b, $h7.16b, #8
+       mov     $t1.16b, $rk14                                          @ keep the last round key in $t1 for the tail's result eor3s
+
+       cmp     $main_end_input_ptr, #112                               @ more than 7 blocks (7 * 16 bytes) left?
+       eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                               @ AES block 8k+8 - result
+       b.gt    .L256_enc_blocks_more_than_7
+
+       @ 7 blocks or fewer: the GHASH accumulators restart from zero here; the
+       @ previous accumulator value is carried only in $t0.
+       movi    $acc_l.8b, #0
+       mov     $ctr7b, $ctr6b
+       movi    $acc_h.8b, #0
+
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+
+       mov     $ctr3b, $ctr2b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr2b, $ctr1b
+
+       movi    $acc_m.8b, #0
+       cmp     $main_end_input_ptr, #96                                @ more than 6 blocks left?
+       b.gt    .L256_enc_blocks_more_than_6
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+       cmp     $main_end_input_ptr, #80                                @ more than 5 blocks left?
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr1b
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L256_enc_blocks_more_than_5
+
+       mov     $ctr7b, $ctr6b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr4b
+
+       cmp     $main_end_input_ptr, #64                                @ more than 4 blocks left?
+       mov     $ctr4b, $ctr1b
+       b.gt    .L256_enc_blocks_more_than_4
+
+       cmp     $main_end_input_ptr, #48                                @ more than 3 blocks left?
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr1b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L256_enc_blocks_more_than_3
+
+       cmp     $main_end_input_ptr, #32                                @ more than 2 blocks left?
+       mov     $ctr7b, $ctr6b
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       mov     $ctr6b, $ctr1b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       b.gt    .L256_enc_blocks_more_than_2
+
+       mov     $ctr7b, $ctr1b
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       cmp     $main_end_input_ptr, #16                                @ more than 1 block left?
+       b.gt    .L256_enc_blocks_more_than_1
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       b        .L256_enc_blocks_less_than_1
+.L256_enc_blocks_more_than_7:                                          @ blocks left >  7
+       @ Stores the ciphertext already in $res1b, hashes it with h8 (mid term
+       @ uses the upper half of $h78k) and encrypts the next block from $ctr1b.
+       @ The three pmulls here write $acc_h/$acc_m/$acc_l directly rather than
+       @ accumulating, so this entry point initialises the GHASH accumulators.
+       @ Falls through into .L256_enc_blocks_more_than_6.
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
+
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+.L256_enc_blocks_more_than_6:                                          @ blocks left >  6
+       @ Stores the ciphertext in $res1b, xors its h7 products into
+       @ $acc_h/$acc_m/$acc_l (mid term uses the lower half of $h78k) and
+       @ encrypts the next block from $ctr2b. $rk2/$rk3/$rk4v are written as
+       @ scratch here. Falls through into .L256_enc_blocks_more_than_5.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+.L256_enc_blocks_more_than_5:                                          @ blocks left >  5
+       @ Stores the ciphertext in $res1b, xors its h6 products into
+       @ $acc_h/$acc_m/$acc_l and encrypts the next block from $ctr3b. The mid
+       @ term is duplicated into the upper lane of $rk4v so that pmull2 can
+       @ multiply it by the upper half of $h56k.
+       @ Falls through into .L256_enc_blocks_more_than_4.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
+       pmull   $rk3q1, $res0.1d, $h6.1d                                  @ GHASH final-5 block - low
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+       eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
+.L256_enc_blocks_more_than_4:                                          @ blocks left >  4
+       @ Stores the ciphertext in $res1b, xors its h5 products into
+       @ $acc_h/$acc_m/$acc_l (mid term uses the lower half of $h56k) and
+       @ encrypts the next block from $ctr4b.
+       @ Falls through into .L256_enc_blocks_more_than_3.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+
+       eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+.L256_enc_blocks_more_than_3:                                          @ blocks left >  3
+       @ Stores the ciphertext in $res1b, loads h4 and h34k, xors the block's
+       @ h4 products into $acc_h/$acc_m/$acc_l and encrypts the next block from
+       @ $ctr5b. The mid term is duplicated into the upper lane of $rk4v so that
+       @ pmull2 can multiply it by the upper half of $h34k.
+       @ Falls through into .L256_enc_blocks_more_than_2.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
+
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+
+       eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+.L256_enc_blocks_more_than_2:                                          @ blocks left >  2
+       @ Store the final-2 result block and fold it into the GHASH accumulators
+       @ (h3 for high and low, lower half of h34k for mid), then form the
+       @ final-1 result block from the next 16 bytes of input.
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap the two 64-bit halves of h3
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+       eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+.L256_enc_blocks_more_than_1:                                          @ blocks left >  1
+       @ Store the final-1 result block and fold it into the GHASH accumulators
+       @ (h2 for high and low, upper half of h12k for mid), then form the
+       @ final result block from the next 16 bytes of input.
+
+       st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
+
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap the two 64-bit halves of h2
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+       ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       movi    $t0.8b, #0                                              @ suppress further partial tag feed in
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+
+       eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+.L256_enc_blocks_less_than_1:                                          @ blocks left <= 1
+       @ Final, possibly partial, block: build a 128-bit mask from the bit
+       @ length, mask the result block, merge it with the bytes already present
+       @ at the output location, finish GHASH, run the modulo reduction, store
+       @ the tag and the updated counter, then restore registers and return.
+
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       cmp     $bit_length, #64
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+
+       csel    $temp3_x, $temp0_x, xzr, lt                             @ mask for d[1]: partial mask when fewer than 64 bits are dropped, else zero
+       csel    $temp2_x, $temp1_x, $temp0_x, lt                        @ mask for d[0]: all ones when fewer than 64 bits are dropped, else partial mask
+
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap the two 64-bit halves of h1
+
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+       mov     $ctr0.d[1], $temp3_x
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b                            @ byte-reverse each 32-bit word of the counter before it is written back
+       bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       st1     { $res1b}, [$output_ptr]                                @ store all 16B
+
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+
+       eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
+               ext     $acc_lb, $acc_lb, $acc_lb, #8                   @ swap the 64-bit halves of the reduced tag
+       rev64   $acc_lb, $acc_lb                                        @ byte-reverse each 64-bit half before storing
+       st1     { $acc_l.16b }, [$current_tag]                          @ write the updated tag back
+       @ NOTE(review): bit_length was overwritten above with (128 - bits) mod 128
+       @ to build the mask, so x0 is derived from that value rather than from
+       @ the length passed in - confirm callers do not rely on the return value.
+       lsr     x0, $bit_length, #3                                     @ return sizes
+
+        ldp     d10, d11, [sp, #16]                                    @ restore d10-d15 from the frame
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80                                       @ restore d8, d9 and release the 80-byte frame
+       ret
+
+.L256_enc_ret:
+       @ Early exit: returns 0 without restoring d8-d15 or adjusting sp.
+       @ Presumably reached from a zero-length check made before the frame is
+       @ pushed (the branch is not visible in this part of the file) - confirm.
+       mov w0, #0x0                                                    @ writing w0 also clears the upper half of x0
+       ret
+.size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
+___
+
+{
+#########################################################################################
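+# size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
+#                               size_t len,             /* length in bits */
+#                               unsigned char *out,
+#                               u64 *Xi,
+#                               unsigned char ivec[16],
+#                               const void *key);
+#
+# Note: the code below takes the counter block from x4 (fifth argument) and
+# the round keys from x5 (sixth argument), and shifts the length right by 3
+# to obtain the byte count. That leaves the fourth argument for Xi (the
+# current tag / hash table pointer) - confirm against the C declaration.
+#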
+$code.=<<___;
+.global unroll8_eor3_aes_gcm_dec_256_kernel
+.type   unroll8_eor3_aes_gcm_dec_256_kernel,%function
+.align  4
+unroll8_eor3_aes_gcm_dec_256_kernel:
+       AARCH64_VALID_CALL_TARGET
+       cbz     x1, .L256_dec_ret
+       stp     d8, d9, [sp, #-80]!
+       mov     $counter, x4
+       mov     $cc, x5
+       stp     d10, d11, [sp, #16]
+       stp     d12, d13, [sp, #32]
+       stp     d14, d15, [sp, #48]
+       mov     x5, #0xc200000000000000
+       stp     x5, xzr, [sp, #64]
+       add     $modulo_constant, sp, #64
+
+       ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
+
+       mov     $constant_temp, #0x100000000                    @ set up counter increment
+       movi    $rctr_inc.16b, #0x0
+       mov     $rctr_inc.d[1], $constant_temp
+       lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
+
+       sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
+
+       rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
+
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
+
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
+
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
+
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
+
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
+
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
+
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
+
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
+       ldp     $rk4q, $rk5q, [$cc, #64]                                                @ load rk4, rk5
+
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
+
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
+
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
+
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
+
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
+
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
+
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
+
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
+
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
+
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
+
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
+
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
+
+       and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
+
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
+
+       ld1     { $acc_lb}, [$current_tag]
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       add     $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
+       add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
+
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
+
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
+
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
+
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
+
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
+
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
+
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
+
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
+
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
+
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
+
+       aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
+       aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
+       aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
+
+       aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
+       aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
+       aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
+
+       aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
+       aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
+       b.ge    .L256_dec_tail                                          @ handle tail
+
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
+
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
+
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
+
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
+       cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
+
+       eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 1 - result
+       eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 0 - result
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
+
+       rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
+       eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 3 - result
+
+       eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 5 - result
+
+       eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 4 - result
+       rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
+
+       eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 2 - result
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
+
+       rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
+
+       eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 6 - result
+
+       rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
+
+       eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 7 - result
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
+
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
+       b.ge    .L256_dec_prepretail                                    @ do prepretail
+
+.L256_dec_main_loop:                                                   @ main loop start
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+
+       ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
+
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+       rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
+
+       ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
+
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
+
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
+
+       rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
+
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
+
+       ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
+       aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
+       aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
+
+       ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
+       aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
+       aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
+
+       rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
+       eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 8k+10 - result
+       eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 8k+9 - result
+
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
+
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
+
+       eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 8k+13 - result
+       eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 8k+8 - result
+       aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
+
+       stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
+       mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
+       eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 8k+12 - result
+
+       eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
+       eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 8k+11 - result
+       stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
+
+       mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
+       mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
+       aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
+
+       mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
+       stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
+       eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 8k+15 - result
+
+       eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 8k+14 - result
+       rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
+
+       cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
+       stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
+       b.lt    .L256_dec_main_loop
+
+.L256_dec_prepretail:                                                  @ PREPRETAIL
+       ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
+       rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
+
+       rev64   $res4b, $res4b                                          @ GHASH block 8k+4
+       ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
+       ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
+
+       rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
+       rev64   $res0b, $res0b                                          @ GHASH block 8k
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
+
+       ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
+       ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
+       ext     $h7.16b, $h7.16b, $h7.16b, #8
+       ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
+       ext     $h8.16b, $h8.16b, $h8.16b, #8
+       rev64   $res1b, $res1b                                          @ GHASH block 8k+1
+
+       rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
+       rev64   $res2b, $res2b                                          @ GHASH block 8k+2
+       ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
+       ext     $h5.16b, $h5.16b, $h5.16b, #8
+       ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
+       ext     $h6.16b, $h6.16b, $h6.16b, #8
+
+       aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
+       aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
+       aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
+
+       aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
+       aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
+       aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
+
+       aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
+       aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
+       aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
+
+       ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
+       aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
+       eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
+
+       aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
+       aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
+       aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
+
+       aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
+       aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
+       aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
+
+       pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
+       trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
+
+       rev64   $res3b, $res3b                                          @ GHASH block 8k+3
+       pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
+
+       aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
+       aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
+       aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
+
+       aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
+       aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
+
+       aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
+       aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
+
+       aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
+       rev64   $res6b, $res6b                                          @ GHASH block 8k+6
+
+       aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
+       aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
+       aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
+
+       pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
+       trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
+       aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
+
+       ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
+       aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
+       pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
+
+       aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
+       eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
+       eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
+
+       aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
+       pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
+       aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
+
+       eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
+       trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
+       trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
+
+       pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
+       pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
+       eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
+
+       pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
+       aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
+       aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
+
+       eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+       aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
+
+       aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
+       aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
+       eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
+
+       eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
+       aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
+       aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
+
+       aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
+       aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
+       aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
+
+       aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
+       pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
+       aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
+
+       aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
+       aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
+       pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
+
+       aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
+       aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
+       ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       rev64   $res7b, $res7b                                          @ GHASH block 8k+7
+       rev64   $res5b, $res5b                                          @ GHASH block 8k+5
+
+       eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
+
+       trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
+
+       aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
+       aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
+
+       pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
+       pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
+       pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
+
+       trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
+       pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
+       trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
+       pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
+       aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
+
+       aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
+       aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
+       aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
+
+       ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
+       pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
+       aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
+
+       aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
+       aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
+
+       aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
+       aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
+       eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
+
+       aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
+       trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
+       aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
+
+       aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
+       aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
+       aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
+
+       aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
+       aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
+       aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
+
+       aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
+       aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
+       eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
+       aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
+       eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
+       aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
+       pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
+
+       aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
+       pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
+       pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
+
+       pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
+       pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
+       pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
+
+       ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
+       eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
+       eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
+
+       aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
+       aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
+       aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
+
+       eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
+       eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+
+       eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
+
+       aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
+       aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
+       aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
+
+       aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
+       aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
+       aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
+
+       eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
+
+       aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
+       aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
+       ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
+
+       ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
+
+       aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
+       aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
+       aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
+
+       pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
+
+       aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
+       aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
+       aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
+
+       aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
+       aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
+
+       eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
+
+       aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
+       aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
+       aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+       aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
+       aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
+
+       aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
+       ldr     $rk14q, [$cc, #224]                                     @ load rk14
+       aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
+
+       aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
+       ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
+       aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
+
+       aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
+       aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
+       aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
+
+       aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
+       eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
+       add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
+
+       aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
+       aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
+.L256_dec_tail:                                                                @ TAIL
+
+       ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
+       sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
+       cmp     $main_end_input_ptr, #112
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
+
+       ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8l | h8h
+        ext     $h8.16b, $h8.16b, $h8.16b, #8
+       mov     $t1.16b, $rk14
+
+       ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
+        ext     $h5.16b, $h5.16b, $h5.16b, #8
+
+       eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
+       ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
+        ext     $h6.16b, $h6.16b, $h6.16b, #8
+        ext     $h7.16b, $h7.16b, $h7.16b, #8
+       b.gt    .L256_dec_blocks_more_than_7
+
+       mov     $ctr7b, $ctr6b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr3b
+       movi    $acc_l.8b, #0
+
+       movi    $acc_h.8b, #0
+       movi    $acc_m.8b, #0
+       mov     $ctr3b, $ctr2b
+
+       cmp     $main_end_input_ptr, #96
+       mov     $ctr2b, $ctr1b
+       b.gt    .L256_dec_blocks_more_than_6
+
+       mov     $ctr7b, $ctr6b
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr4b
+       cmp     $main_end_input_ptr, #80
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr4b, $ctr3b
+       mov     $ctr3b, $ctr1b
+       b.gt    .L256_dec_blocks_more_than_5
+
+       cmp     $main_end_input_ptr, #64
+       mov     $ctr7b, $ctr6b
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr6b, $ctr5b
+
+       mov     $ctr5b, $ctr4b
+       mov     $ctr4b, $ctr1b
+       b.gt    .L256_dec_blocks_more_than_4
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b
+       cmp     $main_end_input_ptr, #48
+
+       mov     $ctr6b, $ctr5b
+       mov     $ctr5b, $ctr1b
+       b.gt    .L256_dec_blocks_more_than_3
+
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       mov     $ctr7b, $ctr6b
+
+       cmp     $main_end_input_ptr, #32
+       mov     $ctr6b, $ctr1b
+       b.gt    .L256_dec_blocks_more_than_2
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+
+       mov     $ctr7b, $ctr1b
+       cmp     $main_end_input_ptr, #16
+       b.gt    .L256_dec_blocks_more_than_1
+
+       sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       b        .L256_dec_blocks_less_than_1
+.L256_dec_blocks_more_than_7:                                          @ blocks left >  7
+       rev64   $res0b, $res1b                                          @ GHASH final-7 block
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
+
+       ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
+       eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
+
+       pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+
+       pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
+       pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
+.L256_dec_blocks_more_than_6:                                          @ blocks left >  6
+
+       rev64   $res0b, $res1b                                          @ GHASH final-6 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
+       pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
+
+       pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
+
+       eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
+
+       pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
+.L256_dec_blocks_more_than_5:                                          @ blocks left >  5
+
+       rev64   $res0b, $res1b                                          @ GHASH final-5 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
+
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
+
+       pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
+       eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+.L256_dec_blocks_more_than_4:                                          @ blocks left >  4
+
+       rev64   $res0b, $res1b                                          @ GHASH final-4 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
+
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+
+       pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
+       pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
+
+       pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
+       eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
+.L256_dec_blocks_more_than_3:                                          @ blocks left >  3
+
+       ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
+       ext     $h4.16b, $h4.16b, $h4.16b, #8
+       rev64   $res0b, $res1b                                          @ GHASH final-3 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
+       ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
+
+       eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
+       pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
+       pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
+
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+       pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
+
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
+.L256_dec_blocks_more_than_2:                                          @ blocks left >  2
+
+       rev64   $res0b, $res1b                                          @ GHASH final-2 block
+
+       ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
+       ext     $h3.16b, $h3.16b, $h3.16b, #8
+       ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
+
+       pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
+       eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+
+       pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
+       pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
+.L256_dec_blocks_more_than_1:                                          @ blocks left >  1
+
+       rev64   $res0b, $res1b                                          @ GHASH final-1 block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
+       ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
+       ext     $h2.16b, $h2.16b, $h2.16b, #8
+
+       eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
+       ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
+       st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
+
+       ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
+       pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
+
+       ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
+
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
+
+       eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
+       pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
+
+       pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
+
+       movi    $t0.8b, #0                                              @ surpress further partial tag feed in
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
+
+       eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
+.L256_dec_blocks_less_than_1:                                          @ blocks left <= 1
+
+       ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
+       mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
+       rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
+       str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
+
+       neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
+
+       and     $bit_length, $bit_length, #127                          @ bit_length %= 128
+
+       lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
+       cmp     $bit_length, #64
+       mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
+
+       csel    $temp3_x, $temp0_x, xzr, lt
+       csel    $temp2_x, $temp1_x, $temp0_x, lt
+
+       mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
+       mov     $ctr0.d[1], $temp3_x
+
+       and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
+       ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
+       ext     $h1.16b, $h1.16b, $h1.16b, #8
+       bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
+
+       rev64   $res0b, $res1b                                          @ GHASH final block
+
+       eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
+
+       ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
+       pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
+
+       eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
+
+       pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
+       eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
+
+       pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
+
+       eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
+       ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
+       eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
+
+       pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
+       eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
+
+       ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
+       st1     { $res4b}, [$output_ptr]                                @ store all 16B
+
+       eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
+
+       eor     $t11.16b, $acc_hb, $t11.16b                             @ MODULO - fold into mid
+       eor     $acc_mb, $acc_mb, $t11.16b                              @ MODULO - fold into mid
+
+       pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
+
+       ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
+       eor     $acc_lb, $acc_lb, $acc_hb                               @ MODULO - fold into low
+
+       eor     $acc_lb, $acc_lb, $acc_mb                               @ MODULO - fold into low
+       ext     $acc_lb, $acc_lb, $acc_lb, #8
+       rev64   $acc_lb, $acc_lb
+       st1     { $acc_l.16b }, [$current_tag]
+       lsr     x0, $bit_length, #3                                     @ return sizes
+
+        ldp     d10, d11, [sp, #16]
+       ldp     d12, d13, [sp, #32]
+       ldp     d14, d15, [sp, #48]
+       ldp     d8, d9, [sp], #80
+       ret
+
+.L256_dec_ret:
+       mov w0, #0x0
+       ret
+.size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
+___
+}
+}
+
+$code.=<<___;   # trailer text: identification string, an alignment directive, and an #endif matching a conditional opened earlier in the generated assembly
+.asciz  "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
+.align  2
+#endif
+___
+
+{                                       # bare block: %opcode is a lexical visible only to the code inside these braces
+    my  %opcode = (                     # base instruction word for each mnemonic that unsha3 encodes by hand
+    "rax1"    => 0xce608c00,    "eor3"    => 0xce000000,
+    "bcax"    => 0xce200000,    "xar"    => 0xce800000    );
+    # unsha3(mnemonic, operands): returns a ".inst 0x........" line holding the encoded instruction word
+    sub unsha3 {
+         my ($mnemonic,$arg)=@_;
+         # captures up to four operand numbers in $1..$4; the 3rd and 4th are optional and the 4th may be written as "#imm"
+         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
+         &&
+         sprintf ".inst\t0x%08x\t//%s %s",
+            $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),   # operands are OR-ed into the base word at bit offsets 0, 5, 16 and 10
+            $mnemonic,$arg;              # the original mnemonic and operand text follow the word as a // comment
+    }
+    sub unvmov {                        # rewrites "qN#lo|hi, qM#lo|hi" as "ins vX.d[i],vY.d[j]"; the loop in this block never calls it
+        my $arg=shift;
+        # register numbers below 8 are kept, others get 8 added; "lo" selects d[0] and anything else d[1]
+        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
+        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
+                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
+    }
+    # post-process the accumulated $code text one line at a time and print the result
+     foreach(split("\n",$code)) {
+        s/@\s/\/\//o;               # old->new style commentary
+        s/\`([^\`]*)\`/eval($1)/ge;     # replace each backtick-quoted span with the result of evaluating it as Perl
+        # lines mentioning ld1r get .16b rewritten to .2d; when that rewrite did not apply, eor3/rax1/xar/bcax lines are passed to unsha3
+        m/\bld1r\b/ and s/\.16b/.2d/g    or
+        s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
+        print $_,"\n";                  # emit the (possibly rewritten) line
+     }
+}
+
+close STDOUT or die "error closing STDOUT: $!"; # enforce flush
index a1cfad0ef6fd1824517892416c1aaaf6f74ac6e3..4dbcbe8d0c05a0a38c4ddc76fe1b40166f135d2c 100644 (file)
@@ -158,6 +158,7 @@ $code.=<<___;
 ___
 if ($flavour =~ /64/) {
 my ($t3,$Yl,$Ym,$Yh) = map("q$_",(4..7));
+my ($H3,$H34k,$H4,$H5,$H56k,$H6,$H7,$H78k,$H8) = map("q$_",(15..23));
 
 $code.=<<___;
        @ calculate H^3 and H^4
@@ -192,15 +193,103 @@ $code.=<<___;
         vpmull.p64     $Yl,$Yl,$xC2
        veor            $t2,$t2,$Xh
         veor           $t3,$t3,$Yh
-       veor            $H, $Xl,$t2             @ H^3
-        veor           $H2,$Yl,$t3             @ H^4
+       veor            $H3, $Xl,$t2            @ H^3
+        veor           $H4,$Yl,$t3             @ H^4
+
+       vext.8          $t0,$H3, $H3,#8         @ Karatsuba pre-processing
+        vext.8         $t1,$H4,$H4,#8
+        vext.8         $t2,$H2,$H2,#8
+       veor            $t0,$t0,$H3
+        veor           $t1,$t1,$H4
+        veor           $t2,$t2,$H2
+       vext.8          $H34k,$t0,$t1,#8                @ pack Karatsuba pre-processed
+       vst1.64         {$H3-$H4},[x0],#48              @ store Htable[3..5]
+
+       @ calculate H^5 and H^6
+       vpmull.p64      $Xl,$H2, $H3
+        vpmull.p64     $Yl,$H3,$H3
+       vpmull2.p64     $Xh,$H2, $H3
+        vpmull2.p64    $Yh,$H3,$H3
+       vpmull.p64      $Xm,$t0,$t2
+        vpmull.p64     $Ym,$t0,$t0
 
-       vext.8          $t0,$H, $H,#8           @ Karatsuba pre-processing
-        vext.8         $t1,$H2,$H2,#8
-       veor            $t0,$t0,$H
-        veor           $t1,$t1,$H2
-       vext.8          $Hhl,$t0,$t1,#8         @ pack Karatsuba pre-processed
-       vst1.64         {$H-$H2},[x0]           @ store Htable[3..5]
+       vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
+        vext.8         $t1,$Yl,$Yh,#8
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t0
+        veor           $t3,$Yl,$Yh
+        veor           $Ym,$Ym,$t1
+       veor            $Xm,$Xm,$t2
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
+        veor           $Ym,$Ym,$t3
+        vpmull.p64     $t3,$Yl,$xC2
+
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+        vmov           $Yh#lo,$Ym#hi
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+        vmov           $Ym#hi,$Yl#lo
+       veor            $Xl,$Xm,$t2
+        veor           $Yl,$Ym,$t3
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
+        vext.8         $t3,$Yl,$Yl,#8
+       vpmull.p64      $Xl,$Xl,$xC2
+        vpmull.p64     $Yl,$Yl,$xC2
+       veor            $t2,$t2,$Xh
+        veor           $t3,$t3,$Yh
+       veor            $H5,$Xl,$t2             @ H^5
+        veor           $H6,$Yl,$t3             @ H^6
+
+       vext.8          $t0,$H5, $H5,#8         @ Karatsuba pre-processing
+        vext.8         $t1,$H6,$H6,#8
+        vext.8         $t2,$H2,$H2,#8
+       veor            $t0,$t0,$H5
+        veor           $t1,$t1,$H6
+        veor           $t2,$t2,$H2
+       vext.8          $H56k,$t0,$t1,#8                @ pack Karatsuba pre-processed
+       vst1.64         {$H5-$H6},[x0],#48              @ store Htable[6..8]
+
+       @ calculate H^7 and H^8
+       vpmull.p64      $Xl,$H2,$H5
+        vpmull.p64     $Yl,$H2,$H6
+       vpmull2.p64     $Xh,$H2,$H5
+        vpmull2.p64    $Yh,$H2,$H6
+       vpmull.p64      $Xm,$t0,$t2
+        vpmull.p64     $Ym,$t1,$t2
+
+       vext.8          $t0,$Xl,$Xh,#8          @ Karatsuba post-processing
+        vext.8         $t1,$Yl,$Yh,#8
+       veor            $t2,$Xl,$Xh
+       veor            $Xm,$Xm,$t0
+        veor           $t3,$Yl,$Yh
+        veor           $Ym,$Ym,$t1
+       veor            $Xm,$Xm,$t2
+       vpmull.p64      $t2,$Xl,$xC2            @ 1st phase
+        veor           $Ym,$Ym,$t3
+        vpmull.p64     $t3,$Yl,$xC2
+
+       vmov            $Xh#lo,$Xm#hi           @ Xh|Xm - 256-bit result
+        vmov           $Yh#lo,$Ym#hi
+       vmov            $Xm#hi,$Xl#lo           @ Xm is rotated Xl
+        vmov           $Ym#hi,$Yl#lo
+       veor            $Xl,$Xm,$t2
+        veor           $Yl,$Ym,$t3
+
+       vext.8          $t2,$Xl,$Xl,#8          @ 2nd phase
+        vext.8         $t3,$Yl,$Yl,#8
+       vpmull.p64      $Xl,$Xl,$xC2
+        vpmull.p64     $Yl,$Yl,$xC2
+       veor            $t2,$t2,$Xh
+        veor           $t3,$t3,$Yh
+       veor            $H7,$Xl,$t2             @ H^7
+        veor           $H8,$Yl,$t3             @ H^8
+
+       vext.8          $t0,$H7,$H7,#8          @ Karatsuba pre-processing
+        vext.8         $t1,$H8,$H8,#8
+       veor            $t0,$t0,$H7
+        veor           $t1,$t1,$H8
+       vext.8          $H78k,$t0,$t1,#8                @ pack Karatsuba pre-processed
+       vst1.64         {$H7-$H8},[x0]          @ store Htable[9..11]
 ___
 }
 $code.=<<___;
index 0ea122ea98036bb75610b9c7814b7d4c3db78193..3166cdc2a61e59410902e516d56943f4ff6b1091 100644 (file)
@@ -24,7 +24,7 @@ IF[{- !$disabled{asm} -}]
 
   $MODESASM_armv4=ghash-armv4.S ghashv8-armx.S
   $MODESDEF_armv4=GHASH_ASM
-  $MODESASM_aarch64=ghashv8-armx.S aes-gcm-armv8_64.S
+  $MODESASM_aarch64=ghashv8-armx.S aes-gcm-armv8_64.S aes-gcm-armv8-unroll8_64.S
   $MODESDEF_aarch64=
 
   $MODESASM_parisc11=ghash-parisc.s
@@ -78,6 +78,8 @@ GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
 INCLUDE[ghashv8-armx.o]=..
 GENERATE[aes-gcm-armv8_64.S]=asm/aes-gcm-armv8_64.pl
 INCLUDE[aes-gcm-armv8_64.o]=..
+GENERATE[aes-gcm-armv8-unroll8_64.S]=asm/aes-gcm-armv8-unroll8_64.pl
+INCLUDE[aes-gcm-armv8-unroll8_64.o]=..
 GENERATE[ghash-s390x.S]=asm/ghash-s390x.pl
 INCLUDE[ghash-s390x.o]=..
 GENERATE[ghash-c64xplus.S]=asm/ghash-c64xplus.pl
index 9e74d86a50b159a4355f6df14d9cf56415ab9cd1..45021dfd9f7b4ddc43304f195f41833b5394c1a8 100644 (file)
@@ -129,6 +129,18 @@ size_t aes_gcm_dec_192_kernel(const uint8_t * ciphertext, uint64_t plaintext_len
                               uint64_t *Xi, unsigned char ivec[16], const void *key);
 size_t aes_gcm_dec_256_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
                               uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_enc_128_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_enc_192_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_enc_256_kernel(const uint8_t * plaintext, uint64_t plaintext_length, uint8_t * ciphertext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_dec_128_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_dec_192_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
+size_t unroll8_eor3_aes_gcm_dec_256_kernel(const uint8_t * ciphertext, uint64_t plaintext_length, uint8_t * plaintext,
+                              uint64_t *Xi, unsigned char ivec[16], const void *key);
 size_t armv8_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key,
                              unsigned char ivec[16], u64 *Xi);
 size_t armv8_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, const void *key,
index db2678714e27f5295f2816a52517fc21176feb6d..999e8215dfee1607554c41e876c227873ef08920 100644 (file)
@@ -22,13 +22,25 @@ size_t armv8_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t
 
     switch(aes_key->rounds) {
         case 10:
-            aes_gcm_enc_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_enc_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_enc_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
         case 12:
-            aes_gcm_enc_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_enc_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_enc_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
         case 14:
-            aes_gcm_enc_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_enc_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_enc_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
     }
     return align_bytes;
@@ -44,13 +56,25 @@ size_t armv8_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t
 
     switch(aes_key->rounds) {
         case 10:
-            aes_gcm_dec_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_dec_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_dec_128_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
         case 12:
-            aes_gcm_dec_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_dec_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_dec_192_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
         case 14:
-            aes_gcm_dec_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            if (IS_CPU_SUPPORT_UNROLL8_EOR3()) {
+                unroll8_eor3_aes_gcm_dec_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            } else {
+                aes_gcm_dec_256_kernel(in, align_bytes * 8, out, (uint64_t *)Xi, ivec, key);
+            }
             break;
     }
     return align_bytes;