d17f4b16ef4627a38a86d75c60ffe05ad3eb4655
[openssl.git] / crypto / modes / asm / aes-gcm-armv8-unroll8_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 #========================================================================
11 # Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12 # derived from https://github.com/ARM-software/AArch64cryptolib, original
13 # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14 # licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15 # obtain it.
16 #========================================================================
17 #
18 # Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
19 # Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
20 # intermediate hashes from the 8 blocks.
21 #
22 #  ____________________________________________________
23 # |                                                    |
24 # | PRE                                                |
25 # |____________________________________________________|
26 # |                |                |                  |
27 # | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28 # |________________|________________|__________________|
29 # |                |                |                  |
30 # | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31 # |________________|________________|__________________|
32 # |                |                |                  |
33 # | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34 # |________________|________________|__________________|
35 # |                |                |                  |
36 # | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37 # |________________|________________|__________________|
38 # |                |                |                  |
39 # | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40 # |________________|________________|__________________|
41 # |                |                |                  |
42 # | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43 # |________________|________________|__________________|
44 # |                |                |                  |
45 # | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46 # |________________|________________|__________________|
47 # |                |                |                  |
48 # | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49 # |________________|____(mostly)____|__________________|
50 # |                                                    |
51 # | MODULO                                             |
52 # |____________________________________________________|
53 #
54 # PRE:
55 #     Ensure previously generated intermediate hash is aligned and merged with result for GHASH 8k+0
56 # EXT low_acc, low_acc, low_acc, #8
57 # EOR res_curr (8k+0), res_curr (8k+0), low_acc
58 #
59 # CTR block:
60 #     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61 # REV     ctr32, rev_ctr32
62 # ORR     ctr64, constctr96_top32, ctr32, LSL #32
63 # INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
64 # INS     ctr_next.d[1], ctr64X
65 # ADD     rev_ctr32, #1
66 #
67 # AES block:
68 #      Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below for example.
69 #      Doing a small trick here of loading input in scalar registers, EORing with the last round key and then
70 #      transferring to SIMD registers. Given we are very constrained in our ASIMD registers this is quite important
71 #
72 #      Encrypt:
73 # LDR     input_low, [ input_ptr  ], #8
74 # LDR     input_high, [ input_ptr  ], #8
75 # EOR     input_low, k14_low
76 # EOR     input_high, k14_high
77 # INS     res_curr.d[0], input_low
78 # INS     res_curr.d[1], input_high
79 # AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
80 # AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
81 # AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
82 # AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
83 # AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
84 # AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
85 # AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
86 # AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
87 # AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
88 # AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
89 # AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
90 # AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
91 # AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
92 # AESE    ctr_curr, k13
93 # EOR     res_curr, res_curr, ctr_curr
94 # ST1     { res_curr.16b  }, [ output_ptr  ], #16
95 #
96 #     Decrypt:
97 # AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
98 # AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
99 # AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
100 # AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
101 # AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
102 # AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
103 # AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
104 # AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
105 # AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
106 # AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
107 # AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
108 # AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
109 # AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
110 # AESE    ctr_curr, k13
111 # LDR     res_curr, [ input_ptr  ], #16
112 # EOR     res_curr, res_curr, ctr_curr
113 # MOV     output_low, res_curr.d[0]
114 # MOV     output_high, res_curr.d[1]
115 # EOR     output_low, k14_low
116 # EOR     output_high, k14_high
117 # STP     output_low, output_high, [ output_ptr  ], #16
118
119 # GHASH block X:
120 #     Do 128b karatsuba polynomial multiplication on block
121 #     We only have 64b->128b polynomial multipliers, so naively we need to do 4 64b multiplies to generate a 128b
122 #     multiplication:
123 #
124 #     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125 #
126 #     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127 #     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128 #
129 #     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130 #     multiplying with "twisted" powers of H
131 #
132 # Note: We can PMULL directly into the acc_x in first GHASH of the loop
133 # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134 #       path latency dominates the performance
135 #
136 #       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137 #       than indicated here
138 # REV64   res_curr, res_curr
139 # INS     t_m.d[0], res_curr.d[1]
140 # EOR     t_m.8B, t_m.8B, res_curr.8B
141 # PMULL2  t_h, res_curr, HX
142 # PMULL   t_l, res_curr, HX
143 # PMULL   t_m, t_m, HX_k
144 # EOR     acc_h, acc_h, t_h
145 # EOR     acc_l, acc_l, t_l
146 # EOR     acc_m, acc_m, t_m
147 #
148 # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149 #         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150 #         with a reversed constant
151 # EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
152 # PMULL   t_mod, acc_h, mod_constant
153 # EXT     acc_h, acc_h, acc_h, #8
154 # EOR3     acc_m, acc_m, t_mod, acc_h
155 # PMULL   acc_h, acc_m, mod_constant
156 # EXT     acc_m, acc_m, acc_m, #8
157 # EOR3    acc_l, acc_l, acc_m, acc_h
158
159 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;   # last argument is the output file if it ends in an extension
160 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;         # first argument is the flavour if it contains no dot
161 # Find arm-xlate.pl next to this script, or in ../../perlasm relative to it.
162 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
163 ( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
164 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
165 die "can't locate arm-xlate.pl";
166 # Only 64-bit flavours are supported.
167 die "only for 64 bit" if $flavour !~ /64/;
168 # Pipe everything printed to STDOUT through arm-xlate.pl; die if the pipe cannot be started.
169 open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
170 *STDOUT=*OUT;
171
172 $code=<<___;                                # start the generated output: include arm_arch.h and open the __ARM_MAX_ARCH__ guard
173 #include "arm_arch.h"
174
175 #if __ARM_MAX_ARCH__>=8
176 ___
177 $code.=".arch   armv8-a+crypto\n.text\n";   # assembler directives: .arch armv8-a+crypto, then switch to the .text section
178
179 $input_ptr="x0";  # argument block: x0 = input pointer, advanced by the kernel's post-indexed loads
180 $bit_length="x1";         # input length in bits; the kernel shifts it right by 3 to get $byte_length
181 $byte_length="x9";        # input length in bytes, derived from $bit_length
182 $output_ptr="x2";         # output pointer, advanced by the kernel's post-indexed stores
183 $current_tag="x3";        # base address for the current GHASH value; H powers are loaded at offsets from it
184 $counter="x16";           # copy of x4; the initial counter block is loaded from this address
185 $constant_temp="x15";     # scratch for constants (holds the counter increment during setup)
186 $modulo_constant="x10";   # address of the 0xc200000000000000 reduction constant stored at sp+64
187 $cc="x8";                 # copy of x5; base address for the AES round-key loads
188 {
189 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
190 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
191 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
192 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
193 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
194 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
195 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
196
197 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
198 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
199 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
200
201 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
202 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
203
204 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
205 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
206 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
207 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
208
209 my $t0="v16";
210 my $t0d="d16";
211
212 my $t1="v29";
213 my $t2=$res1;
214 my $t3=$t1;
215
216 my $t4=$res0;
217 my $t5=$res2;
218 my $t6=$t0;
219
220 my $t7=$res3;
221 my $t8=$res4;
222 my $t9=$res5;
223
224 my $t10=$res6;
225 my $t11="v21";
226 my $t12=$t1;
227
228 my $rtmp_ctr="v30";
229 my $rtmp_ctrq="q30";
230 my $rctr_inc="v31";
231 my $rctr_incd="d31";
232
233 my $mod_constantd=$t0d;
234 my $mod_constant=$t0;
235
236 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
237 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
238 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
239 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
240 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
241 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
242 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
243 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
244 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
245 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
246 my $rk2q1="v28.1q";
247 my $rk3q1="v26.1q";
248 my $rk4v="v27";
249
250
251 #########################################################################################
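252 # size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,   /* x0 */
253 #                               size_t bit_len,                         /* x1, length in bits */
254 #                               unsigned char *out,                     /* x2 */
255 #                               u64 *Xi,                                /* x3, current tag followed by the H table */
256 #                               unsigned char ivec[16],                 /* x4 */
257 #                               const void *key);                       /* x5, AES round keys */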
258 #
259 $code.=<<___;
260 .global unroll8_eor3_aes_gcm_enc_128_kernel
261 .type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
262 .align  4
263 unroll8_eor3_aes_gcm_enc_128_kernel:
264         AARCH64_VALID_CALL_TARGET
265         cbz     x1, .L128_enc_ret
266         stp     d8, d9, [sp, #-80]!
267         lsr     $byte_length, $bit_length, #3
268         mov     $counter, x4
269         mov     $cc, x5
270         stp     d10, d11, [sp, #16]
271         stp     d12, d13, [sp, #32]
272         stp     d14, d15, [sp, #48]
273         mov     x5, #0xc200000000000000
274         stp     x5, xzr, [sp, #64]
275         add     $modulo_constant, sp, #64
276
277         mov     $constant_temp, #0x100000000                            @ set up counter increment
278         movi    $rctr_inc.16b, #0x0
279         mov     $rctr_inc.d[1], $constant_temp
280         mov     $main_end_input_ptr, $byte_length
281         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
282
283         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
284
285         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80           @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
286
287         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
288
289         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
290
291         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
292         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
293
294         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
295         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
296
297         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
298         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
299
300         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
301         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
302
303         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
304         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
305         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
306
307         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
308         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
309
310         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
311         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
312
313         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
314         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
315         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
316
317         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
318         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
319         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
320
321         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
322         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
323         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
324
325         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
326
327         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
328         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
329         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
330
331         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
332         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
333         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
334
335         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
336         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
337         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
338
339         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
340         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
341         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
342
343         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
344         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
345         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
346
347         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
348
349         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
350         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
351         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
352
353         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
354         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
355         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
356
357         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
358
359         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
360         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
361         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
362
363         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
364         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
365         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
366
367         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
368         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
369         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
370
371         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
372         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
373         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
374
375         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
376         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
377         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
378
379         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
380         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
381         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
382
383         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
384         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
385         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
386
387         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
388         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
389         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
390
391         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
392         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
393         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
394
395         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
396
397         ld1     { $acc_lb}, [$current_tag]
398         ext     $acc_lb, $acc_lb, $acc_lb, #8
399         rev64   $acc_lb, $acc_lb
400
401         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
402
403         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
404         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
405         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
406
407         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
408         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
409         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
410
411         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
412         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
413         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
414
415         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
416         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
417         ldr     $rk10q, [$cc, #160]                                     @ load rk10
418
419         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
420         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
421         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
422
423         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
424         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
425         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
426
427         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
428         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
429         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
430
431         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
432         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
433         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
434
435         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
436         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
437         b.ge    .L128_enc_tail                                          @ handle tail
438
439         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
440
441         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
442
443         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
444
445         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
446         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
447
448         eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 0 - result
449         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
450         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
451
452         eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 1 - result
453         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
454
455         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
456         eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 5 - result
457         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
458
459         eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 2 - result
460         eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 6 - result
461         eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 4 - result
462
463         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
464         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
465
466         eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 3 - result
467         eor3    $res7b, $ctr_t7b, $ctr7b,$rk10                          @ AES block 7 - result
468         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
469
470         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
471         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
472         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
473
474         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
475
476         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
477         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
478         b.ge    .L128_enc_prepretail                                    @ do prepretail
479
480 .L128_enc_main_loop:                                                    @ main loop start
481         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
482         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
483         ext     $h5.16b, $h5.16b, $h5.16b, #8
484         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
485         ext     $h6.16b, $h6.16b, $h6.16b, #8
486         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
487
488         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
489         rev64   $res0b, $res0b                                          @ GHASH block 8k
490         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
491         ext     $h7.16b, $h7.16b, $h7.16b, #8
492         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
493         ext     $h8.16b, $h8.16b, $h8.16b, #8
494
495         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
496         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
497         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
498
499         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
500         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
501         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
502         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
503
504         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
505         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
506         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
507
508         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
509
510         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
511         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
512         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
513
514         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
515         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
516         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
517
518         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
519         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
520         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
521
522         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
523         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
524         ext     $h3.16b, $h3.16b, $h3.16b, #8
525         ldr     $h4q, [$current_tag, #112]                              @ load h3l | h3h
526         ext     $h4.16b, $h4.16b, $h4.16b, #8
527         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
528
529         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
530         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
531         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
532
533         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
534         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
535         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
536
537         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
538         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
539         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
540
541         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
542         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
543         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
544
545         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
546         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
547         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
548
549         eor3    $acc_hb, $acc_hb, $t1.16b,$t2.16b                       @ GHASH block 8k+2, 8k+3 - high
550         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
551         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
552
553         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
554         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
555         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
556
557         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
558         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
559         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
560
561         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
562         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
563         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
564
565         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
566         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
567
568         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
569         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
570         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
571
572         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
573         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
574         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
575
576         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
577         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
578         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
579
580         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
581         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
582         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
583
584         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
585         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
586         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
587         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
588
589         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
590         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
591         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
592
593         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
594         ext     $h1.16b, $h1.16b, $h1.16b, #8
595         ldr     $h2q, [$current_tag, #64]                               @ load h1l | h1h
596         ext     $h2.16b, $h2.16b, $h2.16b, #8
597         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
598         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
599
600         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
601         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
602
603         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
604         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
605
606         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
607         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
608
609         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
610         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
611
612         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
613         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
614         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
615
616         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
617         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
618         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
619
620         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
621         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
622         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
623
624         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
625         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
626         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
627
628         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
629         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
630         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
631
632         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
633         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
634         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
635
636         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
637         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
638         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
639
640         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
641         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
642
643         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
644         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
645         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
646
647         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
648         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
649         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
650
651         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
652         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
653         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
654
655         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
656         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
657
658         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
659         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
660         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
661
662         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
663         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
664         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
665
666         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
667         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
668         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
669
670         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
671         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
672         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
673
674         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
675         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
676         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
677
678         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
679         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
680         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
681
682         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
683         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
684         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
685
686         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
687         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
688         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
689
690         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
691         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
692
693         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
694         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
695         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
696
697         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
698         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
699         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
700
701         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
702         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
703         ldr     $rk10q, [$cc, #160]                                     @ load rk10
704
705         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
706         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
707         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
708         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
709
710         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
711         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
712         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
713
714         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
715         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
716         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
717
718         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
719         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
720         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
721
722         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
723         eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 4 - result
724         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
725
726         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
727         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
728
729         eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 8k+10 - result
730
731         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
732         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
733
734         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
735         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
736
737         eor3    $res7b, $ctr_t7b, $ctr7b, $rk10                         @ AES block 7 - result
738         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
739         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
740
741         eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 8k+9 - result
742         eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 8k+11 - result
743         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
744
745         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
746         eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 5 - result
747         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
748
749         eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 8k+8 - result
750         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
751         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
752
753         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
754         eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 6 - result
755
756         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
757         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
758
759         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
760         b.lt    .L128_enc_main_loop
761
762 .L128_enc_prepretail:                                                   @ PREPRETAIL - GHASH the 8 blocks held in res0-res7 and run AES rounds 0-9 on ctr0-ctr7; no input load/output store and no rk10 XOR in this section
763         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
764         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
765         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap 64-bit halves of h7
766         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
767         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap 64-bit halves of h8
768         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap 64-bit halves of the accumulated hash
769
770         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
771         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap 64-bit halves of h5
772         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
773         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap 64-bit halves of h6
774         rev64   $res0b, $res0b                                          @ GHASH block 8k
775         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
776
777         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
778         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
779         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
780         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
781
782         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
783         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - fold accumulated hash into block 8k
784
785         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
786
787         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
788         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
789         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
790
791         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
792         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
793
794         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
795         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
796         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
797
798         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
799         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
800
801         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
802         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
803
804         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
805         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
806
807         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
808         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
809
810         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
811
812         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
813
814         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
815
816         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
817
818         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
819         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
820
821         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
822         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
823
824         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
825         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
826
827         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
828         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
829         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
830
831         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
832         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
833
834         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
835         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
836         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
837
838         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
839         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
840
841         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
842         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap 64-bit halves of h3
843         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
844         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap 64-bit halves of h4
845
846         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
847         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
848         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
849
850         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
851         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
852
853         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
854         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
855
856         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
857         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
858         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
859         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
860
861         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
862         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
863
864         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
865         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
866         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
867
868         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
869         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
870         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
871
872         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
873         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
874
875         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
876         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
877         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
878
879         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
880         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap 64-bit halves of h1
881         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
882         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap 64-bit halves of h2
883         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
884         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
885
886         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
887         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
888         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
889
890         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
891         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
892         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
893
894         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
895         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
896
897         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
898         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
899         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
900
901         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
902         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
903         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
904
905         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
906         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
907         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
908
909         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
910         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
911         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
912
913         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
914         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
915         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
916
917         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
918         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
919         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
920
921         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
922         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
923
924         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
925         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
926         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
927
928         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
929         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
930         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
931
932         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
933         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
934         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
935
936         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
937         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
938         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
939
940         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
941         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
942
943         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
944         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
945         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
946
947         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
948         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
949
950         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
951         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
952         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
953
954         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
955         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
956         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
957
958         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
959         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
960         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
961
962         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
963         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
964         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
965
966         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
967         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
968         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
969         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
970
971         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
972         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
973         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
974
975         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
976         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
977
978         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
979         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
980
981         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
982         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
983         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
984         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
985
986         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
987         eor3    $acc_lb, $acc_lb, $acc_hb, $acc_mb                      @ MODULO - fold into low
988         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
989
990         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
991         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
992         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
993
994         ldr     $rk10q, [$cc, #160]                                     @ load rk10
995         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
996         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
997
998         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
999         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
1000
1001         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
1002         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
1003
1004         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
1005         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
1006 .L128_enc_tail:                                                         @ TAIL
1007
1008         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
1009         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
1010
1011         mov     $t1.16b, $rk10
1012         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
1013         ext     $h5.16b, $h5.16b, $h5.16b, #8
1014
1015         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
1016         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
1017         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
1018         ext     $h6.16b, $h6.16b, $h6.16b, #8
1019         ext     $h7.16b, $h7.16b, $h7.16b, #8
1020
1021         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k
1022         ext     $h8.16b, $h8.16b, $h8.16b, #8
1023         cmp     $main_end_input_ptr, #112
1024         b.gt    .L128_enc_blocks_more_than_7
1025
1026         mov     $ctr7b, $ctr6b
1027         mov     $ctr6b, $ctr5b
1028         movi    $acc_h.8b, #0
1029
1030         cmp     $main_end_input_ptr, #96
1031         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1032         mov     $ctr5b, $ctr4b
1033
1034         mov     $ctr4b, $ctr3b
1035         mov     $ctr3b, $ctr2b
1036         mov     $ctr2b, $ctr1b
1037
1038         movi    $acc_l.8b, #0
1039         movi    $acc_m.8b, #0
1040         b.gt    .L128_enc_blocks_more_than_6
1041
1042         mov     $ctr7b, $ctr6b
1043         cmp     $main_end_input_ptr, #80
1044
1045         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1046         mov     $ctr6b, $ctr5b
1047         mov     $ctr5b, $ctr4b
1048
1049         mov     $ctr4b, $ctr3b
1050         mov     $ctr3b, $ctr1b
1051         b.gt    .L128_enc_blocks_more_than_5
1052
1053         cmp     $main_end_input_ptr, #64
1054         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1055
1056         mov     $ctr7b, $ctr6b
1057         mov     $ctr6b, $ctr5b
1058
1059         mov     $ctr5b, $ctr4b
1060         mov     $ctr4b, $ctr1b
1061         b.gt    .L128_enc_blocks_more_than_4
1062
1063         mov     $ctr7b, $ctr6b
1064         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1065         mov     $ctr6b, $ctr5b
1066
1067         mov     $ctr5b, $ctr1b
1068         cmp     $main_end_input_ptr, #48
1069         b.gt    .L128_enc_blocks_more_than_3
1070
1071         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1072         mov     $ctr7b, $ctr6b
1073         mov     $ctr6b, $ctr1b
1074
1075         cmp     $main_end_input_ptr, #32
1076         ldr     $h34kq, [$current_tag, #96]                                     @ load h4k | h3k
1077         b.gt    .L128_enc_blocks_more_than_2
1078
1079         cmp     $main_end_input_ptr, #16
1080
1081         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1082         mov     $ctr7b, $ctr1b
1083         b.gt    .L128_enc_blocks_more_than_1
1084
1085         ldr     $h12kq, [$current_tag, #48]                                     @ load h2k | h1k
1086         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1087         b        .L128_enc_blocks_less_than_1
1088 .L128_enc_blocks_more_than_7:                                           @ blocks left >  7
1089         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
1090
1091         rev64   $res0b, $res1b                                          @ GHASH final-7 block
1092         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
1093
1094         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1095
1096         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
1097
1098         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
1099
1100         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
1101
1102         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
1103         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1104
1105         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
1106
1107         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
1108         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
1109 .L128_enc_blocks_more_than_6:                                           @ blocks left >  6
1110
1111         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
1112
1113         rev64   $res0b, $res1b                                          @ GHASH final-6 block
1114         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
1115
1116         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1117
1118         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
1119
1120         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
1121         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
1122
1123         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
1124         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1125
1126         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
1127         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
1128
1129         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
1130
1131         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
1132         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
1133 .L128_enc_blocks_more_than_5:                                           @ blocks left >  5
1134
1135         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
1136
1137         rev64   $res0b, $res1b                                          @ GHASH final-5 block
1138
1139         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1140
1141         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
1142         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
1143         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
1144
1145         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
1146
1147         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
1148
1149         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
1150
1151         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
1152         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
1153         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1154
1155         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
1156         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
1157
1158         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
1159 .L128_enc_blocks_more_than_4:                                           @ blocks left >  4
1160
1161         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
1162
1163         rev64   $res0b, $res1b                                          @ GHASH final-4 block
1164
1165         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
1166
1167         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1168
1169         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
1170         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1171         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
1172
1173         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
1174
1175         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
1176
1177         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
1178         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
1179
1180         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
1181
1182         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
1183         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
1184 .L128_enc_blocks_more_than_3:                                           @ blocks left >  3
1185
1186         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
1187
1188         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1189         ext     $h4.16b, $h4.16b, $h4.16b, #8
1190
1191         rev64   $res0b, $res1b                                          @ GHASH final-3 block
1192
1193         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1194         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1195
1196         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
1197         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1198         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
1199
1200         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
1201
1202         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
1203
1204         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
1205         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
1206
1207         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
1208
1209         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
1210         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
1211
1212         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
1213         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
1214 .L128_enc_blocks_more_than_2:                                           @ blocks left >  2
1215
1216         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
1217
1218         rev64   $res0b, $res1b                                          @ GHASH final-2 block
1219
1220         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1221
1222         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
1223
1224         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
1225         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1226         ext     $h3.16b, $h3.16b, $h3.16b, #8
1227         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1228
1229         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
1230         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
1231
1232         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
1233
1234         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
1235         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
1236
1237         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
1238
1239         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
1240         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
1241 .L128_enc_blocks_more_than_1:                                           @ blocks left >  1
1242
1243         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
1244
1245         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1246         ext     $h2.16b, $h2.16b, $h2.16b, #8
1247         rev64   $res0b, $res1b                                          @ GHASH final-1 block
1248         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
1249
1250         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1251
1252         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1253         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
1254         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
1255
1256         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
1257
1258         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
1259
1260         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1261
1262         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
1263
1264         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
1265         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
1266
1267         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
1268
1269         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
1270         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
1271 .L128_enc_blocks_less_than_1:                                           @ blocks left <= 1
1272
1273         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
1274         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
1275         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
1276
1277         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
1278
1279         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
1280
1281         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
1282         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
1283         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
1284
1285         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
1286         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
1287         cmp     $bit_length, #64
1288
1289         csel    $temp2_x, $temp1_x, $temp0_x, lt
1290         csel    $temp3_x, $temp0_x, xzr, lt
1291
1292         mov     $ctr0.d[1], $temp3_x
1293         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
1294
1295         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
1296
1297         rev64   $res0b, $res1b                                          @ GHASH final block
1298
1299         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
1300         st1     { $res1b}, [$output_ptr]                                @ store all 16B
1301
1302         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1303
1304         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
1305
1306         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
1307         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1308         ext     $h1.16b, $h1.16b, $h1.16b, #8
1309
1310         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
1311
1312         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
1313         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
1314         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
1315
1316         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
1317
1318         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
1319
1320         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
1321
1322         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
1323         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
1324
1325         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
1326
1327         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
1328
1329         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
1330         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
1331
1332         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
1333         ext     $acc_lb, $acc_lb, $acc_lb, #8
1334         rev64   $acc_lb, $acc_lb
1335         st1     { $acc_l.16b }, [$current_tag]
1336         mov     x0, $byte_length
1337
1338         ldp     d10, d11, [sp, #16]
1339         ldp     d12, d13, [sp, #32]
1340         ldp     d14, d15, [sp, #48]
1341         ldp     d8, d9, [sp], #80
1342         ret
1343
1344 .L128_enc_ret:
1345         mov w0, #0x0
1346         ret
1347 .size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1348 ___
1349
1350 #########################################################################################
1351 # size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
1352 #                               size_t len,        /* in bits: the kernel derives byte_length as bit_length >> 3 */
1353 #                               unsigned char *out,
1354 #                               u64 *Xi,
1355 #                               unsigned char ivec[16],
1356 #                               const void *key);
1357 #
1358 $code.=<<___;
1359 .global unroll8_eor3_aes_gcm_dec_128_kernel
1360 .type   unroll8_eor3_aes_gcm_dec_128_kernel,%function
1361 .align  4
1362 unroll8_eor3_aes_gcm_dec_128_kernel:
1363         AARCH64_VALID_CALL_TARGET
1364         cbz     x1, .L128_dec_ret
1365         stp     d8, d9, [sp, #-80]!
1366         lsr     $byte_length, $bit_length, #3
1367         mov     $counter, x4
1368         mov     $cc, x5
1369         stp     d10, d11, [sp, #16]
1370         stp     d12, d13, [sp, #32]
1371         stp     d14, d15, [sp, #48]
1372         mov     x5, #0xc200000000000000
1373         stp     x5, xzr, [sp, #64]
1374         add     $modulo_constant, sp, #64
1375
1376         mov     $main_end_input_ptr, $byte_length
1377         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
1378
1379         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1380         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
1381
1382         mov     $constant_temp, #0x100000000                            @ set up counter increment
1383         movi    $rctr_inc.16b, #0x0
1384         mov     $rctr_inc.d[1], $constant_temp
1385         ld1     { $acc_lb}, [$current_tag]
1386           ext   $acc_lb, $acc_lb, $acc_lb, #8
1387         rev64   $acc_lb, $acc_lb
1388
1389         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
1390
1391         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
1392
1393         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
1394
1395         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
1396         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
1397
1398         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1399
1400         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
1401         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
1402         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
1403
1404         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
1405         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
1406
1407         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
1408         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
1409
1410         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
1411         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
1412
1413         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
1414         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
1415
1416         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
1417
1418         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
1419         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
1420         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
1421
1422         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
1423         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
1424
1425         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
1426
1427         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
1428         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
1429
1430         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
1431
1432         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1433
1434         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
1435         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
1436
1437         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
1438         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
1439
1440         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
1441         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
1442         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
1443
1444         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
1445         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
1446         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
1447
1448         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
1449         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
1450         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
1451
1452         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
1453         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
1454
1455         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1456         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
1457
1458         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
1459         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
1460
1461         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
1462         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
1463
1464         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
1465         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
1466         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
1467
1468         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
1469         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
1470         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
1471
1472         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
1473         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
1474         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
1475
1476         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
1477         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
1478         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
1479
1480         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
1481         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
1482
1483         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
1484         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
1485
1486         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
1487
1488         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
1489         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
1490         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
1491
1492         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
1493         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
1494         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
1495
1496         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
1497         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
1498         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
1499
1500         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
1501         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
1502         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
1503
1504         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
1505         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
1506         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
1507
1508         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
1509         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
1510         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
1511
1512         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1513         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
1514
1515         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
1516         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
1517
1518         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
1519         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
1520         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
1521
1522         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
1523         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
1524         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
1525
1526         aese    $ctr0b, $rk9                                            @ AES block 0 - round 9
1527         aese    $ctr1b, $rk9                                            @ AES block 1 - round 9
1528         aese    $ctr6b, $rk9                                            @ AES block 6 - round 9
1529
1530         ldr     $rk10q, [$cc, #160]                                     @ load rk10
1531         aese    $ctr4b, $rk9                                            @ AES block 4 - round 9
1532         aese    $ctr3b, $rk9                                            @ AES block 3 - round 9
1533
1534         aese    $ctr2b, $rk9                                            @ AES block 2 - round 9
1535         aese    $ctr5b, $rk9                                            @ AES block 5 - round 9
1536         aese    $ctr7b, $rk9                                            @ AES block 7 - round 9
1537
1538         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
1539         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
1540         b.ge    .L128_dec_tail                                          @ handle tail
1541
1542         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
1543
1544         eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 0 - result
1545         eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 1 - result
1546         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
1547
1548         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
1549         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
1550         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
1551
1552         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
1553
1554         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
1555         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
1556         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
1557
1558         eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 3 - result
1559         eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 2 - result
1560         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
1561
1562         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
1563         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
1564
1565         eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 6 - result
1566
1567         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
1568         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
1569
1570         eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 4 - result
1571         eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 5 - result
1572         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
1573
1574         eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 7 - result
1575         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
1576         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
1577
1578         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
1579         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
1580         b.ge    .L128_dec_prepretail                                    @ do prepretail
1581
1582 .L128_dec_main_loop:                                                    @ main loop start
             @ Steady-state body of the AES-128-GCM decrypt kernel, unrolled by 8.
             @ Each pass interleaves three streams (see the diagram at the top of
             @ the file):
             @   GHASH - hashes the eight ciphertext blocks 8k..8k+7 that are
             @           already in res0..res7 (loaded by the previous pass, or by
             @           the code ahead of the loop on the first pass).
             @   AES   - encrypts counter blocks 8k+8..8k+15 held in ctr0..ctr7:
             @           rounds 0-8 are aese+aesmc, round 9 is aese only, and rk10
             @           is applied in the final three-way xor with the ciphertext.
             @   CTR   - derives further counter blocks from rtmp_ctr with
             @           rev32 followed by an add of rctr_inc.
             @ On entry ctr0..ctr4 already hold counter blocks 8k+8..8k+12 and
             @ acc_l carries the hash accumulator; PRE 0 / PRE 1 swap its 64-bit
             @ halves and xor it into block 8k before that block is multiplied.
             @ The hash key powers are reloaded relative to current_tag on every
             @ pass because most of their registers are overwritten below: the low
             @ pmull results land in h1..h7 and in h12k/h34k/h56k/h78k, and h1..h4
             @ are reused late in the pass as scratch for counter blocks
             @ 8k+16..8k+19.
1583         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
1584         ext     $h7.16b, $h7.16b, $h7.16b, #8
1585         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
1586         ext     $h8.16b, $h8.16b, $h8.16b, #8
1587
1588         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
1589         rev64   $res0b, $res0b                                          @ GHASH block 8k
1590         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
1591
1592         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
1593         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
1594         ext     $h5.16b, $h5.16b, $h5.16b, #8
1595         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
1596         ext     $h6.16b, $h6.16b, $h6.16b, #8
1597
1598         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
1599         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
1600         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
1601
1602         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
1603         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
1604         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1605
1606         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
1607         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
1608         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
1609         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
1610
             @ GHASH uses Karatsuba per block: pmull2/pmull give the high and low
             @ products against the matching key power (block 8k with h8 down to
             @ block 8k+7 with h1); trn1/trn2 plus eor pair up two blocks so that
             @ one pmull2/pmull against h78k/h56k/h34k/h12k yields both middle
             @ products. Partial products are accumulated in acc_h, acc_m, acc_l.
1611         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
1612         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
1613         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
1614
1615         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
1616         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
1617         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
1618
1619         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
1620         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
1621         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
1622
1623         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
1624         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
1625         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
1626
1627         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
1628         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
1629         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
1630
1631         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
1632         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
1633         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
1634
1635         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
1636         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
1637         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
1638
1639         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
1640         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
1641         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
1642
1643         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1644         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
1645         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
1646
1647         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
1648         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
1649         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
1650
1651         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1652         ext     $h3.16b, $h3.16b, $h3.16b, #8
1653         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1654         ext     $h4.16b, $h4.16b, $h4.16b, #8
1655         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
1656         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
1657
1658         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
1659         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
1660         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
1661
1662         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
1663         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
1664         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
1665
1666         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
1667         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
1668         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
1669
1670         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
1671         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
1672         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1673         ext     $h1.16b, $h1.16b, $h1.16b, #8
1674         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1675         ext     $h2.16b, $h2.16b, $h2.16b, #8
1676
1677         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
1678         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
1679         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
1680
1681         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
1682         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
1683         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
1684
1685         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
1686         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
1687         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
1688
1689         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
1690         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
1691         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
1692
1693         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1694         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
1695         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
1696
1697         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1698         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1699         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
1700         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
1701
1702         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
1703         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
1704         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
1705
1706         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
1707         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
1708         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
1709
1710         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
1711         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
1712         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
1713
1714         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
1715         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
1716         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
1717
1718         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
1719         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
1720         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
1721
1722         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
1723         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
1724         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
1725
1726         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
1727         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
1728         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
1729
1730         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
1731         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
1732         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
1733
1734         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
1735         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
1736         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
1737
1738         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
1739         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
1740         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
1741
1742         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
1743         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
1744         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
1745
1746         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
1747         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
1748         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
1749
1750         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
1751         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
1752         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
1753
1754         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
1755         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
1756         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
1757
1758         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
1759         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
1760         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
1761
1762         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
1763         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
1764         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
1765
1766         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
1767         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
1768         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
1769
             @ Reduction (the MODULO steps below): the high half is folded into
             @ the middle and then the middle into the low, each time with one
             @ pmull by the constant loaded from modulo_constant plus an ext that
             @ swaps the 64-bit halves; the reduced value ends up in acc_l.
1770         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
1771         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
1772         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
1773
1774         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
1775         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
1776         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
1777
1778         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
1779         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
1780         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
1781
1782         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
1783         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
1784         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
1785
1786         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
1787         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
1788         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
1789
1790         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
1791         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
1792         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
1793
1794         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
1795         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
             @ Every GHASH read of res0..res7 has been issued by this point, so
             @ the registers are refilled with the next eight ciphertext blocks.
             @ They are decrypted below and hashed by the next pass, or by the
             @ code after the loop once the loop exits.
1796         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
1797
1798         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
1799         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
1800         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
1801
1802         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
1803         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
1804         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
1805
1806         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
1807         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
1808         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
1809
1810         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
1811         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
1812         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
1813
1814         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
1815         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
1816         ldr     $rk10q, [$cc, #160]                                     @ load rk10
1817
1818         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
1819         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
1820         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
1821
1822         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
1823         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
1824         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
1825
1826         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
1827         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
1828
1829         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
1830         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
1831         eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 8k+9 - result
1832
1833         eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 8k+8 - result
1834         eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 8k+15 - result
1835         eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 8k+14 - result
1836
1837         eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 8k+10 - result
1838         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
             @ Counter blocks 8k+16..8k+19 were staged in h1..h4 while ctr0..ctr3
             @ were still in use; each one is moved into place only after the
             @ store that consumed the old contents of its destination register.
1839         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
1840
1841         eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 8k+12 - result
1842         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
1843         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
1844
1845         eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 8k+11 - result
             @ The cmp below sets the flags for the b.lt that closes the loop;
             @ only vector operations and stores sit between the two, so the
             @ flags survive. The loop repeats while input_ptr is below
             @ main_end_input_ptr and otherwise falls through.
1846         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
1847         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
1848
1849         eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 8k+13 - result
1850         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
1851
1852         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
1853         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
1854         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
1855
1856         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
1857         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
1858         b.lt    .L128_dec_main_loop
1859
1860 .L128_dec_prepretail:                                                   @ PREPRETAIL
1861         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
1862         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
1863         rev64   $res0b, $res0b                                          @ GHASH block 8k
1864
1865         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
1866         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
1867         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1868
1869         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
1870         ext     $h7.16b, $h7.16b, $h7.16b, #8
1871         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
1872         ext     $h8.16b, $h8.16b, $h8.16b, #8
1873         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
1874         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
1875
1876         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
1877         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
1878         ext     $h5.16b, $h5.16b, $h5.16b, #8
1879         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
1880         ext     $h6.16b, $h6.16b, $h6.16b, #8
1881         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
1882
1883         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
1884
1885         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
1886
1887         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
1888         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
1889         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
1890         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
1891
1892         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
1893         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
1894         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
1895
1896         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
1897         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
1898         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
1899
1900         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
1901         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
1902         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
1903
1904         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
1905         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
1906         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
1907
1908         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
1909         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
1910         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
1911
1912         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
1913         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
1914         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
1915
1916         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
1917         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
1918         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
1919
1920         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
1921         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
1922         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
1923
1924         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
1925         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
1926         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
1927
1928         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
1929         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
1930         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
1931
1932         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
1933         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
1934         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
1935
1936         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1937         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
1938         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
1939
1940         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1941         ext     $h3.16b, $h3.16b, $h3.16b, #8
1942         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1943         ext     $h4.16b, $h4.16b, $h4.16b, #8
1944         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
1945         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
1946
1947         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
1948         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
1949         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
1950
1951         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1952         ext     $h1.16b, $h1.16b, $h1.16b, #8
1953         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1954         ext     $h2.16b, $h2.16b, $h2.16b, #8
1955         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
1956
1957         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
1958         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
1959         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
1960
1961         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
1962         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
1963         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
1964
1965         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
1966         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
1967         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
1968
1969         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
1970         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
1971         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
1972
1973         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1974         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
1975         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
1976
1977         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1978         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1979         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
1980         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
1981
1982         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
1983         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
1984         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
1985
1986         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
1987         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
1988         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
1989
1990         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
1991         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
1992         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
1993
1994         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
1995         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
1996         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
1997
1998         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
1999         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
2000         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
2001
2002         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
2003         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
2004         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
2005
2006         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
2007         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
2008         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
2009
2010         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
2011         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
2012         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
2013
2014         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
2015         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
2016         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
2017
2018         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2019         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
2020         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
2021
2022         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2023         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
2024         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
2025
2026         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
2027         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
2028         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
2029
2030         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
2031         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
2032         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
2033
2034         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
2035         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
2036         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
2037
2038         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
2039         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
2040         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
2041
2042         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
2043         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
2044         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
2045
2046         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
2047         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
2048         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
2049
2050         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
2051         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
2052         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2053
2054         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2055         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
2056         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
2057
2058         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
2059         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
2060         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
2061
2062         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
2063         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
2064         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
2065
2066         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
2067         ldr     $rk10q, [$cc, #160]                                     @ load rk10
2068
2069         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
2070         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
2071
2072         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
2073         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
2074         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
2075
2076         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
2077         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
2078         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
2079
2080         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
2081         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
2082         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
2083
2084         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
2085         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
2086         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
2087
2088         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
2089         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
2090         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
2091
2092         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
2093         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
2094         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
2095
2096 .L128_dec_tail:                                                         @ TAIL
2097
2098         mov     $t1.16b, $rk10                                          @ t1 = rk10, the last round key XORed in by every tail eor3
2099         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
2100
2101         cmp     $main_end_input_ptr, #112                               @ more than 7 blocks (7 x 16B) left? consumed by b.gt below
2102
2103         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k, then h8l | h8h
2104         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap the 64-bit halves of h8
2105         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
2106
2107         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h, then h6k | h5k
2108         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap the 64-bit halves of h5
2109         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
2110
2111         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h, then h7l | h7h
2112         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap the 64-bit halves of h6
2113         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap the 64-bit halves of h7
2114
2115         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
2116         b.gt    .L128_dec_blocks_more_than_7
2117
2118         cmp     $main_end_input_ptr, #96                                @ more than 6 blocks left?
2119         mov     $ctr7b, $ctr6b                                          @ shift keystream up one register so the last block always uses ctr7
2120         movi    $acc_l.8b, #0                                           @ clear GHASH low accumulator (the >7 path writes it itself)
2121
2122         movi    $acc_h.8b, #0                                           @ clear GHASH high accumulator
2123         mov     $ctr6b, $ctr5b
2124         mov     $ctr5b, $ctr4b
2125
2126         mov     $ctr4b, $ctr3b
2127         mov     $ctr3b, $ctr2b
2128         mov     $ctr2b, $ctr1b                                          @ second block's keystream is consumed from ctr2
2129
2130         movi    $acc_m.8b, #0                                           @ clear GHASH mid accumulator
2131         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2132         b.gt    .L128_dec_blocks_more_than_6
2133
2134         cmp     $main_end_input_ptr, #80                                @ more than 5 blocks left?
2135         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2136
2137         mov     $ctr7b, $ctr6b                                          @ shift keystream up one more register
2138         mov     $ctr6b, $ctr5b
2139         mov     $ctr5b, $ctr4b
2140
2141         mov     $ctr4b, $ctr3b
2142         mov     $ctr3b, $ctr1b                                          @ second block's keystream is consumed from ctr3
2143         b.gt    .L128_dec_blocks_more_than_5
2144
2145         cmp     $main_end_input_ptr, #64                                @ more than 4 blocks left?
2146
2147         mov     $ctr7b, $ctr6b                                          @ shift keystream up one more register
2148         mov     $ctr6b, $ctr5b
2149         mov     $ctr5b, $ctr4b
2150
2151         mov     $ctr4b, $ctr1b                                          @ second block's keystream is consumed from ctr4
2152         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2153         b.gt    .L128_dec_blocks_more_than_4
2154
2155         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2156         mov     $ctr7b, $ctr6b                                          @ shift keystream up one more register
2157         mov     $ctr6b, $ctr5b
2158
2159         mov     $ctr5b, $ctr1b                                          @ second block's keystream is consumed from ctr5
2160         cmp     $main_end_input_ptr, #48                                @ more than 3 blocks left?
2161         b.gt    .L128_dec_blocks_more_than_3
2162
2163         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2164         mov     $ctr7b, $ctr6b                                          @ shift keystream up one more register
2165         cmp     $main_end_input_ptr, #32                                @ more than 2 blocks left?
2166
2167         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2168         mov     $ctr6b, $ctr1b                                          @ second block's keystream is consumed from ctr6
2169         b.gt    .L128_dec_blocks_more_than_2
2170
2171         cmp     $main_end_input_ptr, #16                                @ more than 1 block left?
2172
2173         mov     $ctr7b, $ctr1b                                          @ second block's keystream is consumed from ctr7
2174         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back for one unused keystream block
2175         b.gt    .L128_dec_blocks_more_than_1
2176
2177         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ only one block left: step the counter back once more
2178         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2179         b        .L128_dec_blocks_less_than_1
2180 .L128_dec_blocks_more_than_7:                                           @ blocks left >  7
2181         rev64   $res0b, $res1b                                          @ GHASH final-7 block
2182
2183         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2184
2185         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
2186
2187         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
2188         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
2189
2190         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2191         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
2192
2193         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
2194
2195         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
2196         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
2197         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
2198
2199         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
2200 .L128_dec_blocks_more_than_6:                                           @ blocks left >  6
2201
2202         rev64   $res0b, $res1b                                          @ GHASH final-6 block
2203
2204         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2205
2206         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
2207
2208         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
2209
2210         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
2211         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
2212         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2213
2214         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
2215         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
2216         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
2217
2218         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
2219         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
2220
2221         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
2222         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
2223 .L128_dec_blocks_more_than_5:                                           @ blocks left >  5
2224
2225         rev64   $res0b, $res1b                                          @ GHASH final-5 block
2226
2227         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
2228         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
2229
2230         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2231
2232         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
2233
2234         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
2235
2236         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
2237
2238         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
2239         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
2240         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2241
2242         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
2243         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
2244         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
2245
2246         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
2247         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
2248 .L128_dec_blocks_more_than_4:                                           @ blocks left >  4
2249
2250         rev64   $res0b, $res1b                                          @ GHASH final-4 block
2251
2252         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2253         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
2254
2255         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
2256         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2257         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
2258
2259         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
2260
2261         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
2262
2263         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
2264         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
2265
2266         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
2267         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
2268
2269         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
2270
2271         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
2272 .L128_dec_blocks_more_than_3:                                           @ blocks left >  3
2273
2274         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
2275         rev64   $res0b, $res1b                                          @ GHASH final-3 block
2276
2277         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2278
2279         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
2280
2281         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
2282         ext     $h4.16b, $h4.16b, $h4.16b, #8
2283         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2284
2285         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
2286
2287         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
2288
2289         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
2290         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
2291         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
2292
2293         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2294         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
2295         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
2296
2297         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
2298
2299         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
2300         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
2301 .L128_dec_blocks_more_than_2:                                           @ blocks left >  2
2302
2303         rev64   $res0b, $res1b                                          @ GHASH final-2 block
2304
2305         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
2306
2307         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2308         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
2309         ext     $h3.16b, $h3.16b, $h3.16b, #8
2310         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2311
2312         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
2313
2314         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
2315
2316         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
2317
2318         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
2319         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
2320         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
2321
2322         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
2323
2324         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
2325
2326         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
2327         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
2328 .L128_dec_blocks_more_than_1:                                           @ blocks left >  1
2329
2330         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
2331         rev64   $res0b, $res1b                                          @ GHASH final-1 block
2332
2333         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
2334         ext     $h2.16b, $h2.16b, $h2.16b, #8
2335
2336         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2337
2338         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2339
2340         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
2341
2342         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
2343         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
2344
2345         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
2346         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
2347         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2348
2349         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
2350         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
2351
2352         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
2353
2354         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
2355
2356         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
2357
2358         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
2359 .L128_dec_blocks_less_than_1:                                           @ blocks left <= 1
2360
2361         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
2362
2363         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
2364
2365         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
2366
2367         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
2368         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
2369
2370         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
2371         cmp     $bit_length, #64
2372         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
2373
2374         csel    $temp2_x, $temp1_x, $temp0_x, lt
2375         csel    $temp3_x, $temp0_x, xzr, lt
2376
2377         mov     $ctr0.d[1], $temp3_x
2378         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
2379
2380         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
2381         ext     $h1.16b, $h1.16b, $h1.16b, #8
2382         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
2383
2384         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
2385
2386         rev64   $res0b, $res1b                                          @ GHASH final block
2387
2388         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2389
2390         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
2391         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
2392
2393         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
2394         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
2395
2396         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
2397
2398         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
2399         st1     { $res4b}, [$output_ptr]                                @ store all 16B
2400
2401         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
2402
2403         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
2404         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2405
2406         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
2407
2408         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
2409
2410         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2411         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
2412
2413         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
2414
2415         eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
2416
2417         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
2418         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
2419
2420         eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
2421         ext     $acc_lb, $acc_lb, $acc_lb, #8
2422         rev64   $acc_lb, $acc_lb
2423         st1     { $acc_l.16b }, [$current_tag]
2424         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
2425
2426         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
2427
2428         mov     x0, $byte_length
2429
2430         ldp     d10, d11, [sp, #16]
2431         ldp     d12, d13, [sp, #32]
2432         ldp     d14, d15, [sp, #48]
2433         ldp     d8, d9, [sp], #80
2434         ret
2435 .L128_dec_ret:
2436         mov w0, #0x0
2437         ret
2438 .size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2439 ___
2440 }
2441
2442 {
# Register aliases for the 192-bit kernels. Every name below is a plain string
# that gets interpolated into the assembly heredoc that follows. Many names
# share one physical register, so "which alias is live" matters when reading
# or reordering the generated instructions.
#
# General-purpose scratch: x4..x7. x4 and x5 also carry incoming arguments;
# the kernel entry copies them into $counter/$cc before x5 is overwritten and
# before $end_input_ptr/$main_end_input_ptr are written.
2443 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
# Two more general-purpose temporaries: x13 and x14.
2444 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0..v7 hold the eight counter blocks ($ctrN), v8..v15 the eight result
# blocks ($resN). The same registers are named four ways: with a .16b
# arrangement, as a bare vN (so an arrangement can be appended in the
# template), and as dN / qN scalar views.
2445 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
2446 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
2447 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
2448 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
2449 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
2450
# $ctr_tN are the SAME registers as $resN (v8..v15). The encrypt path loads
# input into $ctr_tN and the eor3 that produces $resN overwrites it in place.
2451 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
2452 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
2453 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
2454
# GHASH accumulators (high / mid / low parts): v17, v18, v19.
2455 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
2456 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
2457
# Hash-key registers v20..v25. The h1..h4 group and the h5..h8 group map to
# the same six registers (h1/h5=v20, h12k/h56k=v21, h2/h6=v22, h3/h7=v23,
# h34k/h78k=v24, h4/h8=v25), so only one group's value can be held at a time;
# the main loop below reloads them from [$current_tag + offset] as needed.
2458 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
2459 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
2460 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
2461 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
2462
# Temporaries $t0..$t12. Only v16 and v29 are registers of their own; every
# other $tN borrows a register that already has another name above.
2463 my $t0="v16";
2464 my $t0d="d16";
2465
2466 my $t1="v29";
2467 my $t2=$res1;
2468 my $t3=$t1;
2469
2470 my $t4=$res0;
2471 my $t5=$res2;
2472 my $t6=$t0;
2473
2474 my $t7=$res3;
2475 my $t8=$res4;
2476 my $t9=$res5;
2477
2478 my $t10=$res6;
# NOTE: v21 is also $h12k / $h56k, so writing $t11 clobbers that hash-key slot.
2479 my $t11="v21";
2480 my $t12=$t1;
2481
# v30: running counter, kept byte-reversed (set up with rev32 in the prologue).
# v31: the increment vector added to it for each new counter block.
2482 my $rtmp_ctr="v30";
2483 my $rtmp_ctrq="q30";
2484 my $rctr_inc="v31";
2485 my $rctr_incd="d31";
2486
# The reduction constant shares v16/d16 with $t0.
2487 my $mod_constantd=$t0d;
2488 my $mod_constant=$t0;
2489
# Round keys rk0..rk14 rotate through only three registers: rkN lives in
# v26, v27 or v28 according to N mod 3 (0, 1, 2 respectively). The template
# reloads them from the key schedule as the rounds advance.
2490 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
2491 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
2492 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
2493 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
2494 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
2495 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
2496 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
2497 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
2498 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
2499 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# .1q-arrangement names for v28 (the rk2 slot) and v26 (the rk3 slot), and a
# bare name for v27 (the rk4 slot), for use where .16b does not fit.
2500 my $rk2q1="v28.1q";
2501 my $rk3q1="v26.1q";
2502 my $rk4v="v27";
2503
2504 #########################################################################################
2505 # size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
2506 #                               size_t len,    /* length in bits */
2507 #                               unsigned char *out,
2508 #                               u64 *Xi,
2509 #                               unsigned char ivec[16],
2510 #                               const void *key);
2511 #
2512 $code.=<<___;
2513 .global unroll8_eor3_aes_gcm_enc_192_kernel
2514 .type   unroll8_eor3_aes_gcm_enc_192_kernel,%function
2515 .align  4
2516 unroll8_eor3_aes_gcm_enc_192_kernel:
2517         AARCH64_VALID_CALL_TARGET
2518         cbz     x1, .L192_enc_ret
2519         stp     d8, d9, [sp, #-80]!
2520         lsr     $byte_length, $bit_length, #3
2521         mov     $counter, x4
2522         mov     $cc, x5
2523         stp     d10, d11, [sp, #16]
2524         stp     d12, d13, [sp, #32]
2525         stp     d14, d15, [sp, #48]
2526         mov     x5, #0xc200000000000000
2527         stp     x5, xzr, [sp, #64]
2528         add     $modulo_constant, sp, #64
2529
2530         mov     $main_end_input_ptr, $byte_length
2531         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
2532
2533         mov     $constant_temp, #0x100000000                            @ set up counter increment
2534         movi    $rctr_inc.16b, #0x0
2535         mov     $rctr_inc.d[1], $constant_temp
2536
2537         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
2538
2539         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
2540
2541         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
2542         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
2543
2544         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
2545         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
2546
2547         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
2548         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
2549
2550         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
2551         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
2552         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
2553
2554         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2555
2556         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
2557         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
2558         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
2559
2560         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2561
2562         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
2563         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
2564
2565         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
2566
2567         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
2568         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
2569         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
2570
2571         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
2572         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
2573         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
2574
2575         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
2576         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
2577         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
2578
2579         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
2580         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
2581
2582         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
2583         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
2584         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
2585
2586         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
2587         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
2588         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
2589
2590         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
2591         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
2592         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
2593
2594         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
2595         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
2596
2597         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
2598         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
2599         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
2600
2601         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
2602         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
2603
2604         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
2605         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
2606         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
2607
2608         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
2609
2610         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
2611
2612         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
2613
2614         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
2615         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
2616         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
2617
2618         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
2619         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
2620         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
2621
2622         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
2623         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
2624         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
2625
2626         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
2627         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2628         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
2629
2630         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
2631         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
2632         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
2633
2634         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
2635         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
2636         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
2637
2638         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
2639
2640         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
2641         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
2642         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
2643
2644         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
2645         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
2646         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
2647
2648         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
2649         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
2650         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2651
2652         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
2653         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
2654
2655         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
2656         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
2657
2658         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
2659         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
2660
2661         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
2662         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
2663
2664         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
2665         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
2666
2667         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
2668         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
2669         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
2670
2671         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
2672         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
2673         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
2674
2675         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
2676         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
2677         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
2678
2679         ld1     { $acc_lb}, [$current_tag]
2680         ext     $acc_lb, $acc_lb, $acc_lb, #8
2681         rev64   $acc_lb, $acc_lb
2682         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
2683
2684         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
2685         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
2686
2687         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
2688         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
2689
2690         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
2691         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
2692
2693         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 14 - round 10
2694         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
2695         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 11 - round 10
2696
2697         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 9 - round 10
2698         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 13 - round 10
2699         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 12 - round 10
2700
2701         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8 - round 10
2702         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 10 - round 10
2703         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 15 - round 10
2704
2705         aese    $ctr6b, $rk11                                           @ AES block 14 - round 11
2706         aese    $ctr3b, $rk11                                           @ AES block 11 - round 11
2707
2708         aese    $ctr4b, $rk11                                           @ AES block 12 - round 11
2709         aese    $ctr7b, $rk11                                           @ AES block 15 - round 11
2710         ldr     $rk12q, [$cc, #192]                                     @ load rk12
2711
2712         aese    $ctr1b, $rk11                                           @ AES block 9 - round 11
2713         aese    $ctr5b, $rk11                                           @ AES block 13 - round 11
2714
2715         aese    $ctr2b, $rk11                                           @ AES block 10 - round 11
2716         aese    $ctr0b, $rk11                                           @ AES block 8 - round 11
2717         b.ge    .L192_enc_tail                                          @ handle tail
2718
2719         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
2720
2721         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
2722
2723         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
2724
2725         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
2726
2727         eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 0 - result
2728         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
2729         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
2730
2731         eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 3 - result
2732         eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 1 - result
2733
2734         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
2735         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
2736         eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 4 - result
2737
2738         eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 5 - result
2739         eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 7 - result
2740         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
2741
2742         eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 2 - result
2743         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
2744         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
2745
2746         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
2747         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
2748
2749         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
2750         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
2751         eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 6 - result
2752
2753         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
2754
2755         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
2756         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
2757         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
2758
2759         b.ge    .L192_enc_prepretail                                    @ do prepretail
2760
2761 .L192_enc_main_loop:                                                    @ main loop start
2762         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
2763         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
2764         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
2765
2766         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
2767         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
2768         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
2769         ext     $h7.16b, $h7.16b, $h7.16b, #8
2770         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
2771         ext     $h8.16b, $h8.16b, $h8.16b, #8
2772
2773         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
2774         rev64   $res0b, $res0b                                          @ GHASH block 8k
2775         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
2776         ext     $h5.16b, $h5.16b, $h5.16b, #8
2777         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
2778         ext     $h6.16b, $h6.16b, $h6.16b, #8
2779
2780         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
2781         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
2782         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
2783
2784         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
2785         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
2786         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2787
2788         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
2789         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
2790         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
2791
2792         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
2793         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
2794         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
2795
2796         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
2797         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
2798         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
2799
2800         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
2801         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
2802         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
2803
2804         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
2805         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
2806         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
2807
2808         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
2809         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
2810         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
2811         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
2812
2813         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
2814         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
2815         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
2816
2817         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
2818         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
2819         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
2820
2821         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
2822         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
2823         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
2824
2825         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
2826         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
2827         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
2828
2829         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
2830         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
2831         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
2832
2833         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
2834         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
2835         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
2836
2837         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
2838         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
2839         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
2840
2841         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
2842         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
2843         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
2844
2845         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
2846         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
2847         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
2848
2849         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
2850         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
2851         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
2852         ext     $h3.16b, $h3.16b, $h3.16b, #8
2853         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
2854         ext     $h4.16b, $h4.16b, $h4.16b, #8
2855
2856         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
2857         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
2858         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
2859
2860         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
2861         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
2862         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
2863
2864         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
2865         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
2866         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
2867
2868         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
2869         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
2870         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
2871
2872         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
2873         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
2874         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
2875
2876         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
2877         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
2878         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
2879
2880         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
2881         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
2882         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
2883
2884         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
2885         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
2886         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
2887         ext     $h1.16b, $h1.16b, $h1.16b, #8
2888         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
2889         ext     $h2.16b, $h2.16b, $h2.16b, #8
2890
2891         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2892         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
2893         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2894
2895         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
2896         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
2897         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
2898
2899         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
2900         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
2901
2902         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
2903         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2904         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2905
2906         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
2907         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
2908         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
2909
2910         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
2911         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
2912         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
2913
2914         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
2915         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
2916         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
2917
2918         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
2919         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
2920         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
2921
2922         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
2923         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
2924         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
2925
2926         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
2927         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
2928
2929         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
2930         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
2931         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
2932
2933         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
2934         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
2935
2936         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
2937         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2938         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
2939
2940         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
2941         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
2942         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
2943
2944         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
2945         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
2946         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
2947
2948         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2949         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
2950         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
2951
2952         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
2953         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
2954         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
2955
2956         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
2957         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
2958         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
2959
2960         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
2961         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
2962         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
2963
2964         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
2965         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
2966         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
2967
2968         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
2969         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
2970         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
2971
2972         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
2973         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
2974         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
2975
2976         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
2977         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
2978         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
2979
2980         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
2981         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
2982         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
2983
2984         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2985         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
2986         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
2987
2988         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
2989         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
2990         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
2991
2992         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
2993         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
2994         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
2995
2996         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
2997         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
2998         ldr     $rk12q, [$cc, #192]                                     @ load rk12
2999         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
3000
3001         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
3002         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
3003         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
3004
3005         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
3006         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3007         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
3008
3009         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
3010         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
3011         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
3012
3013         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
3014         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
3015
3016         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
3017         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3018
3019         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
3020         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
3021         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
3022
3023         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
3024         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
3025         eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 4 - result
3026
3027         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
3028         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
3029         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
3030
3031         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
3032         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
3033         eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 7 - result
3034
3035         eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 8k+10 - result
3036         eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 8k+8 - result
3037         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
3038
3039         eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 8k+9 - result
3040         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
3041         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
3042         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3043
3044         eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 6 - result
3045         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
3046         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
3047
3048         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
3049         eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 5 - result
3050         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3051
3052         eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 8k+11 - result
3053         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
3054
3055         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
3056
3057         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
3058
3059         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
3060         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
3061         b.lt    .L192_enc_main_loop
3062
3063 .L192_enc_prepretail:                                                   @ PREPRETAIL
3064         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
3065         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3066         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
3067
3068         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
3069         ext     $h7.16b, $h7.16b, $h7.16b, #8
3070         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
3071         ext     $h8.16b, $h8.16b, $h8.16b, #8
3072         rev64   $res0b, $res0b                                          @ GHASH block 8k
3073         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
3074
3075         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
3076         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
3077         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
3078         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
3079
3080         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
3081         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
3082         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
3083         ext     $h5.16b, $h5.16b, $h5.16b, #8
3084         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
3085         ext     $h6.16b, $h6.16b, $h6.16b, #8
3086
3087         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
3088         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
3089         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
3090
3091         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
3092         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
3093         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
3094
3095         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
3096         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
3097         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
3098
3099         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
3100         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
3101         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
3102
3103         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
3104         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
3105         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
3106
3107         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
3108         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
3109         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3110
3111         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
3112         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
3113         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
3114
3115         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
3116         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
3117         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
3118
3119         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
3120         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
3121         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
3122
3123         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
3124         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
3125         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
3126
3127         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
3128         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
3129         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
3130
3131         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
3132         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
3133         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
3134
3135         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
3136         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
3137         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
3138
3139         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
3140         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
3141         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
3142
3143         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
3144         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3145         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
3146
3147         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
3148         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
3149         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
3150
3151         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
3152         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
3153         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
3154
3155         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
3156         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
3157         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
3158
3159         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
3160         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
3161         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
3162
3163         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
3164         ext     $h3.16b, $h3.16b, $h3.16b, #8
3165         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
3166         ext     $h4.16b, $h4.16b, $h4.16b, #8
3167         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
3168         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
3169
3170         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
3171         ext     $h1.16b, $h1.16b, $h1.16b, #8
3172         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
3173         ext     $h2.16b, $h2.16b, $h2.16b, #8
3174         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
3175         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
3176
3177         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
3178         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
3179         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
3180
3181         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
3182         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
3183         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
3184
3185         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
3186         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
3187         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
3188
3189         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
3190         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
3191         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
3192
3193         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
3194         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3195         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3196         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3197
3198         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
3199         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
3200         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
3201
3202         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
3203         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
3204         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
3205
3206         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
3207         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
3208
3209         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
3210         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
3211         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
3212
3213         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
3214         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
3215         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
3216
3217         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
3218         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
3219         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
3220
3221         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
3222         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
3223         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
3224
3225         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
3226         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
3227
3228         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
3229         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
3230         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
3231
3232         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
3233         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
3234         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
3235
3236         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
3237         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
3238         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
3239
3240         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
3241         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
3242         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
3243
3244         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
3245         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
3246         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
3247
3248         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
3249         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
3250         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
3251
3252         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
3253         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
3254         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
3255
3256         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
3257         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
3258
3259         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
3260         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
3261
3262         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
3263         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
3264         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
3265         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
3266
3267         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
3268         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
3269
3270         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
3271         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
3272         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3273
3274         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
3275         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
3276         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
3277
3278         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
3279         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
3280         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
3281
3282         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
3283         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
3284         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
3285
3286         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
3287         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
3288
3289         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3290         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
3291         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
3292         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
3293
3294         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3295         ldr     $rk12q, [$cc, #192]                                     @ load rk12
3296
3297         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
3298         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
3299         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
3300
3301         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3302         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
3303         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
3304
3305         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
3306         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
3307
3308         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
3309         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
3310
3311         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
3312         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
3313
3314         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
3315         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
3316         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
3317
3318         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
3319         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
3320         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
3321
3322 .L192_enc_tail:                                                         @ TAIL
3323
3324         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h and h56k
3325         ext     $h5.16b, $h5.16b, $h5.16b, #8
3326         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
3327
3328         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
3329
3330         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k and h8l | h8h
3331         ext     $h8.16b, $h8.16b, $h8.16b, #8
3332
3333         mov     $t1.16b, $rk12
3334
3335         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h and h7l | h7h
3336         ext     $h6.16b, $h6.16b, $h6.16b, #8
3337         ext     $h7.16b, $h7.16b, $h7.16b, #8
3338         cmp     $main_end_input_ptr, #112
3339
3340         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
3341         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
3342         b.gt    .L192_enc_blocks_more_than_7
3343
3344         cmp     $main_end_input_ptr, #96
3345         mov     $ctr7b, $ctr6b
3346         movi    $acc_h.8b, #0
3347
3348         mov     $ctr6b, $ctr5b
3349         movi    $acc_l.8b, #0
3350         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3351
3352         mov     $ctr5b, $ctr4b
3353         mov     $ctr4b, $ctr3b
3354         mov     $ctr3b, $ctr2b
3355
3356         mov     $ctr2b, $ctr1b
3357         movi    $acc_m.8b, #0
3358         b.gt    .L192_enc_blocks_more_than_6
3359
3360         mov     $ctr7b, $ctr6b
3361         cmp     $main_end_input_ptr, #80
3362
3363         mov     $ctr6b, $ctr5b
3364         mov     $ctr5b, $ctr4b
3365         mov     $ctr4b, $ctr3b
3366
3367         mov     $ctr3b, $ctr1b
3368         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369         b.gt    .L192_enc_blocks_more_than_5
3370
3371         cmp     $main_end_input_ptr, #64
3372         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3373
3374         mov     $ctr7b, $ctr6b
3375         mov     $ctr6b, $ctr5b
3376         mov     $ctr5b, $ctr4b
3377
3378         mov     $ctr4b, $ctr1b
3379         b.gt    .L192_enc_blocks_more_than_4
3380
3381         mov     $ctr7b, $ctr6b
3382         mov     $ctr6b, $ctr5b
3383         mov     $ctr5b, $ctr1b
3384
3385         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3386         cmp     $main_end_input_ptr, #48
3387         b.gt    .L192_enc_blocks_more_than_3
3388
3389         mov     $ctr7b, $ctr6b
3390         mov     $ctr6b, $ctr1b
3391         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3392
3393         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3394         cmp     $main_end_input_ptr, #32
3395         b.gt    .L192_enc_blocks_more_than_2
3396
3397         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3398
3399         cmp     $main_end_input_ptr, #16
3400         mov     $ctr7b, $ctr1b
3401         b.gt    .L192_enc_blocks_more_than_1
3402
3403         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3404         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3405         b        .L192_enc_blocks_less_than_1
3406 .L192_enc_blocks_more_than_7:                                           @ blocks left >  7
3407         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
3408
3409         rev64   $res0b, $res1b                                          @ GHASH final-7 block
3410         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
3411
3412         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3413
3414         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
3415
3416         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
3417
3418         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
3419         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3420         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
3421
3422         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
3423
3424         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
3425         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
3426 .L192_enc_blocks_more_than_6:                                           @ blocks left >  6
3427
3428         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
3429
3430         rev64   $res0b, $res1b                                          @ GHASH final-6 block
3431
3432         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
3433
3434         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3435
3436         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
3437
3438         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
3439         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
3440
3441         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3442         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
3443         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
3444
3445         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
3446
3447         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
3448         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
3449
3450         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
3451 .L192_enc_blocks_more_than_5:                                           @ blocks left >  5
3452
3453         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
3454
3455         rev64   $res0b, $res1b                                          @ GHASH final-5 block
3456
3457         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3458
3459         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
3460
3461         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
3462         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
3463
3464         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
3465         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
3466
3467         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
3468         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
3469
3470         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
3471         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
3472
3473         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
3474         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3475
3476         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
3477 .L192_enc_blocks_more_than_4:                                           @ blocks left >  4
3478
3479         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
3480
3481         rev64   $res0b, $res1b                                          @ GHASH final-4 block
3482
3483         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3484
3485         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
3486         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
3487         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
3488
3489         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
3490         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
3491
3492         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
3493
3494         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3495         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
3496
3497         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
3498
3499         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
3500         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
3501 .L192_enc_blocks_more_than_3:                                           @ blocks left >  3
3502
3503         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3504         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
3505
3506         rev64   $res0b, $res1b                                          @ GHASH final-3 block
3507
3508         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3509         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3510
3511         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
3512         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
3513         ext     $h4.16b, $h4.16b, $h4.16b, #8
3514
3515         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
3516
3517         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
3518         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
3519
3520         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
3521         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
3522
3523         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
3524         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
3525
3526         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
3527
3528         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
3529         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
3530 .L192_enc_blocks_more_than_2:                                           @ blocks left >  2
3531
3532         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
3533
3534         rev64   $res0b, $res1b                                          @ GHASH final-2 block
3535         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
3536         ext     $h3.16b, $h3.16b, $h3.16b, #8
3537
3538         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3539
3540         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
3541         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
3542
3543         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
3544
3545         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
3546         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
3547         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3548
3549         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
3550
3551         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
3552         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
3553
3554         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
3555         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
3556 .L192_enc_blocks_more_than_1:                                           @ blocks left >  1
3557
3558         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
3559         ext     $h2.16b, $h2.16b, $h2.16b, #8
3560         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
3561
3562         rev64   $res0b, $res1b                                          @ GHASH final-1 block
3563
3564         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3565
3566         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
3567         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
3568
3569         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
3570         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
3571         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
3572
3573         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
3574         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3575
3576         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
3577
3578         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
3579         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
3580
3581         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3582
3583         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
3584         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
3585 .L192_enc_blocks_less_than_1:                                           @ blocks left <= 1
3586
3587         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
3588         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
3589
3590         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
3591
3592         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
3593
3594         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
3595
3596         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
3597         cmp     $bit_length, #64
3598         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
3599
3600         csel    $temp2_x, $temp1_x, $temp0_x, lt
3601         csel    $temp3_x, $temp0_x, xzr, lt
3602
3603         mov     $ctr0.d[1], $temp3_x
3604         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
3605         ext     $h1.16b, $h1.16b, $h1.16b, #8
3606
3607         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
3608         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
3609
3610         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
3611
3612         rev64   $res0b, $res1b                                          @ GHASH final block
3613         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
3614
3615         st1     { $res1b}, [$output_ptr]                                @ store all 16B
3616
3617         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3618
3619         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
3620         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
3621
3622         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
3623         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
3624
3625         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
3626
3627         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
3628
3629         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
3630         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
3631
3632         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
3633         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
3634
3635         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
3636
3637         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
3638         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
3639
3640         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
3641
3642         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3643
3644         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3645         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3646
3647         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3648                 ext     $acc_lb, $acc_lb, $acc_lb, #8
3649         rev64   $acc_lb, $acc_lb
3650         st1     { $acc_l.16b }, [$current_tag]
3651
3652         mov     x0, $byte_length                                        @ return sizes
3653
3654         ldp     d10, d11, [sp, #16]
3655         ldp     d12, d13, [sp, #32]
3656         ldp     d14, d15, [sp, #48]
3657         ldp     d8, d9, [sp], #80
3658         ret
3659
3660 .L192_enc_ret:
3661         mov w0, #0x0
3662         ret
3663 .size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3664 ___
3665
3666 #########################################################################################
3667 # size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
3668 #                               size_t len,             /* length in bits */
3669 #                               unsigned char *out,
3670 #                               u64 *Xi,
3671 #                               unsigned char ivec[16],
3672 #                               const void *key);
3673 # (The code below takes ivec from x4 and the round-key schedule from x5.)
3674 $code.=<<___;
3675 .global unroll8_eor3_aes_gcm_dec_192_kernel
3676 .type   unroll8_eor3_aes_gcm_dec_192_kernel,%function
3677 .align  4
3678 unroll8_eor3_aes_gcm_dec_192_kernel:
3679         AARCH64_VALID_CALL_TARGET
3680         cbz     x1, .L192_dec_ret
3681         stp     d8, d9, [sp, #-80]!
3682         lsr     $byte_length, $bit_length, #3
3683         mov     $counter, x4
3684         mov     $cc, x5
3685         stp     d10, d11, [sp, #16]
3686         stp     d12, d13, [sp, #32]
3687         stp     d14, d15, [sp, #48]
3688         mov     x5, #0xc200000000000000
3689         stp     x5, xzr, [sp, #64]
3690         add     $modulo_constant, sp, #64
3691
3692         mov     $main_end_input_ptr, $byte_length
3693         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
3694         ld1     { $acc_lb}, [$current_tag]
3695
3696                 mov     $constant_temp, #0x100000000                    @ set up counter increment
3697         movi    $rctr_inc.16b, #0x0
3698         mov     $rctr_inc.d[1], $constant_temp
3699
3700         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
3701
3702         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
3703
3704         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
3705         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
3706
3707         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
3708         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
3709
3710         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
3711         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
3712
3713         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
3714         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
3715
3716         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
3717         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
3718         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3719
3720         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
3721         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
3722
3723         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
3724
3725         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
3726         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
3727         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
3728
3729         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
3730         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
3731         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
3732
3733         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
3734         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
3735         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3736
3737         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
3738
3739         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
3740
3741         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
3742         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
3743         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
3744
3745         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
3746         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
3747
3748         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
3749         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
3750         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
3751
3752         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
3753         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
3754         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
3755
3756         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
3757         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
3758         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
3759
3760         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
3761
3762         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
3763         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
3764         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
3765
3766         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
3767         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
3768
3769         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
3770         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
3771         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
3772
3773         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
3774         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
3775         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
3776
3777         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
3778         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
3779         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
3780
3781         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
3782         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
3783         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
3784
3785         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
3786         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
3787
3788         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
3789         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
3790         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
3791
3792         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
3793         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
3794         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
3795
3796         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
3797
3798         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
3799         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
3800         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
3801
3802         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
3803         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
3804         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
3805
3806         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
3807         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
3808         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
3809
3810         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
3811
3812         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
3813         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
3814
3815         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
3816         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
3817         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
3818
3819         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
3820         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
3821         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
3822
3823         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
3824         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
3825         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3826
3827         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
3828         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
3829         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
3830
3831         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
3832         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
3833         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
3834
3835         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
3836         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
3837
3838         ld1     { $acc_lb}, [$current_tag]
3839         ext     $acc_lb, $acc_lb, $acc_lb, #8
3840         rev64   $acc_lb, $acc_lb
3841
3842         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
3843
3844         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
3845         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3846
3847         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
3848         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
3849         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
3850
3851         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
3852         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
3853
3854         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
3855         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
3856
3857         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
3858         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
3859         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
3860
3861         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
3862         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
3863         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
3864
3865         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
3866         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
3867         ldr     $rk12q, [$cc, #192]                                     @ load rk12
3868
3869         aese    $ctr0b, $rk11                                           @ AES block 0 - round 11
3870         aese    $ctr1b, $rk11                                           @ AES block 1 - round 11
3871         aese    $ctr4b, $rk11                                           @ AES block 4 - round 11
3872
3873         aese    $ctr6b, $rk11                                           @ AES block 6 - round 11
3874         aese    $ctr5b, $rk11                                           @ AES block 5 - round 11
3875         aese    $ctr7b, $rk11                                           @ AES block 7 - round 11
3876
3877         aese    $ctr2b, $rk11                                           @ AES block 2 - round 11
3878         aese    $ctr3b, $rk11                                           @ AES block 3 - round 11
3879         b.ge    .L192_dec_tail                                          @ handle tail
3880
3881         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
3882
3883         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
3884
3885         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
3886
3887         eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 1 - result
3888         eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 0 - result
3889         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
3890
3891         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
3892         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
3893
3894         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
3895         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
3896         eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 3 - result
3897
3898         eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 2 - result
3899         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
3900         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
3901
3902         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
3903         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
3904
3905         eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 4 - result
3906
3907         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
3908         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
3909
3910         eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 5 - result
3911         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
3912         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
3913
3914         eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 6 - result
3915         eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 7 - result
3916         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
3917
3918         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
3919         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
3920         b.ge    .L192_dec_prepretail                                    @ do prepretail
3921
3922 .L192_dec_main_loop:                                                    @ main loop start
3923         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
3924         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3925         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
3926
3927         rev64   $res0b, $res0b                                          @ GHASH block 8k
3928         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
3929         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
3930
3931         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
3932         ext     $h7.16b, $h7.16b, $h7.16b, #8
3933         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
3934         ext     $h8.16b, $h8.16b, $h8.16b, #8
3935         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
3936         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
3937
3938         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
3939         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
3940         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
3941
3942         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
3943
3944         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
3945         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
3946         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
3947
3948         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
3949         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
3950         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
3951
3952         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
3953         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
3954         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
3955
3956         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
3957         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
3958         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3959
3960         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
3961         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
3962         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
3963         ext     $h5.16b, $h5.16b, $h5.16b, #8
3964         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
3965         ext     $h6.16b, $h6.16b, $h6.16b, #8
3966
3967         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
3968         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
3969         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
3970
3971         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
3972         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
3973         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
3974
3975         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
3976         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
3977         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
3978
3979         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
3980         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
3981         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
3982         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
3983
3984         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
3985         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
3986         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
3987
3988         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
3989         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
3990         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
3991
3992         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
3993         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
3994         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
3995
3996         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
3997         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
3998         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
3999
4000         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
4001         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
4002         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
4003
4004         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4005         ext     $h3.16b, $h3.16b, $h3.16b, #8
4006         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4007         ext     $h4.16b, $h4.16b, $h4.16b, #8
4008         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
4009         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
4010
4011         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
4012         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
4013         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
4014
4015         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
4016         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
4017
4018         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
4019         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
4020         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4021
4022         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
4023         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
4024         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
4025
4026         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
4027         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
4028
4029         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
4030         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
4031         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
4032
4033         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
4034         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
4035         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
4036
4037         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
4038         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
4039         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
4040
4041         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
4042         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
4043         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
4044
4045         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
4046         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
4047         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
4048
4049         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4050         ext     $h1.16b, $h1.16b, $h1.16b, #8
4051         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4052         ext     $h2.16b, $h2.16b, $h2.16b, #8
4053         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
4054         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
4055
4056         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
4057         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
4058         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
4059
4060         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
4061         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
4062         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
4063
4064         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
4065         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
4066         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
4067
4068         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
4069         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
4070         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
4071
4072         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4073         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4074         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
4075         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
4076
4077         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
4078         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
4079         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
4080
4081         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
4082         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
4083         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
4084
4085         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
4086         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
4087         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
4088
4089         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
4090         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
4091         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
4092
4093         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
4094         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
4095         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
4096
4097         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
4098         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
4099         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
4100
4101         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
4102         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
4103         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
4104
4105         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
4106         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
4107         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
4108
4109         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
4110         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
4111         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
4112
4113         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
4114         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
4115         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
4116
4117         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
4118         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4119         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
4120
4121         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
4122         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
4123         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
4124
4125         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
4126         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
4127         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
4128
4129         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
4130         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
4131         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
4132
4133         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
4134         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
4135         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
4136
4137         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
4138         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
4139         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
4140
4141         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
4142         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
4143         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
4144
4145         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
4146         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
4147
4148         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
4149         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
4150         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
4151
4152         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
4153         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4154         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
4155
4156         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
4157         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
4158         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
4159
4160         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
4161         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
4162         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
4163
4164         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
4165         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
4166         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
4167
4168         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
4169         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
4170         ldr     $rk12q, [$cc, #192]                                     @ load rk12
4171
4172         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
4173         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
4174         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
4175
4176         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
4177         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
4178         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
4179
4180         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
4181         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
4182         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
4183
4184         eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 8k+8 - result
4185         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
4186         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
4187
4188         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
4189         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
4190         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
4191
4192         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
4193         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
4194         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4195
4196         eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 8k+9 - result
4197         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
4198         eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 8k+11 - result
4199
4200         eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 8k+10 - result
4201         eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 8k+15 - result
4202         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
4203
4204         eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 8k+13 - result
4205         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
4206         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
4207
4208         eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 8k+12 - result
4209         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
4210         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
4211
4212         eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 8k+14 - result
4213         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
4214         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
4215
4216         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
4217         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
4218
4219         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
4220         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
4221         b.lt    .L192_dec_main_loop
4222
4223 .L192_dec_prepretail:                                                   @ PREPRETAIL
4224         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
4225         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
4226         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
4227
4228         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
4229         ext     $h7.16b, $h7.16b, $h7.16b, #8
4230         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
4231         ext     $h8.16b, $h8.16b, $h8.16b, #8
4232         rev64   $res0b, $res0b                                          @ GHASH block 8k
4233         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
4234
4235         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
4236         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
4237         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
4238
4239         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
4240         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
4241         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
4242
4243         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
4244         ext     $h5.16b, $h5.16b, $h5.16b, #8
4245         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
4246         ext     $h6.16b, $h6.16b, $h6.16b, #8
4247         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
4248
4249         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
4250         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
4251         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
4252
4253         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
4254         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
4255         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
4256
4257         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
4258         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
4259         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
4260
4261         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
4262         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
4263         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
4264
4265         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
4266         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
4267         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
4268
4269         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
4270         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
4271         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
4272
4273         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
4274         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
4275         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
4276
4277         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
4278         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
4279         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
4280
4281         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
4282         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
4283         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
4284
4285         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
4286         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
4287         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
4288         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
4289
4290         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
4291         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
4292         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
4293
4294         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
4295         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
4296         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
4297
4298         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
4299         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
4300         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
4301
4302         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
4303         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
4304         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
4305
4306         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
4307         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
4308         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
4309
4310         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
4311         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
4312         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
4313
4314         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
4315         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
4316         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
4317
4318         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
4319         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4320         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
4321
4322         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4323         ext     $h3.16b, $h3.16b, $h3.16b, #8
4324         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4325         ext     $h4.16b, $h4.16b, $h4.16b, #8
4326         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
4327         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
4328
4329         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4330         ext     $h1.16b, $h1.16b, $h1.16b, #8
4331         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4332         ext     $h2.16b, $h2.16b, $h2.16b, #8
4333         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
4334         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
4335
4336         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
4337
4338         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
4339         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
4340
4341         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
4342         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
4343         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
4344
4345         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
4346         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
4347         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
4348
4349         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
4350         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
4351         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
4352
4353         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
4354         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4355         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4356         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
4357
4358         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
4359         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
4360         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
4361
4362         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
4363         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
4364         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
4365
4366         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
4367         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
4368         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
4369
4370         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
4371
4372         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
4373         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
4374         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
4375
4376         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
4377         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
4378         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
4379
4380         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
4381         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
4382         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
4383
4384         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
4385         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
4386
4387         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
4388         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
4389         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
4390
4391         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
4392         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
4393         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
4394
4395         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
4396         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
4397         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
4398
4399         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
4400         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
4401         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
4402
4403         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
4404         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
4405         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
4406
4407         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
4408         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
4409         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
4410
4411         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4412         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
4413         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
4414
4415         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
4416         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
4417         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
4418
4419         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
4420         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
4421         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
4422
4423         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
4424         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
4425
4426         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
4427         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
4428         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
4429
4430         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
4431         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
4432         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
4433
4434         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
4435         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4436         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
4437
4438         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
4439         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
4440         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
4441
4442         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
4443         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
4444         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
4445
4446         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
4447         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
4448         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
4449
4450         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
4451         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
4452         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
4453
4454         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4455         ldr     $rk12q, [$cc, #192]                                     @ load rk12
4456         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
4457
4458         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
4459         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
4460         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
4461
4462         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
4463         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
4464         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
4465
4466         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
4467         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
4468         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
4469
4470         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
4471         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
4472         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
4473
4474         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
4475         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
4476         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
4477
4478         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
4479         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
4480         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
4481
4482 .L192_dec_tail:                                                         @ TAIL
4483
4484         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
4485
4486         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h and h6k | h5k
4487         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap the 64b halves of h5
4488         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
4489
4490         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k and h8l | h8h
4491         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap the 64b halves of h8
4492
4493         mov     $t1.16b, $rk12                                          @ t1 = rk12, xored into every tail block result below
4494
4495         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h and h7l | h7h
4496         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap the 64b halves of h6
4497         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap the 64b halves of h7
4498         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
4499
4500         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
4501         cmp     $main_end_input_ptr, #112                               @ more than 7 blocks (112 bytes) left?
4502         b.gt    .L192_dec_blocks_more_than_7
4503
4504         mov     $ctr7b, $ctr6b                                          @ at most 7 blocks: shift keystream blocks up one register
4505         movi    $acc_h.8b, #0                                           @ clear acc_h (no final-7 block to seed it)
4506         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ step the counter back by one increment
4507
4508         mov     $ctr6b, $ctr5b
4509         mov     $ctr5b, $ctr4b
4510         mov     $ctr4b, $ctr3b
4511
4512         cmp     $main_end_input_ptr, #96                                @ more than 6 blocks (96 bytes) left?
4513         movi    $acc_l.8b, #0                                           @ clear acc_l
4514         mov     $ctr3b, $ctr2b
4515
4516         mov     $ctr2b, $ctr1b
4517         movi    $acc_m.8b, #0                                           @ clear acc_m
4518         b.gt    .L192_dec_blocks_more_than_6
4519
4520         mov     $ctr7b, $ctr6b                                          @ at most 6 blocks: shift keystream blocks up again
4521         mov     $ctr6b, $ctr5b
4522         mov     $ctr5b, $ctr4b
4523
4524         mov     $ctr4b, $ctr3b
4525         mov     $ctr3b, $ctr1b
4526
4527         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4528         cmp     $main_end_input_ptr, #80                                @ more than 5 blocks (80 bytes) left?
4529         b.gt    .L192_dec_blocks_more_than_5
4530
4531         mov     $ctr7b, $ctr6b                                          @ at most 5 blocks: shift keystream blocks up again
4532         mov     $ctr6b, $ctr5b
4533
4534         mov     $ctr5b, $ctr4b
4535         mov     $ctr4b, $ctr1b
4536         cmp     $main_end_input_ptr, #64                                @ more than 4 blocks (64 bytes) left?
4537
4538         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4539         b.gt    .L192_dec_blocks_more_than_4
4540
4541         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4542         mov     $ctr7b, $ctr6b                                          @ at most 4 blocks: shift keystream blocks up again
4543         mov     $ctr6b, $ctr5b
4544
4545         mov     $ctr5b, $ctr1b
4546         cmp     $main_end_input_ptr, #48                                @ more than 3 blocks (48 bytes) left?
4547         b.gt    .L192_dec_blocks_more_than_3
4548
4549         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4550         mov     $ctr7b, $ctr6b                                          @ at most 3 blocks: shift keystream blocks up again
4551         cmp     $main_end_input_ptr, #32                                @ more than 2 blocks (32 bytes) left?
4552
4553         mov     $ctr6b, $ctr1b
4554         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4555         b.gt    .L192_dec_blocks_more_than_2
4556
4557         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4558
4559         mov     $ctr7b, $ctr1b                                          @ at most 2 blocks: second keystream block goes to ctr7
4560         cmp     $main_end_input_ptr, #16                                @ more than 1 block (16 bytes) left?
4561         b.gt    .L192_dec_blocks_more_than_1
4562
4563         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4564         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4565         b        .L192_dec_blocks_less_than_1
4566 .L192_dec_blocks_more_than_7:                                           @ blocks left >  7
4567         rev64   $res0b, $res1b                                          @ GHASH final-7 block
4568
4569         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid (upper half of h78k copied into acc_m)
4570         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4571
4572         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
4573         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
4574         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
4575
4576         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
4577
4578         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
4579         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
4580
4581         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
4582
4583         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid (multiply by the h78k half held in acc_m)
4584         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4585 .L192_dec_blocks_more_than_6:                                           @ blocks left >  6
4586
4587         rev64   $res0b, $res1b                                          @ GHASH final-6 block
4588
4589         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4590
4591         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
4592         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
4593
4594         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
4595         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4596         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
4597
4598         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
4599         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
4600
4601         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
4602         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid (low half of h78k)
4603         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
4604
4605         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
4606         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
4607 .L192_dec_blocks_more_than_5:                                           @ blocks left >  5
4608
4609         rev64   $res0b, $res1b                                          @ GHASH final-5 block
4610
4611         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4612
4613         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
4614
4615         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
4616
4617         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid (copy to upper half for pmull2)
4618         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
4619
4620         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
4621
4622         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
4623         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
4624
4625         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid (upper half of h56k)
4626
4627         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
4628         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4629         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
4630
4631         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
4632         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
4633 .L192_dec_blocks_more_than_4:                                           @ blocks left >  4
4634
4635         rev64   $res0b, $res1b                                          @ GHASH final-4 block
4636
4637         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4638         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4639
4640         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
4641         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
4642         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
4643
4644         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
4645
4646         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
4647
4648         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid (low half of h56k)
4649         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
4650         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
4651
4652         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
4653
4654         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
4655         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
4656 .L192_dec_blocks_more_than_3:                                           @ blocks left >  3
4657
4658         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4659         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap the 64b halves of h4
4660         rev64   $res0b, $res1b                                          @ GHASH final-3 block
4661         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
4662
4663         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4664
4665         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
4666         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
4667
4668         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
4669         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4670         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
4671
4672         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
4673         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
4674         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
4675
4676         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
4677         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4678
4679         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid (copy to upper half for pmull2)
4680
4681         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid (upper half of h34k)
4682
4683         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
4684 .L192_dec_blocks_more_than_2:                                           @ blocks left >  2
4685
4686         rev64   $res0b, $res1b                                          @ GHASH final-2 block
4687         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4688         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap the 64b halves of h3
4689
4690         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4691
4692         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
4693         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
4694
4695         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
4696
4697         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
4698
4699         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
4700         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
4701
4702         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid (low half of h34k)
4703         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4704
4705         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
4706         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
4707
4708         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
4709         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
4710 .L192_dec_blocks_more_than_1:                                           @ blocks left >  1
4711
4712         rev64   $res0b, $res1b                                          @ GHASH final-1 block
4713         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
4714         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4715         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap the 64b halves of h2
4716
4717         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4718         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4719         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4720
4721         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
4722         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
4723         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
4724
4725         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
4726
4727         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
4728
4729         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
4730
4731         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid (copy to upper half for pmull2)
4732
4733         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid (upper half of h12k)
4734
4735         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
4736
4737         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
4738         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
4739 .L192_dec_blocks_less_than_1:                                           @ blocks left <= 1
4740
4741         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
4742         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
4743
4744         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
4745         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
4746
4747         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
4748         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
4749
4750         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
4751
4752         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
4753         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
4754         cmp     $bit_length, #64
4755
4756         csel    $temp2_x, $temp1_x, $temp0_x, lt
4757         csel    $temp3_x, $temp0_x, xzr, lt
4758         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4759         ext     $h1.16b, $h1.16b, $h1.16b, #8
4760
4761         mov     $ctr0.d[1], $temp3_x
4762         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
4763
4764         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
4765
4766         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
4767         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
4768
4769         rev64   $res0b, $res1b                                          @ GHASH final block
4770
4771         st1     { $res4b}, [$output_ptr]                                @ store all 16B
4772
4773         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4774
4775         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
4776         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
4777
4778         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
4779         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
4780         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
4781
4782         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
4783         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
4784
4785         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
4786         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
4787         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4788
4789         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4790         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
4791
4792         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
4793
4794         eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
4795
4796         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4797         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
4798
4799         eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
4800         ext     $acc_lb, $acc_lb, $acc_lb, #8
4801         rev64   $acc_lb, $acc_lb
4802         st1     { $acc_l.16b }, [$current_tag]
4803
4804         mov     x0, $byte_length
4805
4806         ldp     d10, d11, [sp, #16]
4807         ldp     d12, d13, [sp, #32]
4808         ldp     d14, d15, [sp, #48]
4809         ldp     d8, d9, [sp], #80
4810         ret
4811
4812 .L192_dec_ret:
4813         mov w0, #0x0
4814         ret
4815 .size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4816 ___
4817 }
4818
4819 {
4820
4821 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
4822 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
4823 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
4824 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
4825 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
4826 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
4827 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
4828
4829 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
4830 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
4831 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
4832
4833 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
4834 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
4835
4836 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
4837 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
4838 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
4839 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
4840
4841 my $t0="v16";
4842 my $t0d="d16";
4843
4844 my $t1="v29";
4845 my $t2=$res1;
4846 my $t3=$t1;
4847
4848 my $t4=$res0;
4849 my $t5=$res2;
4850 my $t6=$t0;
4851
4852 my $t7=$res3;
4853 my $t8=$res4;
4854 my $t9=$res5;
4855
4856 my $t10=$res6;
4857 my $t11="v21";
4858 my $t12=$t1;
4859
4860 my $rtmp_ctr="v30";
4861 my $rtmp_ctrq="q30";
4862 my $rctr_inc="v31";
4863 my $rctr_incd="d31";
4864
4865 my $mod_constantd=$t0d;
4866 my $mod_constant=$t0;
4867
4868 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
4869 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
4870 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
4871 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
4872 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
4873 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
4874 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
4875 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
4876 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
4877 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
4878 my $rk2q1="v28.1q";
4879 my $rk3q1="v26.1q";
4880 my $rk4v="v27";
4881 #########################################################################################
4882 # size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
4883 #                               size_t len,
4884 #                               unsigned char *out,
4885 #                               const void *key,
4886 #                               unsigned char ivec[16],
4887 #                               u64 *Xi);
4888 #
4889 $code.=<<___;
4890 .global unroll8_eor3_aes_gcm_enc_256_kernel
4891 .type   unroll8_eor3_aes_gcm_enc_256_kernel,%function
4892 .align  4
4893 unroll8_eor3_aes_gcm_enc_256_kernel:
4894         AARCH64_VALID_CALL_TARGET
4895         cbz     x1, .L256_enc_ret
4896         stp     d8, d9, [sp, #-80]!
4897         lsr     $byte_length, $bit_length, #3
4898         mov     $counter, x4
4899         mov     $cc, x5
4900         stp     d10, d11, [sp, #16]
4901         stp     d12, d13, [sp, #32]
4902         stp     d14, d15, [sp, #48]
4903         mov     x5, #0xc200000000000000
4904         stp     x5, xzr, [sp, #64]
4905         add     $modulo_constant, sp, #64
4906
4907         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
4908
4909         mov     $main_end_input_ptr, $byte_length
4910
4911         mov     $constant_temp, #0x100000000                    @ set up counter increment
4912         movi    $rctr_inc.16b, #0x0
4913         mov     $rctr_inc.d[1], $constant_temp
4914         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
4915
4916         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4917
4918         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4919
4920         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
4921
4922         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
4923
4924         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
4925         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
4926
4927         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
4928         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
4929
4930         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
4931         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
4932
4933         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
4934         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
4935
4936         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
4937         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
4938         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
4939
4940         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
4941         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
4942
4943         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
4944
4945         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
4946         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
4947         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
4948
4949         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
4950         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
4951         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
4952
4953         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
4954         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
4955         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
4956
4957         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
4958         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
4959         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
4960
4961         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
4962         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
4963
4964         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
4965
4966         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
4967
4968         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
4969         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
4970         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
4971
4972         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
4973         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
4974         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
4975
4976         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
4977         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
4978         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
4979
4980         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
4981         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
4982         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4983
4984         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
4985
4986         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
4987         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
4988         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
4989
4990         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
4991         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
4992
4993         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
4994         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
4995         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
4996
4997         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
4998         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
4999
5000         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
5001         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
5002         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
5003
5004         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
5005         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
5006         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
5007
5008         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
5009         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
5010         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
5011
5012         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
5013         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
5014         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
5015
5016         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
5017         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
5018         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
5019
5020         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
5021         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
5022         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
5023
5024         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
5025         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
5026         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5027
5028         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
5029         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
5030
5031         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
5032         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
5033         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
5034
5035         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
5036         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
5037
5038         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
5039
5040         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
5041         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
5042
5043         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
5044         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
5045         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
5046
5047         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
5048         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
5049         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
5050
5051         ld1     { $acc_lb}, [$current_tag]
5052         ext     $acc_lb, $acc_lb, $acc_lb, #8
5053         rev64   $acc_lb, $acc_lb
5054         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5055
5056         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
5057         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
5058         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
5059
5060         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
5061         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
5062         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
5063
5064         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
5065
5066         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
5067         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
5068         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
5069
5070         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
5071         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
5072         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
5073
5074         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
5075         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
5076         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
5077
5078         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
5079         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5080         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
5081
5082         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
5083         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
5084         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
5085
5086         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
5087         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
5088         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
5089
5090         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
5091         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5092
5093         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
5094         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
5095         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
5096
5097         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
5098         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
5099         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
5100
5101         aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
5102         aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
5103         aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
5104
5105         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
5106         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
5107
5108         aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
5109         aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
5110
5111         aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
5112         aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
5113         aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
5114
5115         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
5116         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
5117         b.ge    .L256_enc_tail                                          @ handle tail
5118
5119         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
5120
5121         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
5122
5123         eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 0 - result
5124         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
5125         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
5126
5127         eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 1 - result
5128         eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 3 - result
5129
5130         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
5131         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
5132         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
5133
5134         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
5135         eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 2 - result
5136         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
5137
5138         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
5139         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
5140         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
5141
5142         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
5143
5144         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
5145         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
5146
5147         eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 4 - result
5148
5149         eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 7 - result
5150         eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 6 - result
5151         eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 5 - result
5152
5153         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
5154         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
5155
5156         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
5157         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
5158         b.ge    .L256_enc_prepretail                                    @ do prepretail
5159
5160 .L256_enc_main_loop:                                                    @ main loop start
5161         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
5162
5163         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
5164         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
5165         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
5166         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
5167
5168         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
5169         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
5170         ext     $h5.16b, $h5.16b, $h5.16b, #8
5171         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
5172         ext     $h6.16b, $h6.16b, $h6.16b, #8
5173         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
5174
5175         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
5176         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
5177         rev64   $res0b, $res0b                                          @ GHASH block 8k
5178
5179         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
5180         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
5181         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
5182         ext     $h7.16b, $h7.16b, $h7.16b, #8
5183         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
5184         ext     $h8.16b, $h8.16b, $h8.16b, #8
5185
5186         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
5187         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
5188         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
5189
5190         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
5191         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
5192         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
5193
5194         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
5195         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
5196         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
5197
5198         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
5199         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
5200         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
5201
5202         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
5203         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
5204         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
5205
5206         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
5207         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
5208         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
5209
5210         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
5211         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
5212         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
5213
5214         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
5215         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
5216         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
5217
5218         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
5219         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
5220         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
5221
5222         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
5223         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
5224         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
5225
5226         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
5227         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
5228         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
5229
5230         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
5231         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
5232         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
5233
5234         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
5235         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
5236         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
5237
5238         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
5239         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
5240         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
5241
5242         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
5243         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
5244         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
5245
5246         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
5247         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
5248         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
5249
5250         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
5251         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
5252         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5253         ext     $h3.16b, $h3.16b, $h3.16b, #8
5254         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5255         ext     $h4.16b, $h4.16b, $h4.16b, #8
5256
5257         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
5258         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
5259         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
5260
5261         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
5262         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
5263         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
5264
5265         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
5266         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
5267         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
5268
5269         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
5270         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
5271         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
5272
5273         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
5274         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
5275         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
5276
5277         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
5278         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
5279         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
5280
5281         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
5282         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
5283         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
5284
5285         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
5286         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
5287         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
5288
5289         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
5290         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
5291         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
5292
5293         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
5294         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
5295         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
5296
5297         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
5298         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
5299         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
5300
5301         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
5302         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
5303         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
5304
5305         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
5306         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
5307         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
5308
5309         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5310         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
5311         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
5312
5313         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
5314         ext     $h1.16b, $h1.16b, $h1.16b, #8
5315         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
5316         ext     $h2.16b, $h2.16b, $h2.16b, #8
5317         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
5318         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
5319
5320         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5321         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5322         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
5323         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
5324
5325         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
5326         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
5327         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
5328
5329         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
5330         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
5331         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
5332
5333         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
5334         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
5335         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
5336
5337         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
5338         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
5339         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
5340
5341         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
5342         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
5343         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
5344
5345         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
5346         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
5347         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
5348
5349         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
5350         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
5351         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
5352
5353         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
5354         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
5355         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
5356
5357         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
5358         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
5359         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
5360
5361         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
5362         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
5363         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
5364
5365         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5366         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
5367         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
5368
5369         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
5370         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
5371         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
5372
5373         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
5374         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
5375         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
5376
5377         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
5378
5379         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
5380         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
5381         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
5382
5383         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
5384         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
5385         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
5386
5387         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
5388         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
5389         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
5390
5391         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
5392         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
5393         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
5394
5395         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
5396
5397         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5398         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
5399
5400         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
5401         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
5402         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
5403
5404         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
5405         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
5406         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
5407
5408         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
5409         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
5410
5411         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
5412         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
5413
5414         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
5415         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
5416
5417         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
5418         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
5419         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
5420
5421         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
5422         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
5423         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
5424
5425         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
5426         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5427         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
5428
5429         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
5430         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
5431         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
5432
5433         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
5434         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
5435         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
5436
5437         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
5438         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
5439         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
5440
5441         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
5442         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
5443         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
5444
5445         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
5446         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
5447         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
5448
5449         eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 8k+10 - result
5450         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
5451         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
5452
5453         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
5454         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
5455         eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 8k+13 - result
5456
5457         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
5458         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
5459         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
5460
5461         eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 8k+12 - result
5462         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
5463         eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 8k+11 - result
5464
5465         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
5466         eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 8k+9 - result
5467         eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 8k+8 - result
5468
5469         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
5470         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
5471         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
5472
5473         eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 8k+15 - result
5474         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
5475         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
5476
5477         eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 8k+14 - result
5478         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
5479         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
5480
5481         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
5482         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
5483         b.lt    .L256_enc_main_loop
5484
5485 .L256_enc_prepretail:                                                   @ PREPRETAIL
5486         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
5487         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
5488         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
5489
5490         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
5491
5492         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
5493         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
5494
5495         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
5496         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
5497         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
5498
5499         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
5500
5501         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
5502         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
5503         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
5504
5505         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
5506         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
5507
5508         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
5509         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
5510         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
5511
5512         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
5513         rev64   $res0b, $res0b                                          @ GHASH block 8k
5514         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
5515
5516         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
5517         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
5518         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
5519
5520         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
5521         ext     $h7.16b, $h7.16b, $h7.16b, #8
5522         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
5523         ext     $h8.16b, $h8.16b, $h8.16b, #8
5524         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
5525
5526         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
5527         ext     $h5.16b, $h5.16b, $h5.16b, #8
5528         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
5529         ext     $h6.16b, $h6.16b, $h6.16b, #8
5530         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
5531         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
5532
5533         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
5534         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
5535
5536         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
5537         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
5538
5539         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
5540         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
5541         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
5542
5543         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
5544         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
5545         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
5546
5547         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
5548         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
5549         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
5550
5551         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
5552         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
5553         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
5554
5555         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
5556         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
5557         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
5558
5559         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
5560         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
5561         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
5562
5563         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
5564         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
5565
5566         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
5567         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
5568         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
5569
5570         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
5571         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
5572         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
5573
5574         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
5575         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
5576         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
5577
5578         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
5579         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
5580         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
5581
5582         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
5583         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
5584         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
5585
5586         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
5587         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
5588         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
5589
5590         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
5591         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
5592         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
5593
5594         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
5595         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
5596         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
5597
5598         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
5599         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
5600         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
5601
5602         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
5603         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
5604         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
5605
5606         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
5607         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
5608         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
5609
5610         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5611         ext     $h3.16b, $h3.16b, $h3.16b, #8
5612         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5613         ext     $h4.16b, $h4.16b, $h4.16b, #8
5614         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
5615         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
5616
5617         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
5618         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
5619
5620         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
5621         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
5622         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
5623
5624         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
5625         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
5626         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
5627
5628         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
5629         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
5630         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
5631
5632         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5633         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5634         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
5635         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
5636
5637         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
5638         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
5639         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
5640
5641         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
5642         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
5643         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
5644         ext     $h1.16b, $h1.16b, $h1.16b, #8
5645         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
5646         ext     $h2.16b, $h2.16b, $h2.16b, #8
5647
5648         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5649         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
5650         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
5651
5652         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
5653         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
5654
5655         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
5656         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
5657         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
5658
5659         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
5660         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
5661         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
5662
5663         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
5664         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
5665         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
5666
5667         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
5668         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
5669         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
5670
5671         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
5672         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
5673         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
5674
5675         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
5676         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
5677         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
5678
5679         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
5680         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
5681         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
5682
5683         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
5684         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
5685         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
5686
5687         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
5688         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
5689         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
5690
5691         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
5692         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
5693         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
5694
5695         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5696         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
5697         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
5698
5699         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
5700         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
5701         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
5702
5703         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
5704
5705         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
5706         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
5707         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
5708
5709         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
5710         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
5711
5712         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
5713         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
5714         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
5715
5716         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
5717         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
5718         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
5719
5720         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
5721         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
5722         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
5723
5724         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
5725         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
5726         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
5727
5728         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5729         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
5730         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
5731
5732         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
5733         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
5734         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
5735
5736         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
5737         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
5738         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
5739
5740         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
5741         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
5742         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5743
5744         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
5745         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
5746         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
5747
5748         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
5749         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
5750         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
5751
5752         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
5753         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
5754
5755         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
5756         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
5757         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
5758
5759         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
5760         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
5761         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
5762
5763         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
5764         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
5765         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
5766
5767         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
5768         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
5769 .L256_enc_tail:                                                         @ TAIL
5770
5771         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8l | h8h
5772         ext     $h8.16b, $h8.16b, $h8.16b, #8
5773         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
5774
5775         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
5776
5777         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
5778         ext     $h5.16b, $h5.16b, $h5.16b, #8
5779
5780         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
5781         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
5782         ext     $h6.16b, $h6.16b, $h6.16b, #8
5783         ext     $h7.16b, $h7.16b, $h7.16b, #8
5784         mov     $t1.16b, $rk14
5785
5786         cmp     $main_end_input_ptr, #112
5787         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                               @ AES block 8k+8 - result
5788         b.gt    .L256_enc_blocks_more_than_7
5789
5790         movi    $acc_l.8b, #0
5791         mov     $ctr7b, $ctr6b
5792         movi    $acc_h.8b, #0
5793
5794         mov     $ctr6b, $ctr5b
5795         mov     $ctr5b, $ctr4b
5796         mov     $ctr4b, $ctr3b
5797
5798         mov     $ctr3b, $ctr2b
5799         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5800         mov     $ctr2b, $ctr1b
5801
5802         movi    $acc_m.8b, #0
5803         cmp     $main_end_input_ptr, #96
5804         b.gt    .L256_enc_blocks_more_than_6
5805
5806         mov     $ctr7b, $ctr6b
5807         mov     $ctr6b, $ctr5b
5808         cmp     $main_end_input_ptr, #80
5809
5810         mov     $ctr5b, $ctr4b
5811         mov     $ctr4b, $ctr3b
5812         mov     $ctr3b, $ctr1b
5813
5814         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5815         b.gt    .L256_enc_blocks_more_than_5
5816
5817         mov     $ctr7b, $ctr6b
5818         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5819
5820         mov     $ctr6b, $ctr5b
5821         mov     $ctr5b, $ctr4b
5822
5823         cmp     $main_end_input_ptr, #64
5824         mov     $ctr4b, $ctr1b
5825         b.gt    .L256_enc_blocks_more_than_4
5826
5827         cmp     $main_end_input_ptr, #48
5828         mov     $ctr7b, $ctr6b
5829         mov     $ctr6b, $ctr5b
5830
5831         mov     $ctr5b, $ctr1b
5832         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5833         b.gt    .L256_enc_blocks_more_than_3
5834
5835         cmp     $main_end_input_ptr, #32
5836         mov     $ctr7b, $ctr6b
5837         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5838
5839         mov     $ctr6b, $ctr1b
5840         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5841         b.gt    .L256_enc_blocks_more_than_2
5842
5843         mov     $ctr7b, $ctr1b
5844
5845         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5846         cmp     $main_end_input_ptr, #16
5847         b.gt    .L256_enc_blocks_more_than_1
5848
5849         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5850         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5851         b        .L256_enc_blocks_less_than_1
5852 .L256_enc_blocks_more_than_7:                                           @ blocks left >  7
5853         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
5854
5855         rev64   $res0b, $res1b                                          @ GHASH final-7 block
5856
5857         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5858
5859         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
5860
5861         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
5862         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
5863         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
5864
5865         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5866
5867         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
5868         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
5869
5870         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
5871         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
5872 .L256_enc_blocks_more_than_6:                                           @ blocks left >  6
5873
5874         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
5875
5876         rev64   $res0b, $res1b                                          @ GHASH final-6 block
5877
5878         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5879
5880         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
5881         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
5882         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
5883
5884         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
5885
5886         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
5887
5888         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
5889
5890         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
5891         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
5892
5893         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5894
5895         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
5896         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
5897 .L256_enc_blocks_more_than_5:                                           @ blocks left >  5
5898
5899         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
5900
5901         rev64   $res0b, $res1b                                          @ GHASH final-5 block
5902
5903         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5904
5905         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
5906
5907         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
5908
5909         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
5910         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
5911
5912         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
5913
5914         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
5915         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
5916
5917         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
5918         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5919         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
5920
5921         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
5922         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
5923 .L256_enc_blocks_more_than_4:                                           @ blocks left >  4
5924
5925         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
5926
5927         rev64   $res0b, $res1b                                          @ GHASH final-4 block
5928
5929         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
5930
5931         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5932
5933         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
5934         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
5935
5936         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
5937         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
5938
5939         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
5940         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
5941
5942         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
5943
5944         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5945
5946         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
5947         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
5948 .L256_enc_blocks_more_than_3:                                           @ blocks left >  3
5949
5950         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
5951
5952         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5953         ext     $h4.16b, $h4.16b, $h4.16b, #8
5954         rev64   $res0b, $res1b                                          @ GHASH final-3 block
5955
5956         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5957
5958         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
5959         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
5960
5961         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
5962         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
5963         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5964
5965         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
5966         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
5967
5968         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
5969         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
5970
5971         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
5972         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5973
5974         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
5975         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
5976 .L256_enc_blocks_more_than_2:                                           @ blocks left >  2
5977
5978         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5979         ext     $h3.16b, $h3.16b, $h3.16b, #8
5980
5981         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
5982
5983         rev64   $res0b, $res1b                                          @ GHASH final-2 block
5984         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
5985
5986         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5987
5988         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
5989
5990         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5991
5992         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
5993         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
5994
5995         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
5996
5997         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
5998
5999         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
6000         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
6001
6002         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
6003         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
6004 .L256_enc_blocks_more_than_1:                                           @ blocks left >  1
6005
6006         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
6007
6008         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6009         ext     $h2.16b, $h2.16b, $h2.16b, #8
6010         rev64   $res0b, $res1b                                          @ GHASH final-1 block
6011         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
6012
6013         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
6014         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
6015
6016         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
6017         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
6018
6019         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
6020         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
6021
6022         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
6023         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
6024
6025         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6026
6027         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
6028         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
6029
6030         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
6031
6032         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
6033 .L256_enc_blocks_less_than_1:                                           @ blocks left <= 1
6034
6035         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
6036
6037         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
6038
6039         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
6040
6041         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
6042         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
6043
6044         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
6045         cmp     $bit_length, #64
6046         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
6047
6048         csel    $temp3_x, $temp0_x, xzr, lt
6049         csel    $temp2_x, $temp1_x, $temp0_x, lt
6050
6051         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
6052         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6053         ext     $h1.16b, $h1.16b, $h1.16b, #8
6054
6055         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
6056         mov     $ctr0.d[1], $temp3_x
6057
6058         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
6059
6060         rev64   $res0b, $res1b                                          @ GHASH final block
6061
6062         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
6063         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
6064         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
6065
6066         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
6067         st1     { $res1b}, [$output_ptr]                                @ store all 16B
6068
6069         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
6070         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
6071         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
6072
6073         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
6074         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
6075
6076         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
6077
6078         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
6079
6080         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
6081         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6082
6083         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
6084
6085         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6086         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6087
6088         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6089
6090         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6091         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6092
6093         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
6094                 ext     $acc_lb, $acc_lb, $acc_lb, #8
6095         rev64   $acc_lb, $acc_lb
6096         st1     { $acc_l.16b }, [$current_tag]
6097         mov     x0, $byte_length                                        @ return sizes
6098
6099         ldp     d10, d11, [sp, #16]
6100         ldp     d12, d13, [sp, #32]
6101         ldp     d14, d15, [sp, #48]
6102         ldp     d8, d9, [sp], #80
6103         ret
6104
6105 .L256_enc_ret:
6106         mov w0, #0x0
6107         ret
6108 .size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6109 ___
6110
6111 {
6112 #########################################################################################
6113 # size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
6114 #                               size_t len,            /* in bits; code shifts >>3 for bytes */
6115 #                               unsigned char *out,
6116 #                               u64 *Xi,
6117 #                               unsigned char ivec[16], /* read via x4 */
6118 #                               const void *key);       /* round keys, read via x5 */
6119 #
6120 $code.=<<___;
6121 .global unroll8_eor3_aes_gcm_dec_256_kernel
6122 .type   unroll8_eor3_aes_gcm_dec_256_kernel,%function
6123 .align  4
6124 unroll8_eor3_aes_gcm_dec_256_kernel:
6125         AARCH64_VALID_CALL_TARGET
6126         cbz     x1, .L256_dec_ret
6127         stp     d8, d9, [sp, #-80]!
6128         lsr     $byte_length, $bit_length, #3
6129         mov     $counter, x4
6130         mov     $cc, x5
6131         stp     d10, d11, [sp, #16]
6132         stp     d12, d13, [sp, #32]
6133         stp     d14, d15, [sp, #48]
6134         mov     x5, #0xc200000000000000
6135         stp     x5, xzr, [sp, #64]
6136         add     $modulo_constant, sp, #64
6137
6138         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
6139
6140         mov     $constant_temp, #0x100000000                    @ set up counter increment
6141         movi    $rctr_inc.16b, #0x0
6142         mov     $rctr_inc.d[1], $constant_temp
6143         mov     $main_end_input_ptr, $byte_length
6144
6145         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
6146
6147         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
6148
6149         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
6150
6151         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
6152         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
6153
6154         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
6155         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
6156         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6157
6158         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
6159         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
6160
6161         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
6162         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
6163
6164         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
6165
6166         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
6167         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
6168
6169         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
6170         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
6171
6172         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
6173         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
6174
6175         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
6176         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
6177
6178         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
6179         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
6180
6181         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
6182         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
6183         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6184
6185         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
6186         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
6187         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
6188
6189         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
6190         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
6191         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
6192
6193         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
6194         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
6195
6196         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
6197         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
6198         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
6199
6200         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
6201         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
6202         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
6203
6204         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
6205         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
6206         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6207
6208         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
6209         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
6210
6211         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
6212         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
6213
6214         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
6215         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
6216         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
6217
6218         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
6219
6220         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
6221         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
6222
6223         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
6224         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
6225         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
6226
6227         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
6228         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
6229         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
6230
6231         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
6232         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
6233
6234         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6235         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
6236         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
6237
6238         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
6239
6240         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
6241         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
6242
6243         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
6244
6245         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
6246         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
6247         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
6248
6249         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
6250         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
6251         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
6252
6253         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
6254         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
6255         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6256
6257         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
6258         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
6259
6260         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
6261         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
6262         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
6263
6264         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
6265         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
6266         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
6267
6268         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6269         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
6270         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
6271
6272         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
6273         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
6274         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
6275
6276         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
6277         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
6278         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
6279
6280         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
6281
6282         ld1     { $acc_lb}, [$current_tag]
6283         ext     $acc_lb, $acc_lb, $acc_lb, #8
6284         rev64   $acc_lb, $acc_lb
6285         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6286         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6287         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
6288
6289         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
6290         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
6291
6292         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
6293         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
6294
6295         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
6296
6297         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
6298         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
6299
6300         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
6301         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
6302         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
6303
6304         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
6305         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
6306         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
6307
6308         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
6309         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
6310         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6311
6312         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
6313         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6314
6315         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
6316         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
6317         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
6318
6319         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
6320         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
6321         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
6322
6323         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
6324         ldr     $rk14q, [$cc, #224]                                     @ load rk14
6325
6326         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
6327         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
6328         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
6329
6330         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
6331         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
6332         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
6333
6334         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
6335         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
6336         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
6337
6338         aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
6339         aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
6340         aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
6341
6342         aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
6343         aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
6344         aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
6345
6346         aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
6347         aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
6348         b.ge    .L256_dec_tail                                          @ handle tail
6349
6350         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
6351
6352         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
6353
6354         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
6355
6356         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
6357         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
6358
6359         eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 1 - result
6360         eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 0 - result
6361         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
6362
6363         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
6364         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
6365         eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 3 - result
6366
6367         eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 5 - result
6368
6369         eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 4 - result
6370         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
6371         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
6372
6373         eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 2 - result
6374         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
6375
6376         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
6377         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
6378
6379         eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 6 - result
6380
6381         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
6382         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
6383         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
6384
6385         eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 7 - result
6386         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
6387
6388         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
6389         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
6390         b.ge    .L256_dec_prepretail                                    @ do prepretail
6391
6392 .L256_dec_main_loop:                                                    @ main loop start
6393         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
6394         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6395         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
6396
6397         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
6398         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
6399         ext     $h7.16b, $h7.16b, $h7.16b, #8
6400         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
6401         ext     $h8.16b, $h8.16b, $h8.16b, #8
6402
6403         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
6404         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
6405         rev64   $res0b, $res0b                                          @ GHASH block 8k
6406
6407         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
6408         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
6409         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
6410
6411         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
6412         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
6413
6414         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
6415         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
6416         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
6417
6418         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
6419         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
6420         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
6421
6422         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
6423         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
6424         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6425
6426         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
6427         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
6428         ext     $h5.16b, $h5.16b, $h5.16b, #8
6429         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
6430         ext     $h6.16b, $h6.16b, $h6.16b, #8
6431         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
6432
6433         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
6434         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
6435         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
6436
6437         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
6438         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
6439         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
6440
6441         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
6442         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
6443         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
6444
6445         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
6446         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
6447         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
6448
6449         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
6450         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
6451         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
6452
6453         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
6454         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
6455         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
6456
6457         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6458         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
6459         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
6460
6461         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
6462         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
6463         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
6464
6465         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
6466         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
6467         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
6468
6469         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
6470         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
6471         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
6472
6473         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
6474         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
6475         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
6476
6477         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
6478         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
6479         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
6480
6481         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
6482         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
6483         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
6484
6485         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
6486         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
6487         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
6488
6489         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
6490         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
6491         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
6492         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
6493
6494         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6495         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
6496         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
6497
6498         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
6499         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
6500         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
6501
6502         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
6503         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
6504         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
6505
6506         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
6507         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
6508         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
6509
6510         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
6511         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
6512         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
6513
6514         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
6515         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
6516         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
6517
6518         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
6519         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
6520         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
6521
6522         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
6523         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
6524         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
6525
6526         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
6527         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
6528         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
6529
6530         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
6531         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
6532         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
6533
6534         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
6535         ext     $h3.16b, $h3.16b, $h3.16b, #8
6536         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
6537         ext     $h4.16b, $h4.16b, $h4.16b, #8
6538         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
6539         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
6540
6541         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
6542         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
6543         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6544
6545         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6546         ext     $h1.16b, $h1.16b, $h1.16b, #8
6547         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6548         ext     $h2.16b, $h2.16b, $h2.16b, #8
6549         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
6550         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
6551
6552         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
6553         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
6554         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
6555
6556         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6557         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
6558         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
6559         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
6560
6561         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
6562         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
6563         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
6564
6565         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
6566         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
6567         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
6568
6569         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
6570         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
6571         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
6572
6573         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
6574         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
6575         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
6576
6577         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
6578         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
6579         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
6580
6581         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6582         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
6583         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
6584
6585         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
6586         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
6587         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
6588
6589         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
6590         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
6591         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
6592
6593         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
6594         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
6595         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
6596
6597         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
6598         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
6599         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
6600
6601         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
6602         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
6603         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
6604
6605         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
6606         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
6607         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
6608
6609         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
6610         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
6611         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
6612
6613         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
6614         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
6615         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
6616
6617         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
6618         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
6619         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
6620
6621         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
6622         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
6623         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
6624
6625         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
6626         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
6627         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6628
6629         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
6630         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
6631         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6632
6633         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
6634         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
6635
6636         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
6637         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
6638         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
6639
6640         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
6641         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
6642         ext     $t11.16b, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
6643
6644         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
6645         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
6646         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
6647
6648         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
6649         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
6650         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
6651
6652         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
6653         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
6654         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6655
6656         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6657         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
6658         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
6659
6660         ldr     $rk14q, [$cc, #224]                                     @ load rk14
6661         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
6662         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
6663
6664         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6665         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
6666         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
6667
6668         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
6669         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
6670         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
6671
6672         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
6673         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
6674         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
6675
6676         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
6677         eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 8k+10 - result
6678         eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 8k+9 - result
6679
6680         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6681         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
6682
6683         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
6684         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6685         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
6686
6687         eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 8k+13 - result
6688         eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 8k+8 - result
6689         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
6690
6691         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
6692         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
6693         eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 8k+12 - result
6694
6695         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
6696         eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 8k+11 - result
6697         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
6698
6699         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
6700         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
6701         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
6702
6703         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
6704         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
6705         eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 8k+15 - result
6706
6707         eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 8k+14 - result
6708         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
6709         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
6710
6711         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
6712         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
6713         b.lt    .L256_dec_main_loop
6714
6715 .L256_dec_prepretail:                                                   @ PREPRETAIL - GHASH blocks 8k..8k+7 held in res0-res7 and run AES rounds 0-13 on CTR blocks 8k+8..8k+15; no input load or output store in this section
6716         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6717         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
6718         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
6719
6720         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
6721         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
6722         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
6723
6724         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
6725         rev64   $res0b, $res0b                                          @ GHASH block 8k
6726         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
6727
6728         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap 64-bit halves of acc_l
6729         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
6730         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap 64-bit halves of h7
6731         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
6732         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap 64-bit halves of h8
6733         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
6734
6735         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
6736         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
6737         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
6738         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap 64-bit halves of h5
6739         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
6740         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap 64-bit halves of h6
6741
6742         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
6743         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
6744         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
6745
6746         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
6747         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
6748         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
6749
6750         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
6751         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
6752         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
6753
6754         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6755         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
6756         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - fold acc_l into GHASH block 8k
6757
6758         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
6759         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
6760         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
6761
6762         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
6763         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
6764         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
6765
6766         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
6767         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
6768         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
6769
6770         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
6771         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
6772
6773         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
6774         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
6775         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
6776
6777         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
6778         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
6779         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
6780
6781         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
6782         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
6783
6784         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
6785         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
6786
6787         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
6788         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
6789         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
6790
6791         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
6792         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
6793         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
6794
6795         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6796         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
6797         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
6798
6799         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
6800         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
6801         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
6802
6803         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
6804         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
6805         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
6806
6807         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
6808         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
6809         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
6810
6811         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
6812         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
6813         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
6814
6815         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
6816         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
6817         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
6818
6819         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
6820         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6821         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap 64-bit halves of h1
6822         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6823         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap 64-bit halves of h2
6824         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
6825
6826         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
6827         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
6828         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
6829
6830         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
6831         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
6832         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
6833
6834         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
6835         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
6836         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
6837
6838         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
6839         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
6840         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
6841
6842         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
6843         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
6844         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
6845
6846         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
6847         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
6848         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6849
6850         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
6851         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap 64-bit halves of h3
6852         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
6853         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap 64-bit halves of h4
6854         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
6855         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
6856
6857         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
6858
6859         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
6860
6861         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
6862         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6863         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
6864         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
6865
6866         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
6867         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
6868
6869         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
6870         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
6871         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
6872
6873         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
6874         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
6875         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
6876
6877         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
6878         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
6879         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
6880
6881         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
6882         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
6883         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
6884
6885         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6886         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
6887         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
6888
6889         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
6890         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
6891
6892         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
6893         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
6894         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
6895
6896         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
6897         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
6898         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
6899
6900         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
6901         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
6902         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
6903
6904         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
6905         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
6906         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
6907
6908         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
6909         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
6910         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
6911
6912         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
6913         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
6914         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
6915
6916         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
6917         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
6918         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
6919
6920         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
6921         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
6922         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
6923
6924         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
6925         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
6926         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
6927
6928         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6929         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
6930         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
6931
6932         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
6933         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
6934         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
6935
6936         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
6937         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
6938         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6939
6940         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
6941
6942         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
6943         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
6944         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
6945
6946         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
6947         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
6948         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
6949
6950         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6951
6952         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
6953         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
6954         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6955
6956         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
6957
6958         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
6959         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
6960         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
6961
6962         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6963         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
6964
6965         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
6966         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
6967         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
6968
6969         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
6970         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
6971
6972         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6973
6974         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13 (aese only, no aesmc; same for every round 13 below)
6975         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
6976         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
6977
6978         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6979         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
6980         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
6981
6982         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
6983         ldr     $rk14q, [$cc, #224]                                     @ load rk14
6984         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
6985
6986         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
6987         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6988         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
6989
6990         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
6991         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
6992         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
6993
6994         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
6995         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
6996         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
6997
6998         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
6999         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
7000 .L256_dec_tail:                                                         @ TAIL
7001
7002         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
7003         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
7004         cmp     $main_end_input_ptr, #112
7005
7006         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
7007
7008         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k
7009         ext     $h8.16b, $h8.16b, $h8.16b, #8
7010         mov     $t1.16b, $rk14
7011
7012         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
7013         ext     $h5.16b, $h5.16b, $h5.16b, #8
7014
7015         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
7016         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
7017         ext     $h6.16b, $h6.16b, $h6.16b, #8
7018         ext     $h7.16b, $h7.16b, $h7.16b, #8
7019         b.gt    .L256_dec_blocks_more_than_7
7020
7021         mov     $ctr7b, $ctr6b
7022         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7023         mov     $ctr6b, $ctr5b
7024
7025         mov     $ctr5b, $ctr4b
7026         mov     $ctr4b, $ctr3b
7027         movi    $acc_l.8b, #0
7028
7029         movi    $acc_h.8b, #0
7030         movi    $acc_m.8b, #0
7031         mov     $ctr3b, $ctr2b
7032
7033         cmp     $main_end_input_ptr, #96
7034         mov     $ctr2b, $ctr1b
7035         b.gt    .L256_dec_blocks_more_than_6
7036
7037         mov     $ctr7b, $ctr6b
7038         mov     $ctr6b, $ctr5b
7039
7040         mov     $ctr5b, $ctr4b
7041         cmp     $main_end_input_ptr, #80
7042         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7043
7044         mov     $ctr4b, $ctr3b
7045         mov     $ctr3b, $ctr1b
7046         b.gt    .L256_dec_blocks_more_than_5
7047
7048         cmp     $main_end_input_ptr, #64
7049         mov     $ctr7b, $ctr6b
7050         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7051
7052         mov     $ctr6b, $ctr5b
7053
7054         mov     $ctr5b, $ctr4b
7055         mov     $ctr4b, $ctr1b
7056         b.gt    .L256_dec_blocks_more_than_4
7057
7058         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059         mov     $ctr7b, $ctr6b
7060         cmp     $main_end_input_ptr, #48
7061
7062         mov     $ctr6b, $ctr5b
7063         mov     $ctr5b, $ctr1b
7064         b.gt    .L256_dec_blocks_more_than_3
7065
7066         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
7067         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7068         mov     $ctr7b, $ctr6b
7069
7070         cmp     $main_end_input_ptr, #32
7071         mov     $ctr6b, $ctr1b
7072         b.gt    .L256_dec_blocks_more_than_2
7073
7074         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7075
7076         mov     $ctr7b, $ctr1b
7077         cmp     $main_end_input_ptr, #16
7078         b.gt    .L256_dec_blocks_more_than_1
7079
7080         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7081         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
7082         b        .L256_dec_blocks_less_than_1
7083 .L256_dec_blocks_more_than_7:                                           @ blocks left >  7
7084         rev64   $res0b, $res1b                                          @ GHASH final-7 block
7085         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
7086         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
7087
7088         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
7089
7090         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7091
7092         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
7093         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
7094
7095         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
7096
7097         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
7098         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7099
7100         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
7101         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
7102 .L256_dec_blocks_more_than_6:                                           @ blocks left >  6
7103
7104         rev64   $res0b, $res1b                                          @ GHASH final-6 block
7105
7106         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7107         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
7108         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7109
7110         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
7111         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
7112         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
7113
7114         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
7115
7116         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
7117         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
7118         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
7119
7120         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
7121
7122         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
7123         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
7124 .L256_dec_blocks_more_than_5:                                           @ blocks left >  5
7125
7126         rev64   $res0b, $res1b                                          @ GHASH final-5 block
7127
7128         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7129
7130         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
7131         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
7132
7133         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
7134
7135         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
7136         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
7137
7138         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
7139         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
7140
7141         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
7142
7143         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
7144         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
7145         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
7146
7147         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
7148         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7149 .L256_dec_blocks_more_than_4:                                           @ blocks left >  4
7150
7151         rev64   $res0b, $res1b                                          @ GHASH final-4 block
7152
7153         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7154
7155         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
7156         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
7157
7158         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7159
7160         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
7161         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
7162
7163         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
7164
7165         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
7166
7167         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
7168
7169         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
7170         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
7171
7172         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
7173         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
7174 .L256_dec_blocks_more_than_3:                                           @ blocks left >  3
7175
7176         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
7177         ext     $h4.16b, $h4.16b, $h4.16b, #8
7178         rev64   $res0b, $res1b                                          @ GHASH final-3 block
7179
7180         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7181         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
7182         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
7183
7184         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
7185         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
7186
7187         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
7188
7189         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
7190
7191         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
7192         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
7193         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
7194
7195         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7196         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
7197         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
7198
7199         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
7200
7201         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
7202 .L256_dec_blocks_more_than_2:                                           @ blocks left >  2
7203
7204         rev64   $res0b, $res1b                                          @ GHASH final-2 block
7205
7206         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
7207         ext     $h3.16b, $h3.16b, $h3.16b, #8
7208         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
7209
7210         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7211
7212         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
7213
7214         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
7215         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
7216         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
7217
7218         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
7219         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
7220         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7221
7222         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
7223         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
7224
7225         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
7226         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
7227 .L256_dec_blocks_more_than_1:                                           @ blocks left >  1
7228
7229         rev64   $res0b, $res1b                                          @ GHASH final-1 block
7230
7231         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7232
7233         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
7234         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
7235         ext     $h2.16b, $h2.16b, $h2.16b, #8
7236
7237         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
7238         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
7239         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
7240
7241         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
7242         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
7243
7244         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
7245
7246         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
7247
7248         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
7249         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
7250
7251         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
7252
7253         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
7254         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
7255
7256         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
7257 .L256_dec_blocks_less_than_1:                                           @ blocks left <= 1
7258
7259         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
7260         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
7261         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
7262
7263         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
7264         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
7265         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
7266
7267         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
7268
7269         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
7270
7271         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
7272         cmp     $bit_length, #64
7273         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
7274
7275         csel    $temp3_x, $temp0_x, xzr, lt
7276         csel    $temp2_x, $temp1_x, $temp0_x, lt
7277
7278         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
7279         mov     $ctr0.d[1], $temp3_x
7280
7281         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
7282         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
7283         ext     $h1.16b, $h1.16b, $h1.16b, #8
7284         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
7285
7286         rev64   $res0b, $res1b                                          @ GHASH final block
7287
7288         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7289
7290         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
7291         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
7292
7293         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
7294
7295         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
7296         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
7297
7298         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
7299
7300         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
7301         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
7302         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
7303
7304         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
7305         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
7306
7307         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
7308         st1     { $res4b}, [$output_ptr]                                @ store all 16B
7309
7310         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
7311
7312         eor     $t11.16b, $acc_hb, $t11.16b                             @ MODULO - fold into mid
7313         eor     $acc_mb, $acc_mb, $t11.16b                              @ MODULO - fold into mid
7314
7315         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
7316
7317         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
7318         eor     $acc_lb, $acc_lb, $acc_hb                               @ MODULO - fold into low
7319
7320         eor     $acc_lb, $acc_lb, $acc_mb                               @ MODULO - fold into low
7321         ext     $acc_lb, $acc_lb, $acc_lb, #8
7322         rev64   $acc_lb, $acc_lb
7323         st1     { $acc_l.16b }, [$current_tag]
7324         mov     x0, $byte_length
7325
7326         ldp     d10, d11, [sp, #16]
7327         ldp     d12, d13, [sp, #32]
7328         ldp     d14, d15, [sp, #48]
7329         ldp     d8, d9, [sp], #80
7330         ret
7331
7332 .L256_dec_ret:
7333         mov w0, #0x0
7334         ret
7335 .size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7336 ___
7337 }
7338 }
7339
# Trailer for the generated assembly: an identification string, an alignment
# directive, and the #endif that closes a preprocessor conditional
# (presumably opened near the top of the generated output -- not visible here).
$code .= <<"___";
.asciz  "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
.align  2
#endif
___
7345
{
    # Base instruction words for the SHA3-extension mnemonics handled by
    # unsha3(); register numbers and the immediate are OR-ed into them.
    my %opcode = (
        "rax1" => 0xce608c00,
        "eor3" => 0xce000000,
        "bcax" => 0xce200000,
        "xar"  => 0xce800000,
    );

    # unsha3($mnemonic, $arg)
    #
    # Hand-encode one eor3/rax1/bcax/xar instruction as a raw ".inst" word so
    # that the output does not depend on the assembler knowing the mnemonic.
    #
    #   $mnemonic - key into %opcode ("eor3", "rax1", "bcax" or "xar")
    #   $arg      - operand text following the mnemonic (may carry a trailing
    #               comment); two to four comma-separated q/v registers, the
    #               fourth operand may instead be a "#imm" immediate
    #
    # Returns ".inst\t0x%08x\t//<mnemonic> <arg>".  When the operand text
    # cannot be parsed the instruction is returned unencoded as
    # "<mnemonic>\t<arg>", so the caller's s///e never silently erases an
    # instruction from the generated assembly.
    sub unsha3 {
        my ($mnemonic, $arg) = @_;

        # List-context match: empty list on failure, undef for optional
        # operands that did not take part in the match.
        my ($rd, $rn, $rm, $ra) = $arg =~
            m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/;

        # Unparseable operands: hand the instruction back as-is.
        return "$mnemonic\t$arg" unless defined $rd;

        # Missing third/fourth operands contribute zero bits.
        $rm = 0 unless defined $rm;

        # The capture is limited to digits and '-', so eval only ever sees a
        # plain number or a subtraction such as "64-5".
        $ra = defined $ra ? eval($ra) : 0;
        $ra = 0 unless defined $ra;     # malformed expression, e.g. "5-"

        # Field placement: operand 1 -> bits 0+, operand 2 -> bits 5+,
        # operand 3 -> bits 16+, operand 4 / immediate -> bits 10+.
        return sprintf ".inst\t0x%08x\t//%s %s",
            $opcode{$mnemonic} | $rd | ($rn << 5) | ($rm << 16) | ($ra << 10),
            $mnemonic, $arg;
    }

    # unvmov($arg)
    #
    # Turn an operand pair written as "qN#lo|hi, qM#lo|hi" into
    # "ins    vX.d[i],vY.d[j]": "lo" selects d[0], "hi" selects d[1], and
    # register numbers of 8 and above are shifted up by 8.  Returns a false
    # value when $arg does not have that shape.  The loop below does not
    # call this helper.
    sub unvmov {
        my $arg = shift;

        return $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/
            && sprintf("ins    v%d.d[%d],v%d.d[%d]",
                       $1 < 8 ? $1 : $1 + 8, ($2 eq "lo") ? 0 : 1,
                       $3 < 8 ? $3 : $3 + 8, ($4 eq "lo") ? 0 : 1);
    }

    # Post-process the accumulated assembly line by line and print it.
    foreach my $line (split("\n", $code)) {
        $line =~ s{\@\s}{//};                   # old->new style commentary
        $line =~ s/\`([^\`]*)\`/eval($1)/ge;    # evaluate `...` expressions

        # ld1r lines get their .16b arrangement rewritten to .2d; every other
        # line (and an ld1r line with no .16b) goes through the SHA3
        # hand-encoder instead.
        unless ($line =~ m/\bld1r\b/ and $line =~ s/\.16b/.2d/g) {
            $line =~ s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;
        }
        print $line, "\n";
    }
}
7377
# Close STDOUT explicitly so buffered output is flushed here; a failed
# flush/close makes the script die instead of leaving truncated output.
close(STDOUT) || die "error closing STDOUT: $!";