Update copyright year
[openssl.git] / crypto / modes / asm / aes-gcm-armv8-unroll8_64.pl
1 #! /usr/bin/env perl
2 # Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 #========================================================================
11 # Written by Xiaokang Qian <xiaokang.qian@arm.com> for the OpenSSL project,
12 # derived from https://github.com/ARM-software/AArch64cryptolib, original
13 # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14 # licensed under OpenSSL and SPDX BSD-3-Clause licenses depending on where you
15 # obtain it.
16 #========================================================================
17 #
18 # Approach - We want to reload constants as we have plenty of spare ASIMD slots around crypto units for loading
19 # Unroll x8 in main loop, main loop to act on 8 16B blocks per iteration, and then do modulo of the accumulated
20 # intermediate hashes from the 8 blocks.
21 #
22 #  ____________________________________________________
23 # |                                                    |
24 # | PRE                                                |
25 # |____________________________________________________|
26 # |                |                |                  |
27 # | CTR block 8k+13| AES block 8k+8 | GHASH block 8k+0 |
28 # |________________|________________|__________________|
29 # |                |                |                  |
30 # | CTR block 8k+14| AES block 8k+9 | GHASH block 8k+1 |
31 # |________________|________________|__________________|
32 # |                |                |                  |
33 # | CTR block 8k+15| AES block 8k+10| GHASH block 8k+2 |
34 # |________________|________________|__________________|
35 # |                |                |                  |
36 # | CTR block 8k+16| AES block 8k+11| GHASH block 8k+3 |
37 # |________________|________________|__________________|
38 # |                |                |                  |
39 # | CTR block 8k+17| AES block 8k+12| GHASH block 8k+4 |
40 # |________________|________________|__________________|
41 # |                |                |                  |
42 # | CTR block 8k+18| AES block 8k+13| GHASH block 8k+5 |
43 # |________________|________________|__________________|
44 # |                |                |                  |
45 # | CTR block 8k+19| AES block 8k+14| GHASH block 8k+6 |
46 # |________________|________________|__________________|
47 # |                |                |                  |
48 # | CTR block 8k+20| AES block 8k+15| GHASH block 8k+7 |
49 # |________________|____(mostly)____|__________________|
50 # |                                                    |
51 # | MODULO                                             |
52 # |____________________________________________________|
53 #
54 # PRE:
55 #     Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 8k+0
56 # EXT low_acc, low_acc, low_acc, #8
57 # EOR res_curr (8k+0), res_curr (8k+0), low_acc
58 #
59 # CTR block:
60 #     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
61 # REV     ctr32, rev_ctr32
62 # ORR     ctr64, constctr96_top32, ctr32, LSL #32
63 # INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
64 # INS     ctr_next.d[1], ctr64X
65 # ADD     rev_ctr32, #1
66 #
67 # AES block:
68 #      Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key below as an example.
69 #      Doing a small trick here of loading input in scalar registers, EORing with the last round key and then transferring
70 #      to SIMD registers. Given we are very constrained in our ASIMD registers this is quite important.
71 #
72 #      Encrypt:
73 # LDR     input_low, [ input_ptr  ], #8
74 # LDR     input_high, [ input_ptr  ], #8
75 # EOR     input_low, k14_low
76 # EOR     input_high, k14_high
77 # INS     res_curr.d[0], input_low
78 # INS     res_curr.d[1], input_high
79 # AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
80 # AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
81 # AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
82 # AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
83 # AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
84 # AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
85 # AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
86 # AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
87 # AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
88 # AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
89 # AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
90 # AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
91 # AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
92 # AESE    ctr_curr, k13
93 # EOR     res_curr, res_curr, ctr_curr
94 # ST1     { res_curr.16b  }, [ output_ptr  ], #16
95 #
96 #     Decrypt:
97 # AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
98 # AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
99 # AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
100 # AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
101 # AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
102 # AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
103 # AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
104 # AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
105 # AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
106 # AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
107 # AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
108 # AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
109 # AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
110 # AESE    ctr_curr, k13
111 # LDR     res_curr, [ input_ptr  ], #16
112 # EOR     res_curr, res_curr, ctr_curr
113 # MOV     output_low, res_curr.d[0]
114 # MOV     output_high, res_curr.d[1]
115 # EOR     output_low, k14_low
116 # EOR     output_high, k14_high
117 # STP     output_low, output_high, [ output_ptr  ], #16
118
119 # GHASH block X:
120 #     Do 128b karatsuba polynomial multiplication on block
121 #     We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
122 #
123 # multiplication:
124 #     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
125 #
126 #     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
127 #     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
128 #
129 #     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
130 #     multiplying with "twisted" powers of H
131 #
132 # Note: We can PMULL directly into the acc_x in first GHASH of the loop
133 # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
134 #       path latency dominates the performance
135 #
136 #       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
137 #       than indicated here
138 # REV64   res_curr, res_curr
139 # INS     t_m.d[0], res_curr.d[1]
140 # EOR     t_m.8B, t_m.8B, res_curr.8B
141 # PMULL2  t_h, res_curr, HX
142 # PMULL   t_l, res_curr, HX
143 # PMULL   t_m, t_m, HX_k
144 # EOR     acc_h, acc_h, t_h
145 # EOR     acc_l, acc_l, t_l
146 # EOR     acc_m, acc_m, t_m
147 #
148 # MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
149 #         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
150 #         with a reversed constant
151 # EOR3    acc_m, acc_m, acc_l, acc_h                     // Finish off karatsuba processing
152 # PMULL   t_mod, acc_h, mod_constant
153 # EXT     acc_h, acc_h, acc_h, #8
154 # EOR3     acc_m, acc_m, t_mod, acc_h
155 # PMULL   acc_h, acc_m, mod_constant
156 # EXT     acc_m, acc_m, acc_m, #8
157 # EOR3    acc_l, acc_l, acc_m, acc_h
158
159 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;   # last argument, if it ends in ".ext", is the output file
160 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;         # first argument, if it contains no dot, is the perlasm flavour
161 # Locate arm-xlate.pl: first next to this script, then in ../../perlasm relative to it.
162 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
163 ( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
164 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
165 die "can't locate arm-xlate.pl";
166 # Only 64-bit flavours are supported; a missing flavour fails this check as well.
167 die "only for 64 bit" if $flavour !~ /64/;
168 # Pipe everything printed to STDOUT through arm-xlate.pl. Quote the output path and stop if the pipe cannot be opened.
169 open OUT,"| \"$^X\" $xlate $flavour \"$output\"" or die "can't call $xlate: $!";
170 *STDOUT=*OUT;
171 # Start the generated assembly text: include arm_arch.h and open an __ARM_MAX_ARCH__>=8 conditional (its #endif is not in this part of the file).
172 $code=<<___;
173 #include "arm_arch.h"
174
175 #if __ARM_MAX_ARCH__>=8
176 ___
177 $code.=".arch   armv8.2-a+crypto\n.text\n";   # set the assembler architecture level and switch to the text section
178
179 $input_ptr="x0";  # argument block: x0 is the input pointer, advanced by the post-indexed loads in the kernel
180 $bit_length="x1";         # length in bits; the kernel shifts it right by 3 to get the byte length
181 $output_ptr="x2";         # output pointer, advanced by the post-indexed stores in the kernel
182 $current_tag="x3";        # base pointer for the current tag and for the hash-key values read at fixed offsets from it
183 $counter="x16";           # the kernel copies x4 here and loads CTR block 0 from this address
184 $constant_temp="x15";     # scratch register used to build the counter increment constant
185 $modulo_constant="x10";   # set by the kernel to sp+64, where it stores 0xc200000000000000
186 $cc="x8";                 # the kernel copies x5 here and loads the round keys relative to it
187 {
188 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));   # scalar registers x4..x7
189 my ($temp2_x,$temp3_x)=map("x$_",(13..14));   # scalar registers x13, x14
190 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));   # .16b views: ctr0..7 are v0..v7, res0..7 are v8..v15
191 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));   # the same v0..v15 without an arrangement suffix
192 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));   # d names of the ctr registers
193 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));   # q names of the ctr registers
194 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));   # q names of the res registers
195 # ctr_t0..ctr_t7 are alternate names for v8..v15, i.e. the same registers as res0..res7.
196 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
197 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
198 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
199 # GHASH accumulators (high, middle, low) live in v17, v18 and v19.
200 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
201 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
202 # Hash-key registers: the h1..h4 group and the h5..h8 group both map to v20..v25, so the two groups share registers and cannot be held at the same time.
203 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
204 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
205 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
206 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
207 # Temporaries: only t0 (v16) and t1 (v29) have registers of their own; the other t names alias res registers, v21 (also h12k/h56k), t0 or t1.
208 my $t0="v16";
209 my $t0d="d16";
210
211 my $t1="v29";
212 my $t2=$res1;
213 my $t3=$t1;
214
215 my $t4=$res0;
216 my $t5=$res2;
217 my $t6=$t0;
218
219 my $t7=$res3;
220 my $t8=$res4;
221 my $t9=$res5;
222
223 my $t10=$res6;
224 my $t11="v21";
225 my $t12=$t1;
226 # Vector counter state: in the kernel below rtmp_ctr holds the byte-reversed counter and rctr_inc the per-block increment.
227 my $rtmp_ctr="v30";
228 my $rtmp_ctrq="q30";
229 my $rctr_inc="v31";
230 my $rctr_incd="d31";
231 # mod_constant is another name for t0 (v16 / d16).
232 my $mod_constantd=$t0d;
233 my $mod_constant=$t0;
234 # Round keys: rk0..rk14 all cycle through v26, v27 and v28 (rkN uses v26 + N mod 3), so the kernel reloads keys from memory as the rounds advance.
235 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
236 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
237 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
238 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
239 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
240 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
241 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
242 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
243 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
244 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
245 my $rk2q1="v28.1q";   # v28 (the rk2 register) with a .1q arrangement
246 my $rk3q1="v26.1q";   # v26 (the rk3 register) with a .1q arrangement
247 my $rk4v="v27";       # v27 (the rk4 register) without an arrangement suffix
248
249
250 #########################################################################################
251 # size_t unroll8_eor3_aes_gcm_enc_128_kernel(const unsigned char *in,
252 #                               size_t len,          /* length in bits */
253 #                               unsigned char *out,
254 #                               u64 *Xi,
255 #                               unsigned char ivec[16],
256 #                               const void *key);
257 #
258 $code.=<<___;
259 .global unroll8_eor3_aes_gcm_enc_128_kernel
260 .type   unroll8_eor3_aes_gcm_enc_128_kernel,%function
261 .align  4
262 unroll8_eor3_aes_gcm_enc_128_kernel:
263         AARCH64_VALID_CALL_TARGET
264         cbz     x1, .L128_enc_ret
265         stp     d8, d9, [sp, #-80]!
266         mov     $counter, x4
267         mov     $cc, x5
268         stp     d10, d11, [sp, #16]
269         stp     d12, d13, [sp, #32]
270         stp     d14, d15, [sp, #48]
271         mov     x5, #0xc200000000000000
272         stp     x5, xzr, [sp, #64]
273         add     $modulo_constant, sp, #64
274
275         mov     $constant_temp, #0x100000000                            @ set up counter increment
276         movi    $rctr_inc.16b, #0x0
277         mov     $rctr_inc.d[1], $constant_temp
278         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
279         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
280
281         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
282
283         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80           @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
284
285         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
286
287         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
288
289         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
290         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
291
292         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
293         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
294
295         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
296         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
297
298         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
299         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
300
301         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
302         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
303         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
304
305         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
306         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
307
308         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
309         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
310
311         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
312         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
313         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
314
315         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
316         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
317         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
318
319         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
320         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
321         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
322
323         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
324
325         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
326         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
327         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
328
329         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
330         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
331         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
332
333         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
334         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
335         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
336
337         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
338         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
339         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
340
341         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
342         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
343         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
344
345         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
346
347         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
348         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
349         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
350
351         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
352         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
353         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
354
355         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
356
357         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
358         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
359         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
360
361         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
362         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
363         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
364
365         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
366         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
367         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
368
369         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
370         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
371         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
372
373         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
374         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
375         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
376
377         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
378         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
379         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
380
381         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
382         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
383         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
384
385         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
386         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
387         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
388
389         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
390         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
391         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
392
393         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
394
395         ld1     { $acc_lb}, [$current_tag]
396         ext     $acc_lb, $acc_lb, $acc_lb, #8
397         rev64   $acc_lb, $acc_lb
398
399         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
400
401         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
402         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
403         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
404
405         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
406         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
407         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
408
409         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
410         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
411         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
412
413         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
414         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
415         ldr     $rk10q, [$cc, #160]                                     @ load rk10
416
417         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
418         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
419         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
420
421         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
422         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
423         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
424
425         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
426         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
427         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
428
429         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
430         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
431         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
432
433         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
434         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
435         b.ge    .L128_enc_tail                                          @ handle tail
436
437         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
438
439         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
440
441         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
442
443         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
444         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
445
446         eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 0 - result
447         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
448         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
449
450         eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 1 - result
451         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
452
453         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
454         eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 5 - result
455         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
456
457         eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 2 - result
458         eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 6 - result
459         eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 4 - result
460
461         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
462         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
463
464         eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 3 - result
465         eor3    $res7b, $ctr_t7b, $ctr7b,$rk10                          @ AES block 7 - result
466         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
467
468         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
469         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
470         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
471
472         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
473
474         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
475         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
476         b.ge    .L128_enc_prepretail                                    @ do prepretail
477
478 .L128_enc_main_loop:                                                    @ main loop start
479         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
480         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
481         ext     $h5.16b, $h5.16b, $h5.16b, #8
482         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
483         ext     $h6.16b, $h6.16b, $h6.16b, #8
484         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
485
486         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
487         rev64   $res0b, $res0b                                          @ GHASH block 8k
488         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
489         ext     $h7.16b, $h7.16b, $h7.16b, #8
490         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
491         ext     $h8.16b, $h8.16b, $h8.16b, #8
492
493         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
494         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
495         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
496
497         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
498         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
499         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
500         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
501
502         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
503         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
504         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
505
506         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
507
508         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
509         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
510         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
511
512         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
513         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
514         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
515
516         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
517         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
518         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
519
520         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
521         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
522         ext     $h3.16b, $h3.16b, $h3.16b, #8
523         ldr     $h4q, [$current_tag, #112]                              @ load h3l | h3h
524         ext     $h4.16b, $h4.16b, $h4.16b, #8
525         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
526
527         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
528         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
529         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
530
531         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
532         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
533         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
534
535         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
536         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
537         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
538
539         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
540         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
541         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
542
543         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
544         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
545         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
546
547         eor3    $acc_hb, $acc_hb, $t1.16b,$t2.16b                       @ GHASH block 8k+2, 8k+3 - high
548         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
549         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
550
551         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
552         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
553         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
554
555         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
556         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
557         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
558
559         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
560         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
561         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
562
563         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
564         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
565
566         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
567         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
568         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
569
570         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
571         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
572         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
573
574         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
575         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
576         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
577
578         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
579         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
580         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
581
582         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
583         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
584         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
585         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
586
587         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
588         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
589         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
590
591         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
592         ext     $h1.16b, $h1.16b, $h1.16b, #8
593         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
594         ext     $h2.16b, $h2.16b, $h2.16b, #8
595         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
596         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
597
598         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
599         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
600
601         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
602         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
603
604         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
605         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
606
607         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
608         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
609
610         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
611         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
612         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
613
614         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
615         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
616         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
617
618         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
619         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
620         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
621
622         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
623         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
624         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
625
626         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
627         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
628         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
629
630         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
631         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
632         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
633
634         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
635         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
636         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
637
638         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
639         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
640
641         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
642         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
643         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
644
645         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
646         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
647         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
648
649         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
650         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
651         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
652
653         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
654         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
655
656         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
657         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
658         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
659
660         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
661         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
662         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
663
664         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
665         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
666         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
667
668         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
669         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
670         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
671
672         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
673         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
674         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
675
676         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
677         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
678         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
679
680         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
681         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
682         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
683
684         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
685         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
686         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
687
688         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
689         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
690
691         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
692         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
693         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
694
695         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
696         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
697         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
698
699         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
700         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
701         ldr     $rk10q, [$cc, #160]                                     @ load rk10
702
703         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
704         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
705         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
706         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
707
708         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
709         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
710         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
711
712         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
713         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
714         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
715
716         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
717         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
718         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
719
720         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
721         eor3    $res4b, $ctr_t4b, $ctr4b, $rk10                         @ AES block 8k+12 - result
722         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
723
724         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
725         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
726
727         eor3    $res2b, $ctr_t2b, $ctr2b, $rk10                         @ AES block 8k+10 - result
728
729         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
730         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
731
732         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
733         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
734
735         eor3    $res7b, $ctr_t7b, $ctr7b, $rk10                         @ AES block 8k+15 - result
736         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
737         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
738
739         eor3    $res1b, $ctr_t1b, $ctr1b, $rk10                         @ AES block 8k+9 - result
740         eor3    $res3b, $ctr_t3b, $ctr3b, $rk10                         @ AES block 8k+11 - result
741         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
742
743         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
744         eor3    $res5b, $ctr_t5b, $ctr5b, $rk10                         @ AES block 8k+13 - result
745         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
746
747         eor3    $res0b, $ctr_t0b, $ctr0b, $rk10                         @ AES block 8k+8 - result
748         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
749         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
750
751         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
752         eor3    $res6b, $ctr_t6b, $ctr6b, $rk10                         @ AES block 8k+14 - result
753
754         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
755         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
756
757         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
758         b.lt    .L128_enc_main_loop
759
760 .L128_enc_prepretail:                                                   @ PREPRETAIL
        @ Section between the main loop and the tail. What the code below does:
        @  - builds counter blocks 8k+13..8k+15 in ctr5..ctr7 (rev32 of rtmp_ctr,
        @    with rtmp_ctr advanced by rctr_inc after each one);
        @  - folds the previous accumulator acc_l into res0 (PRE 0 / PRE 1), then
        @    multiplies res0..res7 by h8..h1 respectively, collecting karatsuba
        @    high / low / mid partial products in acc_h / acc_l / acc_m;
        @  - reduces the accumulated product with mod_constant (MODULO steps),
        @    leaving the folded result in acc_l;
        @  - runs AES rounds 0..9 on ctr0..ctr7, interleaved with the GHASH work.
        @ No input is loaded and no output is stored in this section. rk10 is
        @ loaded but not applied here; presumably the tail performs the final XOR.
        @ Note: h7, h6, h5, ... and the hNNk registers are overwritten by their own
        @ products, which is why they are reloaded from current_tag each time.
761         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
762         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
763         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap 64-bit halves of h7
764         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
765         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap 64-bit halves of h8
766         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
767
768         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
769         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap 64-bit halves of h5
770         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
771         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap 64-bit halves of h6
772         rev64   $res0b, $res0b                                          @ GHASH block 8k
773         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
774
775         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
776         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
777         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
778         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
779
780         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
781         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
782
783         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
784
785         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
786         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
787         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
788
789         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
790         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
791
792         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
793         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
794         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
795
796         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
797         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
798
799         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
800         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
801
802         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
803         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
804
805         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
806         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
807
808         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
809
810         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
811
812         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
813
814         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
815
816         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
817         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
818
819         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
820         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
821
822         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
823         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
824
825         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
826         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
827         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
828
829         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
830         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
831
832         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
833         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
834         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
835
836         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
837         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
838
839         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
840         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap 64-bit halves of h3
841         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
842         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap 64-bit halves of h4
843
844         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
845         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
846         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
847
848         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
849         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
850
851         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
852         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
853
854         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
855         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
856         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
857         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
858
859         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
860         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
861
862         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
863         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
864         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
865
866         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
867         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
868         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
869
870         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
871         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
872
873         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
874         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
875         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
876
877         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
878         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap 64-bit halves of h1
879         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
880         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap 64-bit halves of h2
881         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
882         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
883
884         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
885         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
886         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
887
888         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
889         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
890         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
891
892         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
893         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
894
895         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
896         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
897         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
898
899         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
900         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
901         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
902
903         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
904         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
905         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
906
907         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
908         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
909         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
910
911         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
912         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
913         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
914
915         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
916         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
917         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
918
919         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
920         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
921
922         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
923         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
924         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
925
926         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
927         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
928         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
929
930         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
931         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
932         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
933
934         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
935         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
936         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
937
938         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
939         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
940
941         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
942         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
943         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
944
945         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
946         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
947
948         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
949         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
950         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
951
952         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
953         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
954         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
955
956         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
957         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
958         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
959
960         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
961         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
962         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
963
964         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
965         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
966         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
967         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
968
969         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
970         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
971         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
972
973         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
974         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
975
976         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
977         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
978
979         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
980         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
981         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
982         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
983
984         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
985         eor3    $acc_lb, $acc_lb, $acc_hb, $acc_mb                      @ MODULO - fold into low
986         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
987
988         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
989         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
990         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
991
        @ Round 9 uses aese alone (no aesmc follows). rk10 is loaded here but
        @ is not combined with the counter blocks anywhere in this section.
992         ldr     $rk10q, [$cc, #160]                                     @ load rk10
993         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
994         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
995
996         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
997         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
998
999         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
1000         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
1001
1002         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
1003         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
1004 .L128_enc_tail:                                                         @ TAIL
1005
1006         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
1007         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
1008
1009         mov     $t1.16b, $rk10
1010         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
1011         ext     $h5.16b, $h5.16b, $h5.16b, #8
1012
1013         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
1014         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
1015         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
1016         ext     $h6.16b, $h6.16b, $h6.16b, #8
1017         ext     $h7.16b, $h7.16b, $h7.16b, #8
1018
1019         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k
1020         ext     $h8.16b, $h8.16b, $h8.16b, #8
1021         cmp     $main_end_input_ptr, #112
1022         b.gt    .L128_enc_blocks_more_than_7
1023
1024         mov     $ctr7b, $ctr6b
1025         mov     $ctr6b, $ctr5b
1026         movi    $acc_h.8b, #0
1027
1028         cmp     $main_end_input_ptr, #96
1029         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1030         mov     $ctr5b, $ctr4b
1031
1032         mov     $ctr4b, $ctr3b
1033         mov     $ctr3b, $ctr2b
1034         mov     $ctr2b, $ctr1b
1035
1036         movi    $acc_l.8b, #0
1037         movi    $acc_m.8b, #0
1038         b.gt    .L128_enc_blocks_more_than_6
1039
1040         mov     $ctr7b, $ctr6b
1041         cmp     $main_end_input_ptr, #80
1042
1043         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1044         mov     $ctr6b, $ctr5b
1045         mov     $ctr5b, $ctr4b
1046
1047         mov     $ctr4b, $ctr3b
1048         mov     $ctr3b, $ctr1b
1049         b.gt    .L128_enc_blocks_more_than_5
1050
1051         cmp     $main_end_input_ptr, #64
1052         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1053
1054         mov     $ctr7b, $ctr6b
1055         mov     $ctr6b, $ctr5b
1056
1057         mov     $ctr5b, $ctr4b
1058         mov     $ctr4b, $ctr1b
1059         b.gt    .L128_enc_blocks_more_than_4
1060
1061         mov     $ctr7b, $ctr6b
1062         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1063         mov     $ctr6b, $ctr5b
1064
1065         mov     $ctr5b, $ctr1b
1066         cmp     $main_end_input_ptr, #48
1067         b.gt    .L128_enc_blocks_more_than_3
1068
1069         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1070         mov     $ctr7b, $ctr6b
1071         mov     $ctr6b, $ctr1b
1072
1073         cmp     $main_end_input_ptr, #32
1074         ldr     $h34kq, [$current_tag, #96]                                     @ load h4k | h3k
1075         b.gt    .L128_enc_blocks_more_than_2
1076
1077         cmp     $main_end_input_ptr, #16
1078
1079         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1080         mov     $ctr7b, $ctr1b
1081         b.gt    .L128_enc_blocks_more_than_1
1082
1083         ldr     $h12kq, [$current_tag, #48]                                     @ load h2k | h1k
1084         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
1085         b        .L128_enc_blocks_less_than_1
1086 .L128_enc_blocks_more_than_7:                                           @ blocks left >  7
1087         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
1088
1089         rev64   $res0b, $res1b                                          @ GHASH final-7 block
1090         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
1091
1092         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1093
1094         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
1095
1096         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
1097
1098         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
1099
1100         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
1101         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1102
1103         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
1104
1105         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
1106         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
1107 .L128_enc_blocks_more_than_6:                                           @ blocks left >  6
1108
1109         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
1110
1111         rev64   $res0b, $res1b                                          @ GHASH final-6 block
1112         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
1113
1114         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1115
1116         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
1117
1118         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
1119         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
1120
1121         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
1122         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1123
1124         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
1125         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
1126
1127         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
1128
1129         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
1130         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
1131 .L128_enc_blocks_more_than_5:                                           @ blocks left >  5
1132
1133         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
1134
1135         rev64   $res0b, $res1b                                          @ GHASH final-5 block
1136
1137         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1138
1139         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
1140         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
1141         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
1142
1143         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
1144
1145         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
1146
1147         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
1148
1149         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
1150         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
1151         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1152
1153         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
1154         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
1155
1156         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
1157 .L128_enc_blocks_more_than_4:                                           @ blocks left >  4
1158
1159         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
1160
1161         rev64   $res0b, $res1b                                          @ GHASH final-4 block
1162
1163         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
1164
1165         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1166
1167         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
1168         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1169         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
1170
1171         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
1172
1173         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
1174
1175         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
1176         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
1177
1178         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
1179
1180         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
1181         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
1182 .L128_enc_blocks_more_than_3:                                           @ blocks left >  3
1183
1184         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
1185
1186         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1187         ext     $h4.16b, $h4.16b, $h4.16b, #8
1188
1189         rev64   $res0b, $res1b                                          @ GHASH final-3 block
1190
1191         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1192         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1193
1194         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
1195         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1196         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
1197
1198         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
1199
1200         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
1201
1202         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
1203         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
1204
1205         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
1206
1207         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
1208         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
1209
1210         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
1211         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
1212 .L128_enc_blocks_more_than_2:                                           @ blocks left >  2
1213
1214         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
1215
1216         rev64   $res0b, $res1b                                          @ GHASH final-2 block
1217
1218         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1219
1220         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
1221
1222         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
1223         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1224         ext     $h3.16b, $h3.16b, $h3.16b, #8
1225         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1226
1227         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
1228         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
1229
1230         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
1231
1232         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
1233         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
1234
1235         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
1236
1237         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
1238         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
1239 .L128_enc_blocks_more_than_1:                                           @ blocks left >  1
1240
1241         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
1242
1243         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1244         ext     $h2.16b, $h2.16b, $h2.16b, #8
1245         rev64   $res0b, $res1b                                          @ GHASH final-1 block
1246         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
1247
1248         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1249
1250         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
1251         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
1252         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
1253
1254         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
1255
1256         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
1257
1258         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1259
1260         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
1261
1262         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
1263         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
1264
1265         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
1266
1267         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
1268         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
1269 .L128_enc_blocks_less_than_1:                                           @ blocks left <= 1
1270
1271         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
1272         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
1273         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
1274
1275         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
1276
1277         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
1278
1279         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
1280         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
1281         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
1282
1283         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
1284         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
1285         cmp     $bit_length, #64
1286
1287         csel    $temp2_x, $temp1_x, $temp0_x, lt
1288         csel    $temp3_x, $temp0_x, xzr, lt
1289
1290         mov     $ctr0.d[1], $temp3_x
1291         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
1292
1293         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
1294
1295         rev64   $res0b, $res1b                                          @ GHASH final block
1296
1297         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
1298         st1     { $res1b}, [$output_ptr]                                @ store all 16B
1299
1300         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
1301
1302         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
1303
1304         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
1305         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1306         ext     $h1.16b, $h1.16b, $h1.16b, #8
1307
1308         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
1309
1310         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
1311         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
1312         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
1313
1314         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
1315
1316         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
1317
1318         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
1319
1320         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
1321         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
1322
1323         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
1324
1325         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
1326
1327         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
1328         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
1329
1330         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
1331         ext     $acc_lb, $acc_lb, $acc_lb, #8
1332         rev64   $acc_lb, $acc_lb
1333         st1     { $acc_l.16b }, [$current_tag]
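1334         lsr     x0, $bit_length, #3                                     @ return sizes - NOTE(review): bit_length was overwritten above (now the last-block mask shift), so this is not the input length in bytes; confirm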
1335
1336         ldp     d10, d11, [sp, #16]
1337         ldp     d12, d13, [sp, #32]
1338         ldp     d14, d15, [sp, #48]
1339         ldp     d8, d9, [sp], #80
1340         ret
1341
1342 .L128_enc_ret:
1343         mov w0, #0x0
1344         ret
1345 .size unroll8_eor3_aes_gcm_enc_128_kernel,.-unroll8_eor3_aes_gcm_enc_128_kernel
1346 ___
1347
1348 #########################################################################################
1349 # size_t unroll8_eor3_aes_gcm_dec_128_kernel(const unsigned char *in,
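1350 #                               size_t len,      /* NOTE(review): looks like a bit count - body derives byte_len via lsr #3; confirm against caller */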
1351 #                               unsigned char *out,
1352 #                               u64 *Xi,
1353 #                               unsigned char ivec[16],
1354 #                               const void *key);
1355 #
1356 $code.=<<___;
1357 .global unroll8_eor3_aes_gcm_dec_128_kernel
1358 .type   unroll8_eor3_aes_gcm_dec_128_kernel,%function
1359 .align  4
1360 unroll8_eor3_aes_gcm_dec_128_kernel:
1361         AARCH64_VALID_CALL_TARGET
1362         cbz     x1, .L128_dec_ret
1363         stp     d8, d9, [sp, #-80]!
1364         mov     $counter, x4
1365         mov     $cc, x5
1366         stp     d10, d11, [sp, #16]
1367         stp     d12, d13, [sp, #32]
1368         stp     d14, d15, [sp, #48]
1369         mov     x5, #0xc200000000000000
1370         stp     x5, xzr, [sp, #64]
1371         add     $modulo_constant, sp, #64
1372
1373         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
1374         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
1375
1376         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1377         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
1378
1379         mov     $constant_temp, #0x100000000                            @ set up counter increment
1380         movi    $rctr_inc.16b, #0x0
1381         mov     $rctr_inc.d[1], $constant_temp
1382         ld1     { $acc_lb}, [$current_tag]
1383           ext   $acc_lb, $acc_lb, $acc_lb, #8
1384         rev64   $acc_lb, $acc_lb
1385
1386         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
1387
1388         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
1389
1390         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
1391
1392         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
1393         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
1394
1395         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1396
1397         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
1398         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
1399         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
1400
1401         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
1402         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
1403
1404         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
1405         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
1406
1407         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
1408         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
1409
1410         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
1411         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
1412
1413         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
1414
1415         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
1416         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
1417         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
1418
1419         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
1420         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
1421
1422         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
1423
1424         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
1425         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
1426
1427         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
1428
1429         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1430
1431         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
1432         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
1433
1434         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
1435         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
1436
1437         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
1438         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
1439         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
1440
1441         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
1442         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
1443         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
1444
1445         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
1446         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
1447         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
1448
1449         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
1450         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
1451
1452         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1453         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
1454
1455         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
1456         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
1457
1458         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
1459         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
1460
1461         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
1462         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
1463         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
1464
1465         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
1466         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
1467         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
1468
1469         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
1470         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
1471         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
1472
1473         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
1474         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
1475         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
1476
1477         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
1478         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
1479
1480         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
1481         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
1482
1483         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
1484
1485         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
1486         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
1487         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
1488
1489         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
1490         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
1491         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
1492
1493         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
1494         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
1495         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
1496
1497         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
1498         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
1499         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
1500
1501         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
1502         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
1503         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
1504
1505         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
1506         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
1507         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
1508
1509         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1510         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
1511
1512         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
1513         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
1514
1515         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
1516         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
1517         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
1518
1519         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
1520         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
1521         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
1522
1523         aese    $ctr0b, $rk9                                            @ AES block 0 - round 9
1524         aese    $ctr1b, $rk9                                            @ AES block 1 - round 9
1525         aese    $ctr6b, $rk9                                            @ AES block 6 - round 9
1526
1527         ldr     $rk10q, [$cc, #160]                                     @ load rk10
1528         aese    $ctr4b, $rk9                                            @ AES block 4 - round 9
1529         aese    $ctr3b, $rk9                                            @ AES block 3 - round 9
1530
1531         aese    $ctr2b, $rk9                                            @ AES block 2 - round 9
1532         aese    $ctr5b, $rk9                                            @ AES block 5 - round 9
1533         aese    $ctr7b, $rk9                                            @ AES block 7 - round 9
1534
1535         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
1536         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
1537         b.ge    .L128_dec_tail                                          @ handle tail
1538
1539         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
1540
1541         eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 0 - result
1542         eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 1 - result
1543         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
1544
1545         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
1546         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
1547         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
1548
1549         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
1550
1551         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
1552         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
1553         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
1554
1555         eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 3 - result
1556         eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 2 - result
1557         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
1558
1559         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
1560         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
1561
1562         eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 6 - result
1563
1564         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
1565         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
1566
1567         eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 4 - result
1568         eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 5 - result
1569         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
1570
1571         eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 7 - result
1572         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
1573         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
1574
1575         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
1576         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
1577         b.ge    .L128_dec_prepretail                                    @ do prepretail
1578
1579 .L128_dec_main_loop:                                                    @ main loop start
1580         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
1581         ext     $h7.16b, $h7.16b, $h7.16b, #8
1582         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
1583         ext     $h8.16b, $h8.16b, $h8.16b, #8
1584
1585         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
1586         rev64   $res0b, $res0b                                          @ GHASH block 8k
1587         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
1588
1589         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
1590         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
1591         ext     $h5.16b, $h5.16b, $h5.16b, #8
1592         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
1593         ext     $h6.16b, $h6.16b, $h6.16b, #8
1594
1595         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
1596         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
1597         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
1598
1599         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
1600         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
1601         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1602
1603         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
1604         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
1605         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
1606         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
1607
1608         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
1609         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
1610         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
1611
1612         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
1613         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
1614         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
1615
1616         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
1617         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
1618         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
1619
1620         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
1621         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
1622         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
1623
1624         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
1625         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
1626         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
1627
1628         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
1629         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
1630         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
1631
1632         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
1633         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
1634         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
1635
1636         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
1637         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
1638         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
1639
1640         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1641         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
1642         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
1643
1644         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
1645         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
1646         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
1647
1648         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1649         ext     $h3.16b, $h3.16b, $h3.16b, #8
1650         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1651         ext     $h4.16b, $h4.16b, $h4.16b, #8
1652         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
1653         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
1654
1655         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
1656         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
1657         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
1658
1659         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
1660         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
1661         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
1662
1663         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
1664         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
1665         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
1666
1667         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
1668         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
1669         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1670         ext     $h1.16b, $h1.16b, $h1.16b, #8
1671         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1672         ext     $h2.16b, $h2.16b, $h2.16b, #8
1673
1674         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
1675         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
1676         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
1677
1678         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
1679         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
1680         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
1681
1682         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
1683         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
1684         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
1685
1686         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
1687         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
1688         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
1689
1690         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1691         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
1692         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
1693
1694         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1695         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1696         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
1697         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
1698
1699         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
1700         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
1701         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
1702
1703         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
1704         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
1705         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
1706
1707         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
1708         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
1709         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
1710
1711         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
1712         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
1713         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
1714
1715         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
1716         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
1717         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
1718
1719         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
1720         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
1721         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
1722
1723         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
1724         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
1725         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
1726
1727         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
1728         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
1729         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
1730
1731         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
1732         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
1733         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
1734
1735         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
1736         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
1737         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
1738
1739         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
1740         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
1741         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
1742
1743         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
1744         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
1745         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
1746
1747         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
1748         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
1749         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
1750
1751         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
1752         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
1753         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
1754
1755         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
1756         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
1757         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
1758
1759         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
1760         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
1761         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
1762
1763         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
1764         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
1765         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
1766
1767         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
1768         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
1769         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
1770
1771         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
1772         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
1773         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
1774
1775         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
1776         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
1777         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
1778
1779         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
1780         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
1781         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
1782
1783         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
1784         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
1785         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
1786
1787         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
1788         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
1789         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
1790
1791         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
1792         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
1793         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
1794
1795         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
1796         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
1797         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
1798
1799         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
1800         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
1801         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
1802
1803         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
1804         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
1805         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
1806
1807         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
1808         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
1809         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
1810
1811         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
1812         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
1813         ldr     $rk10q, [$cc, #160]                                     @ load rk10
1814
1815         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
1816         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
1817         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
1818
1819         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
1820         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
1821         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
1822
1823         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
1824         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
1825
1826         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
1827         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
1828         eor3    $ctr1b, $res1b, $ctr1b, $rk10                           @ AES block 8k+9 - result
1829
1830         eor3    $ctr0b, $res0b, $ctr0b, $rk10                           @ AES block 8k+8 - result
1831         eor3    $ctr7b, $res7b, $ctr7b, $rk10                           @ AES block 8k+15 - result
1832         eor3    $ctr6b, $res6b, $ctr6b, $rk10                           @ AES block 8k+14 - result
1833
1834         eor3    $ctr2b, $res2b, $ctr2b, $rk10                           @ AES block 8k+10 - result
1835         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
1836         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
1837
1838         eor3    $ctr4b, $res4b, $ctr4b, $rk10                           @ AES block 8k+12 - result
1839         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
1840         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
1841
1842         eor3    $ctr3b, $res3b, $ctr3b, $rk10                           @ AES block 8k+11 - result
1843         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
1844         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
1845
1846         eor3    $ctr5b, $res5b, $ctr5b, $rk10                           @ AES block 8k+13 - result
1847         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
1848
1849         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
1850         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
1851         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
1852
1853         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
1854         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
1855         b.lt    .L128_dec_main_loop
1856
1857 .L128_dec_prepretail:                                                   @ PREPRETAIL
1858         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
1859         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
1860         rev64   $res0b, $res0b                                          @ GHASH block 8k
1861
1862         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
1863         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
1864         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
1865
1866         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
1867         ext     $h7.16b, $h7.16b, $h7.16b, #8
1868         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
1869         ext     $h8.16b, $h8.16b, $h8.16b, #8
1870         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
1871         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
1872
1873         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
1874         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
1875         ext     $h5.16b, $h5.16b, $h5.16b, #8
1876         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
1877         ext     $h6.16b, $h6.16b, $h6.16b, #8
1878         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
1879
1880         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
1881
1882         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
1883
1884         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
1885         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
1886         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
1887         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
1888
1889         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
1890         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
1891         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
1892
1893         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
1894         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
1895         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
1896
1897         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
1898         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
1899         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
1900
1901         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
1902         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
1903         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
1904
1905         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
1906         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
1907         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
1908
1909         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
1910         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
1911         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
1912
1913         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
1914         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
1915         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
1916
1917         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
1918         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
1919         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
1920
1921         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
1922         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
1923         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
1924
1925         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
1926         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
1927         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
1928
1929         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
1930         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
1931         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
1932
1933         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
1934         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
1935         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
1936
1937         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
1938         ext     $h3.16b, $h3.16b, $h3.16b, #8
1939         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
1940         ext     $h4.16b, $h4.16b, $h4.16b, #8
1941         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
1942         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
1943
1944         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
1945         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
1946         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
1947
1948         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
1949         ext     $h1.16b, $h1.16b, $h1.16b, #8
1950         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
1951         ext     $h2.16b, $h2.16b, $h2.16b, #8
1952         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
1953
1954         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
1955         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
1956         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
1957
1958         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
1959         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
1960         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
1961
1962         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
1963         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
1964         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
1965
1966         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
1967         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
1968         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
1969
1970         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
1971         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
1972         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
1973
1974         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
1975         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
1976         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
1977         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
1978
1979         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
1980         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
1981         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
1982
1983         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
1984         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
1985         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
1986
1987         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
1988         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
1989         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
1990
1991         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
1992         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
1993         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
1994
1995         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
1996         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
1997         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
1998
1999         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
2000         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
2001         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
2002
2003         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
2004         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
2005         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
2006
2007         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
2008         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
2009         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
2010
2011         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
2012         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
2013         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
2014
2015         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2016         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
2017         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
2018
2019         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2020         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
2021         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
2022
2023         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
2024         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
2025         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
2026
2027         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
2028         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
2029         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
2030
2031         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
2032         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
2033         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
2034
2035         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
2036         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
2037         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
2038
2039         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
2040         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
2041         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
2042
2043         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
2044         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
2045         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
2046
2047         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
2048         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
2049         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2050
2051         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2052         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
2053         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
2054
2055         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
2056         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
2057         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
2058
2059         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
2060         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
2061         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
2062
2063         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
2064         ldr     $rk10q, [$cc, #160]                                     @ load rk10
2065
2066         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
2067         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
2068
2069         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
2070         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
2071         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
2072
2073         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
2074         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
2075         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
2076
2077         aese    $ctr6b, $rk9                                            @ AES block 8k+14 - round 9
2078         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
2079         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
2080
2081         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
2082         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
2083         aese    $ctr2b, $rk9                                            @ AES block 8k+10 - round 9
2084
2085         aese    $ctr3b, $rk9                                            @ AES block 8k+11 - round 9
2086         aese    $ctr5b, $rk9                                            @ AES block 8k+13 - round 9
2087         aese    $ctr0b, $rk9                                            @ AES block 8k+8 - round 9
2088
2089         aese    $ctr4b, $rk9                                            @ AES block 8k+12 - round 9
2090         aese    $ctr1b, $rk9                                            @ AES block 8k+9 - round 9
2091         aese    $ctr7b, $rk9                                            @ AES block 8k+15 - round 9
2092
2093 .L128_dec_tail:                                                         @ TAIL - dispatch on the number of bytes left
2094
2095         mov     $t1.16b, $rk10                                          @ rk10 is the last operand of every AES result eor3 below
2096         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
2097
2098         cmp     $main_end_input_ptr, #112                               @ flags are consumed by the b.gt after the loads
2099
2100         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k and h8l | h8h
2101         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap the 64-bit halves of h8
2102         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
2103
2104         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h and h6k | h5k
2105         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap the 64-bit halves of h5
2106         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
2107
2108         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h and h7l | h7h
2109         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap the 64-bit halves of h6
2110         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap the 64-bit halves of h7
2111
2112         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
2113         b.gt    .L128_dec_blocks_more_than_7                            @ more than 112 bytes left
2114
2115         cmp     $main_end_input_ptr, #96
2116         mov     $ctr7b, $ctr6b                                          @ at most 7 blocks: move ctr1-ctr6 up one register
2117         movi    $acc_l.8b, #0                                           @ more_than_7 is skipped, so clear the GHASH accumulators here
2118
2119         movi    $acc_h.8b, #0
2120         mov     $ctr6b, $ctr5b
2121         mov     $ctr5b, $ctr4b
2122
2123         mov     $ctr4b, $ctr3b
2124         mov     $ctr3b, $ctr2b
2125         mov     $ctr2b, $ctr1b                                          @ more_than_6 reads the second block from ctr2
2126
2127         movi    $acc_m.8b, #0
2128         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2129         b.gt    .L128_dec_blocks_more_than_6                            @ 97 to 112 bytes left
2130
2131         cmp     $main_end_input_ptr, #80
2132         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2133
2134         mov     $ctr7b, $ctr6b
2135         mov     $ctr6b, $ctr5b
2136         mov     $ctr5b, $ctr4b
2137
2138         mov     $ctr4b, $ctr3b
2139         mov     $ctr3b, $ctr1b                                          @ more_than_5 reads the second block from ctr3
2140         b.gt    .L128_dec_blocks_more_than_5                            @ 81 to 96 bytes left
2141
2142         cmp     $main_end_input_ptr, #64
2143
2144         mov     $ctr7b, $ctr6b
2145         mov     $ctr6b, $ctr5b
2146         mov     $ctr5b, $ctr4b
2147
2148         mov     $ctr4b, $ctr1b                                          @ more_than_4 reads the second block from ctr4
2149         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2150         b.gt    .L128_dec_blocks_more_than_4                            @ 65 to 80 bytes left
2151
2152         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2153         mov     $ctr7b, $ctr6b
2154         mov     $ctr6b, $ctr5b
2155
2156         mov     $ctr5b, $ctr1b                                          @ more_than_3 reads the second block from ctr5
2157         cmp     $main_end_input_ptr, #48
2158         b.gt    .L128_dec_blocks_more_than_3                            @ 49 to 64 bytes left
2159
2160         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2161         mov     $ctr7b, $ctr6b
2162         cmp     $main_end_input_ptr, #32
2163
2164         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2165         mov     $ctr6b, $ctr1b                                          @ more_than_2 reads the second block from ctr6
2166         b.gt    .L128_dec_blocks_more_than_2                            @ 33 to 48 bytes left
2167
2168         cmp     $main_end_input_ptr, #16
2169
2170         mov     $ctr7b, $ctr1b                                          @ more_than_1 reads the second block from ctr7
2171         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2172         b.gt    .L128_dec_blocks_more_than_1                            @ 17 to 32 bytes left
2173
2174         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ wind the counter back by one increment
2175         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2176         b        .L128_dec_blocks_less_than_1                           @ 16 bytes or fewer left
2177 .L128_dec_blocks_more_than_7:                                           @ blocks left >  7 - hash and store final-7 block, decrypt final-6 block
2178         rev64   $res0b, $res1b                                          @ GHASH final-7 block
2179
2180         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2181
2182         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid (upper half of h78k as multiplier)
2183
2184         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low (overwrites acc_l)
2185         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid (upper half of the block)
2186
2187         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2188         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
2189
2190         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid (upper half xor lower half)
2191
2192         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high (overwrites acc_h)
2193         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block - store result
2194         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
2195
2196         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid (overwrites acc_m)
2197 .L128_dec_blocks_more_than_6:                                           @ blocks left >  6 - hash and store final-6 block, decrypt final-5 block
2198
2199         rev64   $res0b, $res1b                                          @ GHASH final-6 block
2200
2201         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2202
2203         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid (upper half of the block)
2204
2205         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid (upper half xor lower half)
2206
2207         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
2208         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
2209         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2210
2211         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid (lower half of h78k as multiplier)
2212         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
2213         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
2214
2215         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
2216         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
2217
2218         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
2219         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
2220 .L128_dec_blocks_more_than_5:                                           @ blocks left >  5 - hash and store final-5 block, decrypt final-4 block
2221
2222         rev64   $res0b, $res1b                                          @ GHASH final-5 block
2223
2224         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
2225         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
2226
2227         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2228
2229         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid (upper half of the block)
2230
2231         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
2232
2233         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid (upper half xor lower half)
2234
2235         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid (copy to upper lane for pmull2)
2236         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
2237         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2238
2239         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid (upper half of h56k as multiplier)
2240         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
2241         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
2242
2243         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
2244         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
2245 .L128_dec_blocks_more_than_4:                                           @ blocks left >  4 - hash and store final-4 block, decrypt final-3 block
2246
2247         rev64   $res0b, $res1b                                          @ GHASH final-4 block
2248
2249         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2250         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
2251
2252         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid (upper half of the block)
2253         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2254         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
2255
2256         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
2257
2258         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
2259
2260         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
2261         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid (upper half xor lower half)
2262
2263         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
2264         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
2265
2266         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid (lower half of h56k as multiplier)
2267
2268         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
2269 .L128_dec_blocks_more_than_3:                                           @ blocks left >  3 - hash and store final-3 block, decrypt final-2 block
2270
2271         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
2272         rev64   $res0b, $res1b                                          @ GHASH final-3 block
2273
2274         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2275
2276         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid (upper half of the block)
2277
2278         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
2279         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap the 64-bit halves of h4
2280         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2281
2282         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid (upper half xor lower half)
2283
2284         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
2285
2286         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid (copy to upper lane for pmull2)
2287         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
2288         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
2289
2290         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2291         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
2292         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
2293
2294         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid (upper half of h34k as multiplier)
2295
2296         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
2297         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
2298 .L128_dec_blocks_more_than_2:                                           @ blocks left >  2 - hash and store final-2 block, decrypt final-1 block
2299
2300         rev64   $res0b, $res1b                                          @ GHASH final-2 block
2301
2302         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
2303
2304         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2305         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
2306         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap the 64-bit halves of h3
2307         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2308
2309         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid (upper half of the block)
2310
2311         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid (upper half xor lower half)
2312
2313         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
2314
2315         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
2316         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid (lower half of h34k as multiplier)
2317         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
2318
2319         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
2320
2321         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
2322
2323         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
2324         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
2325 .L128_dec_blocks_more_than_1:                                           @ blocks left >  1 - hash and store final-1 block, decrypt final block
2326
2327         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
2328         rev64   $res0b, $res1b                                          @ GHASH final-1 block
2329
2330         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
2331         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap the 64-bit halves of h2
2332
2333         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2334
2335         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
2336
2337         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid (upper half of the block)
2338
2339         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
2340         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
2341
2342         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid (upper half xor lower half)
2343         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
2344         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2345
2346         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid (copy to upper lane for pmull2)
2347         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
2348
2349         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
2350
2351         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid (upper half of h12k as multiplier)
2352
2353         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
2354
2355         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
2356 .L128_dec_blocks_less_than_1:                                           @ blocks left <= 1 - mask, hash and store the last block, reduce and store the tag
2357
2358         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
2359
2360         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
2361
2362         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
2363
2364         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
2365         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
2366
2367         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
2368         cmp     $bit_length, #64                                        @ does the unused part reach into the low 64b ?
2369         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
2370
2371         csel    $temp2_x, $temp1_x, $temp0_x, lt                        @ low 64b of mask - all ones when fewer than 64 bits are unused
2372         csel    $temp3_x, $temp0_x, xzr, lt                             @ high 64b of mask - zero when 64 or more bits are unused
2373
2374         mov     $ctr0.d[1], $temp3_x                                    @ upper lane of the mask
2375         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
2376
2377         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
2378         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap the 64-bit halves of h1
2379         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
2380
2381         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
2382
2383         rev64   $res0b, $res1b                                          @ GHASH final block
2384
2385         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
2386
2387         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
2388         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid (upper half of the block)
2389
2390         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
2391         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid (upper half xor lower half)
2392
2393         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
2394
2395         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid (lower half of h12k as multiplier)
2396         st1     { $res4b}, [$output_ptr]                                @ store all 16B
2397
2398         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
2399
2400         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
2401         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2402
2403         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
2404
2405         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
2406
2407         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2408         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
2409
2410         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
2411
2412         eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
2413
2414         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
2415         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
2416
2417         eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
2418         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ swap the 64-bit halves of the reduced tag
2419         rev64   $acc_lb, $acc_lb                                        @ byte-reverse each 64-bit half of the tag
2420         st1     { $acc_l.16b }, [$current_tag]                          @ store the updated tag
2421         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b                            @ byte-reverse each 32-bit counter word
2422
2423         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
2424
2425         lsr     x0, $bit_length, #3                                     @ NOTE(review): bit_length was overwritten above with the unused bit count, so x0 is not the input length - confirm callers ignore it
2426
2427         ldp     d10, d11, [sp, #16]                                     @ reload d10-d15 from the stack frame (presumably saved by the prologue, not in view)
2428         ldp     d12, d13, [sp, #32]
2429         ldp     d14, d15, [sp, #48]
2430         ldp     d8, d9, [sp], #80                                       @ reload d8, d9 and release the 80-byte frame
2431         ret
2432 .L128_dec_ret:
2433         mov w0, #0x0
2434         ret
2435 .size unroll8_eor3_aes_gcm_dec_128_kernel,.-unroll8_eor3_aes_gcm_dec_128_kernel
2436 ___
2437 }
2438
2439 {
# Register aliases used by the kernel(s) generated in this scope.  Many names
# deliberately map to the same physical register; the comments below record
# the overlaps so that live ranges can be checked when editing the assembly.
#
# General-purpose scratch registers.  Note that x4 and x5 also carry incoming
# arguments: the kernel prologue copies them away (mov $counter, x4 /
# mov $cc, x5) before x5 is reused for the modulo constant and x4 is written
# as $end_input_ptr.
2440 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
2441 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0..v7 hold the eight counter blocks, v8..v15 the eight result blocks.
# The same registers are named four ways: with a .16b arrangement (…b),
# bare (for use with other arrangements such as .2d/.1d), and as d/q views.
2442 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
2443 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
2444 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
2445 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
2446 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
2447
# ctr_tN aliases v8..v15, i.e. the same registers as resN: input blocks are
# loaded into ctr_tN and the eor3 that produces resN overwrites them in place.
# The kernel prologue spills d8..d15 (low halves of these registers) to the
# stack before they are used.
2448 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
2449 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
2450 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
2451
# GHASH accumulators: high, mid and low partial products (v17, v18, v19).
2452 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
2453 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
2454
# GHASH multiplier operands.  The h1..h4 set and the h5..h8 set share
# v20..v25 (h1/h5=v20, h12k/h56k=v21, h2/h6=v22, h3/h7=v23, h34k/h78k=v24,
# h4/h8=v25), so only one set is live at a time; the main loop reloads each
# value from memory relative to $current_tag before it is needed.
2455 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
2456 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
2457 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
2458 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
2459
# Temporaries.  Only v16 ($t0) and v29 ($t1) are dedicated scratch registers;
# the remaining tN names borrow result registers or alias $t0/$t1.
2460 my $t0="v16";
2461 my $t0d="d16";
2462
2463 my $t1="v29";
2464 my $t2=$res1;
# $t3 (and $t12 below) are the same register as $t1 (v29).
2465 my $t3=$t1;
2466
2467 my $t4=$res0;
2468 my $t5=$res2;
# $t6 is the same register as $t0 (v16).
2469 my $t6=$t0;
2470
2471 my $t7=$res3;
2472 my $t8=$res4;
2473 my $t9=$res5;
2474
2475 my $t10=$res6;
# NOTE: v21 is also the register behind $h12k/$h56k.
2476 my $t11="v21";
2477 my $t12=$t1;
2478
# Counter state: $rtmp_ctr is the byte-reversed running counter (produced and
# consumed through rev32), $rctr_inc the per-block increment added to it.
2479 my $rtmp_ctr="v30";
2480 my $rtmp_ctrq="q30";
2481 my $rctr_inc="v31";
2482 my $rctr_incd="d31";
2483
# The modulo constant is loaded into the $t0 register (v16).
2484 my $mod_constantd=$t0d;
2485 my $mod_constant=$t0;
2486
# Round keys rotate through just three registers: rkN lives in
# v(26 + N % 3), so each key is (re)loaded from the key schedule at $cc
# shortly before the rounds that use it.
2487 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
2488 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
2489 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
2490 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
2491 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
2492 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
2493 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
2494 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
2495 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
2496 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Extra views of the round-key registers: .1q arrangements of v28 and v26
# (the registers behind $rk2 and $rk3) and the bare name of v27 ($rk4).
2497 my $rk2q1="v28.1q";
2498 my $rk3q1="v26.1q";
2499 my $rk4v="v27";
2500
2501 #########################################################################################
2502 # size_t unroll8_eor3_aes_gcm_enc_192_kernel(const unsigned char *in,
2503 #                               size_t len,             /* bit length */
2504 #                               unsigned char *out,
2505 #                               u64 *Xi,
2506 #                               unsigned char ivec[16],
2507 #                               const void *key);
2508 #
2509 $code.=<<___;
2510 .global unroll8_eor3_aes_gcm_enc_192_kernel
2511 .type   unroll8_eor3_aes_gcm_enc_192_kernel,%function
2512 .align  4
2513 unroll8_eor3_aes_gcm_enc_192_kernel:
2514         AARCH64_VALID_CALL_TARGET
2515         cbz     x1, .L192_enc_ret
2516         stp     d8, d9, [sp, #-80]!
2517         mov     $counter, x4
2518         mov     $cc, x5
2519         stp     d10, d11, [sp, #16]
2520         stp     d12, d13, [sp, #32]
2521         stp     d14, d15, [sp, #48]
2522         mov     x5, #0xc200000000000000
2523         stp     x5, xzr, [sp, #64]
2524         add     $modulo_constant, sp, #64
2525
2526         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
2527         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
2528
2529         mov     $constant_temp, #0x100000000                            @ set up counter increment
2530         movi    $rctr_inc.16b, #0x0
2531         mov     $rctr_inc.d[1], $constant_temp
2532
2533         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
2534
2535         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
2536
2537         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
2538         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
2539
2540         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
2541         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
2542
2543         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
2544         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
2545
2546         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
2547         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
2548         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
2549
2550         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2551
2552         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
2553         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
2554         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
2555
2556         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2557
2558         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
2559         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
2560
2561         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
2562
2563         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
2564         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
2565         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
2566
2567         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
2568         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
2569         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
2570
2571         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
2572         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
2573         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
2574
2575         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
2576         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
2577
2578         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
2579         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
2580         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
2581
2582         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
2583         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
2584         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
2585
2586         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
2587         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
2588         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
2589
2590         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
2591         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
2592
2593         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
2594         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
2595         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
2596
2597         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
2598         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
2599
2600         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
2601         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
2602         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
2603
2604         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
2605
2606         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
2607
2608         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
2609
2610         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
2611         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
2612         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
2613
2614         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
2615         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
2616         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
2617
2618         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
2619         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
2620         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
2621
2622         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
2623         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2624         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
2625
2626         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
2627         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
2628         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
2629
2630         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
2631         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
2632         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
2633
2634         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
2635
2636         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
2637         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
2638         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
2639
2640         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
2641         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
2642         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
2643
2644         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
2645         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
2646         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2647
2648         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
2649         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
2650
2651         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
2652         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
2653
2654         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
2655         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
2656
2657         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
2658         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
2659
2660         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
2661         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
2662
2663         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
2664         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
2665         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
2666
2667         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
2668         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
2669         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
2670
2671         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
2672         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
2673         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
2674
2675         ld1     { $acc_lb}, [$current_tag]
2676         ext     $acc_lb, $acc_lb, $acc_lb, #8
2677         rev64   $acc_lb, $acc_lb
2678         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
2679
2680         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
2681         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
2682
2683         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
2684         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
2685
2686         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
2687         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
2688
2689         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 14 - round 10
2690         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
2691         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 11 - round 10
2692
2693         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 9 - round 10
2694         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 13 - round 10
2695         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 12 - round 10
2696
2697         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8 - round 10
2698         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 10 - round 10
2699         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 15 - round 10
2700
2701         aese    $ctr6b, $rk11                                           @ AES block 14 - round 11
2702         aese    $ctr3b, $rk11                                           @ AES block 11 - round 11
2703
2704         aese    $ctr4b, $rk11                                           @ AES block 12 - round 11
2705         aese    $ctr7b, $rk11                                           @ AES block 15 - round 11
2706         ldr     $rk12q, [$cc, #192]                                     @ load rk12
2707
2708         aese    $ctr1b, $rk11                                           @ AES block 9 - round 11
2709         aese    $ctr5b, $rk11                                           @ AES block 13 - round 11
2710
2711         aese    $ctr2b, $rk11                                           @ AES block 10 - round 11
2712         aese    $ctr0b, $rk11                                           @ AES block 8 - round 11
2713         b.ge    .L192_enc_tail                                          @ handle tail
2714
2715         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
2716
2717         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
2718
2719         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
2720
2721         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
2722
2723         eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 0 - result
2724         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
2725         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
2726
2727         eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 3 - result
2728         eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 1 - result
2729
2730         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
2731         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
2732         eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 4 - result
2733
2734         eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 5 - result
2735         eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 7 - result
2736         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
2737
2738         eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 2 - result
2739         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
2740         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
2741
2742         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
2743         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
2744
2745         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
2746         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
2747         eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 6 - result
2748
2749         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
2750
2751         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
2752         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
2753         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
2754
2755         b.ge    .L192_enc_prepretail                                    @ do prepretail
2756
2757 .L192_enc_main_loop:                                                    @ main loop start
2758         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
2759         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
2760         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
2761
2762         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
2763         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
2764         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
2765         ext     $h7.16b, $h7.16b, $h7.16b, #8
2766         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
2767         ext     $h8.16b, $h8.16b, $h8.16b, #8
2768
2769         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
2770         rev64   $res0b, $res0b                                          @ GHASH block 8k
2771         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
2772         ext     $h5.16b, $h5.16b, $h5.16b, #8
2773         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
2774         ext     $h6.16b, $h6.16b, $h6.16b, #8
2775
2776         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
2777         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
2778         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
2779
2780         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
2781         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
2782         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
2783
2784         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
2785         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
2786         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
2787
2788         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
2789         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
2790         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
2791
2792         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
2793         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
2794         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
2795
2796         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
2797         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
2798         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
2799
2800         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
2801         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
2802         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
2803
2804         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
2805         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
2806         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
2807         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
2808
2809         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
2810         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
2811         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
2812
2813         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
2814         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
2815         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
2816
2817         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
2818         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
2819         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
2820
2821         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
2822         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
2823         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
2824
2825         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
2826         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
2827         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
2828
2829         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
2830         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
2831         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
2832
2833         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
2834         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
2835         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
2836
2837         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
2838         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
2839         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
2840
2841         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
2842         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
2843         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
2844
2845         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
2846         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
2847         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
2848         ext     $h3.16b, $h3.16b, $h3.16b, #8
2849         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
2850         ext     $h4.16b, $h4.16b, $h4.16b, #8
2851
2852         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
2853         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
2854         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
2855
2856         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
2857         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
2858         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
2859
2860         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
2861         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
2862         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
2863
2864         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
2865         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
2866         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
2867
2868         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
2869         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
2870         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
2871
2872         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
2873         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
2874         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
2875
2876         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
2877         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
2878         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
2879
2880         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
2881         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
2882         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
2883         ext     $h1.16b, $h1.16b, $h1.16b, #8
2884         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
2885         ext     $h2.16b, $h2.16b, $h2.16b, #8
2886
2887         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
2888         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
2889         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
2890
2891         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
2892         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
2893         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
2894
2895         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
2896         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
2897
2898         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
2899         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
2900         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
2901
2902         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
2903         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
2904         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
2905
2906         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
2907         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
2908         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
2909
2910         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
2911         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
2912         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
2913
2914         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
2915         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
2916         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
2917
2918         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
2919         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
2920         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
2921
2922         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
2923         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
2924
2925         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
2926         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
2927         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
2928
2929         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
2930         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
2931
2932         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
2933         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
2934         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
2935
2936         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
2937         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
2938         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
2939
2940         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
2941         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
2942         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
2943
2944         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
2945         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
2946         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
2947
2948         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
2949         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
2950         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
2951
2952         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
2953         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
2954         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
2955
2956         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
2957         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
2958         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
2959
2960         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
2961         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
2962         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
2963
2964         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
2965         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
2966         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
2967
2968         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
2969         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
2970         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
2971
2972         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
2973         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
2974         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
2975
2976         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
2977         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
2978         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
2979
2980         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
2981         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
2982         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
2983
2984         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
2985         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
2986         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
2987
2988         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
2989         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
2990         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
2991
2992         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
2993         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
2994         ldr     $rk12q, [$cc, #192]                                     @ load rk12
2995         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
2996
2997         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
2998         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
2999         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
3000
3001         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
3002         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3003         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
3004
3005         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
3006         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
3007         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
3008
3009         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
3010         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
3011
3012         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
3013         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3014
3015         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
3016         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
3017         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
3018
3019         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
3020         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
3021         eor3    $res4b, $ctr_t4b, $ctr4b, $rk12                         @ AES block 8k+12 - result
3022
3023         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
3024         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
3025         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
3026
3027         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
3028         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
3029         eor3    $res7b, $ctr_t7b, $ctr7b, $rk12                         @ AES block 8k+15 - result
3030
3031         eor3    $res2b, $ctr_t2b, $ctr2b, $rk12                         @ AES block 8k+10 - result
3032         eor3    $res0b, $ctr_t0b, $ctr0b, $rk12                         @ AES block 8k+8 - result
3033         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
3034
3035         eor3    $res1b, $ctr_t1b, $ctr1b, $rk12                         @ AES block 8k+9 - result
3036         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
3037         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
3038         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3039
3040         eor3    $res6b, $ctr_t6b, $ctr6b, $rk12                         @ AES block 8k+14 - result
3041         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
3042         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
3043
3044         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
3045         eor3    $res5b, $ctr_t5b, $ctr5b, $rk12                         @ AES block 8k+13 - result
3046         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3047
3048         eor3    $res3b, $ctr_t3b, $ctr3b, $rk12                         @ AES block 8k+11 - result
3049         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
3050
3051         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
3052
3053         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
3054
3055         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
3056         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
3057         b.lt    .L192_enc_main_loop
3058
3059 .L192_enc_prepretail:                                                   @ PREPRETAIL
             @ This section loads no plaintext and stores nothing. It interleaves two jobs:
             @  - GHASH of res0-res7 (blocks 8k..8k+7) into acc_h/acc_m/acc_l, followed by the
             @    MODULO steps that leave the reduced value in acc_l;
             @  - AES rounds 0-11 on ctr0-ctr7 (blocks 8k+8..8k+15); rounds 0-10 are aese+aesmc,
             @    round 11 is aese only. ctr5-ctr7 are built here from rtmp_ctr.
             @ Round keys rk0-rk12 are loaded from cc, and h1-h8 plus h12k-h78k from current_tag.
             @ rk12 is loaded but not applied in this section; .L192_enc_tail uses it.
             @ The pmull "low" products overwrite h1-h8 and the hNk registers in place, which is
             @ why they are reloaded from current_tag each time this code runs.
3060         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
3061         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3062         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
3063
3064         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
3065         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap the 64-bit halves of h7
3066         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
3067         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap the 64-bit halves of h8
3068         rev64   $res0b, $res0b                                          @ GHASH block 8k
3069         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap the 64-bit halves of acc_l
3070
3071         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
3072         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
3073         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
3074         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
3075
3076         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
3077         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
3078         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
3079         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap the 64-bit halves of h5
3080         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
3081         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap the 64-bit halves of h6
3082
3083         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - xor acc_l into block 8k before it is hashed
3084         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
3085         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
3086
3087         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
3088         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
3089         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
3090
3091         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
3092         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
3093         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
3094
3095         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
3096         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
3097         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
3098
3099         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
3100         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
3101         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
3102
3103         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
3104         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
3105         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3106
3107         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
3108         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
3109         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
3110
3111         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
3112         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
3113         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
3114
3115         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
3116         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
3117         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
3118
3119         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
3120         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
3121         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
3122
3123         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
3124         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
3125         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
3126
3127         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
3128         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
3129         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
3130
3131         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
3132         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
3133         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
3134
3135         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
3136         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
3137         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
3138
3139         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
3140         rev64   $res5b, $res5b                                          @ GHASH block 8k+5 (t0, t1, t2 and t3 free)
3141         rev64   $res6b, $res6b                                          @ GHASH block 8k+6 (t0, t1, and t2 free)
3142
3143         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
3144         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
3145         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
3146
3147         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
3148         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
3149         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
3150
3151         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
3152         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
3153         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
3154
3155         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
3156         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
3157         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
3158
3159         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
3160         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap the 64-bit halves of h3
3161         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
3162         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap the 64-bit halves of h4
3163         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
3164         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
3165
3166         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
3167         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap the 64-bit halves of h1
3168         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
3169         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap the 64-bit halves of h2
3170         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
3171         rev64   $res4b, $res4b                                          @ GHASH block 8k+4 (t0, t1, and t2 free)
3172
3173         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
3174         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
3175         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
3176
3177         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
3178         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
3179         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
3180
3181         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
3182         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
3183         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
3184
3185         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
3186         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
3187         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
3188
3189         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
3190         rev64   $res7b, $res7b                                          @ GHASH block 8k+7 (t0, t1, t2 and t3 free)
3191         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3192         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3193
3194         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
3195         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
3196         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
3197
3198         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
3199         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
3200         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
3201
3202         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
3203         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
3204
3205         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
3206         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
3207         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
3208
3209         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
3210         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
3211         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
3212
3213         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
3214         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
3215         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
3216
3217         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
3218         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
3219         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
3220
3221         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
3222         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
3223
3224         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
3225         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
3226         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
3227
3228         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
3229         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
3230         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
3231
3232         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
3233         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
3234         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
3235
3236         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
3237         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
3238         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
3239
3240         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
3241         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
3242         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
3243
3244         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
3245         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
3246         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
3247
3248         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
3249         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
3250         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
3251
3252         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
3253         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
3254
3255         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
3256         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
3257
3258         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
3259         ext     $t12.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
3260         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
3261         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
3262
3263         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
3264         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
3265
3266         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
3267         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
3268         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3269
3270         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
3271         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
3272         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
3273
3274         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
3275         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
3276         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
3277
3278         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
3279         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
3280         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
3281
3282         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
3283         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
3284
3285         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3286         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
3287         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
3288         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
3289
3290         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3291         ldr     $rk12q, [$cc, #192]                                     @ load rk12
3292
3293         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
3294         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
3295         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
3296
3297         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3298         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
3299         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
3300
3301         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
3302         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
3303
3304         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
3305         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
3306
3307         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
3308         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
3309
3310         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
3311         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
3312         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
3313
3314         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
3315         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
3316         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
3317
3318 .L192_enc_tail:                                                         @ TAIL
3319
3320         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
3321         ext     $h5.16b, $h5.16b, $h5.16b, #8
3322         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
3323
3324         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
3325
3326         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k
3327         ext     $h8.16b, $h8.16b, $h8.16b, #8
3328
3329         mov     $t1.16b, $rk12
3330
3331         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
3332         ext     $h6.16b, $h6.16b, $h6.16b, #8
3333         ext     $h7.16b, $h7.16b, $h7.16b, #8
3334         cmp     $main_end_input_ptr, #112
3335
3336         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                       @ AES block 8k+8 - result
3337         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
3338         b.gt    .L192_enc_blocks_more_than_7
3339
3340         cmp     $main_end_input_ptr, #96
3341         mov     $ctr7b, $ctr6b
3342         movi    $acc_h.8b, #0
3343
3344         mov     $ctr6b, $ctr5b
3345         movi    $acc_l.8b, #0
3346         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3347
3348         mov     $ctr5b, $ctr4b
3349         mov     $ctr4b, $ctr3b
3350         mov     $ctr3b, $ctr2b
3351
3352         mov     $ctr2b, $ctr1b
3353         movi    $acc_m.8b, #0
3354         b.gt    .L192_enc_blocks_more_than_6
3355
3356         mov     $ctr7b, $ctr6b
3357         cmp     $main_end_input_ptr, #80
3358
3359         mov     $ctr6b, $ctr5b
3360         mov     $ctr5b, $ctr4b
3361         mov     $ctr4b, $ctr3b
3362
3363         mov     $ctr3b, $ctr1b
3364         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3365         b.gt    .L192_enc_blocks_more_than_5
3366
3367         cmp     $main_end_input_ptr, #64
3368         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3369
3370         mov     $ctr7b, $ctr6b
3371         mov     $ctr6b, $ctr5b
3372         mov     $ctr5b, $ctr4b
3373
3374         mov     $ctr4b, $ctr1b
3375         b.gt    .L192_enc_blocks_more_than_4
3376
3377         mov     $ctr7b, $ctr6b
3378         mov     $ctr6b, $ctr5b
3379         mov     $ctr5b, $ctr1b
3380
3381         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3382         cmp     $main_end_input_ptr, #48
3383         b.gt    .L192_enc_blocks_more_than_3
3384
3385         mov     $ctr7b, $ctr6b
3386         mov     $ctr6b, $ctr1b
3387         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3388
3389         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3390         cmp     $main_end_input_ptr, #32
3391         b.gt    .L192_enc_blocks_more_than_2
3392
3393         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3394
3395         cmp     $main_end_input_ptr, #16
3396         mov     $ctr7b, $ctr1b
3397         b.gt    .L192_enc_blocks_more_than_1
3398
3399         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
3400         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3401         b        .L192_enc_blocks_less_than_1
3402 .L192_enc_blocks_more_than_7:                                           @ blocks left >  7
3403         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
3404
3405         rev64   $res0b, $res1b                                          @ GHASH final-7 block
3406         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
3407
3408         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3409
3410         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
3411
3412         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
3413
3414         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
3415         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3416         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
3417
3418         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
3419
3420         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
3421         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
3422 .L192_enc_blocks_more_than_6:                                           @ blocks left >  6
3423
3424         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
3425
3426         rev64   $res0b, $res1b                                          @ GHASH final-6 block
3427
3428         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
3429
3430         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3431
3432         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
3433
3434         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
3435         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
3436
3437         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3438         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
3439         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
3440
3441         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
3442
3443         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
3444         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
3445
3446         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
3447 .L192_enc_blocks_more_than_5:                                           @ blocks left >  5
3448
3449         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
3450
3451         rev64   $res0b, $res1b                                          @ GHASH final-5 block
3452
3453         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3454
3455         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
3456
3457         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
3458         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
3459
3460         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
3461         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
3462
3463         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
3464         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
3465
3466         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
3467         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
3468
3469         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
3470         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3471
3472         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
3473 .L192_enc_blocks_more_than_4:                                           @ blocks left >  4
3474
3475         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
3476
3477         rev64   $res0b, $res1b                                          @ GHASH final-4 block
3478
3479         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3480
3481         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
3482         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
3483         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
3484
3485         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
3486         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
3487
3488         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
3489
3490         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3491         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
3492
3493         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
3494
3495         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
3496         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
3497 .L192_enc_blocks_more_than_3:                                           @ blocks left >  3
3498
3499         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
3500         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
3501
3502         rev64   $res0b, $res1b                                          @ GHASH final-3 block
3503
3504         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3505         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3506
3507         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
3508         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
3509         ext     $h4.16b, $h4.16b, $h4.16b, #8
3510
3511         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
3512
3513         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
3514         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
3515
3516         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
3517         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
3518
3519         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
3520         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
3521
3522         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
3523
3524         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
3525         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
3526 .L192_enc_blocks_more_than_2:                                           @ blocks left >  2
3527
3528         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
3529
3530         rev64   $res0b, $res1b                                          @ GHASH final-2 block
3531         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
3532         ext     $h3.16b, $h3.16b, $h3.16b, #8
3533
3534         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3535
3536         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
3537         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
3538
3539         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
3540
3541         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
3542         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
3543         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3544
3545         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
3546
3547         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
3548         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
3549
3550         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
3551         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
3552 .L192_enc_blocks_more_than_1:                                           @ blocks left >  1
3553
3554         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
3555         ext     $h2.16b, $h2.16b, $h2.16b, #8
3556         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
3557
3558         rev64   $res0b, $res1b                                          @ GHASH final-1 block
3559
3560         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3561
3562         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
3563         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
3564
3565         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
3566         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
3567         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
3568
3569         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
3570         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
3571
3572         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
3573
3574         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
3575         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
3576
3577         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
3578
3579         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
3580         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
3581 .L192_enc_blocks_less_than_1:                                           @ blocks left <= 1
3582
3583         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
3584         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
3585
3586         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
3587
3588         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
3589
3590         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
3591
3592         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
3593         cmp     $bit_length, #64
3594         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
3595
3596         csel    $temp2_x, $temp1_x, $temp0_x, lt
3597         csel    $temp3_x, $temp0_x, xzr, lt
3598
3599         mov     $ctr0.d[1], $temp3_x
3600         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
3601         ext     $h1.16b, $h1.16b, $h1.16b, #8
3602
3603         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
3604         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
3605
3606         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
3607
3608         rev64   $res0b, $res1b                                          @ GHASH final block
3609         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
3610
3611         st1     { $res1b}, [$output_ptr]                                @ store all 16B
3612
3613         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
3614
3615         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
3616         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
3617
3618         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
3619         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
3620
3621         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
3622
3623         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
3624
3625         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
3626         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
3627
3628         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
3629         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
3630
3631         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
3632
3633         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
3634         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
3635
3636         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
3637
3638         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
3639
3640         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
3641         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
3642
3643         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
3644                 ext     $acc_lb, $acc_lb, $acc_lb, #8
3645         rev64   $acc_lb, $acc_lb
3646         st1     { $acc_l.16b }, [$current_tag]
3647
3648         lsr     x0, $bit_length, #3                                     @ return sizes
3649
3650         ldp     d10, d11, [sp, #16]
3651         ldp     d12, d13, [sp, #32]
3652         ldp     d14, d15, [sp, #48]
3653         ldp     d8, d9, [sp], #80
3654         ret
3655
3656 .L192_enc_ret:
3657         mov w0, #0x0
3658         ret
3659 .size unroll8_eor3_aes_gcm_enc_192_kernel,.-unroll8_eor3_aes_gcm_enc_192_kernel
3660 ___
3661
3662 #########################################################################################
3663 # size_t unroll8_eor3_aes_gcm_dec_192_kernel(const unsigned char *in,
3664 #                               size_t len,
3665 #                               unsigned char *out,
3666 #                               const void *key,
3667 #                               unsigned char ivec[16],
3668 #                               u64 *Xi);
3669 #
3670 $code.=<<___;
3671 .global unroll8_eor3_aes_gcm_dec_192_kernel
3672 .type   unroll8_eor3_aes_gcm_dec_192_kernel,%function
3673 .align  4
3674 unroll8_eor3_aes_gcm_dec_192_kernel:
3675         AARCH64_VALID_CALL_TARGET
3676         cbz     x1, .L192_dec_ret
3677         stp     d8, d9, [sp, #-80]!
3678         mov     $counter, x4
3679         mov     $cc, x5
3680         stp     d10, d11, [sp, #16]
3681         stp     d12, d13, [sp, #32]
3682         stp     d14, d15, [sp, #48]
3683         mov     x5, #0xc200000000000000
3684         stp     x5, xzr, [sp, #64]
3685         add     $modulo_constant, sp, #64
3686
3687         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
3688         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
3689         ld1     { $acc_lb}, [$current_tag]
3690
3691                 mov     $constant_temp, #0x100000000                    @ set up counter increment
3692         movi    $rctr_inc.16b, #0x0
3693         mov     $rctr_inc.d[1], $constant_temp
3694
3695         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
3696
3697         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
3698
3699         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
3700         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
3701
3702         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
3703         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
3704
3705         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
3706         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
3707
3708         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
3709         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
3710
3711         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
3712         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
3713         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3714
3715         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
3716         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
3717
3718         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
3719
3720         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
3721         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
3722         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
3723
3724         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
3725         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
3726         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
3727
3728         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
3729         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
3730         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3731
3732         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
3733
3734         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
3735
3736         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
3737         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
3738         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
3739
3740         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
3741         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
3742
3743         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
3744         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
3745         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
3746
3747         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
3748         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
3749         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
3750
3751         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
3752         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
3753         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
3754
3755         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
3756
3757         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
3758         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
3759         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
3760
3761         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
3762         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
3763
3764         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
3765         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
3766         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
3767
3768         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
3769         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
3770         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
3771
3772         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
3773         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
3774         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
3775
3776         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
3777         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
3778         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
3779
3780         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
3781         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
3782
3783         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
3784         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
3785         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
3786
3787         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
3788         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
3789         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
3790
3791         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
3792
3793         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
3794         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
3795         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
3796
3797         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
3798         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
3799         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
3800
3801         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
3802         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
3803         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
3804
3805         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
3806
3807         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
3808         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
3809
3810         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
3811         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
3812         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
3813
3814         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
3815         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
3816         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
3817
3818         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
3819         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
3820         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3821
3822         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
3823         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
3824         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
3825
3826         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
3827         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
3828         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
3829
3830         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
3831         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
3832
3833         ld1     { $acc_lb}, [$current_tag]
3834         ext     $acc_lb, $acc_lb, $acc_lb, #8
3835         rev64   $acc_lb, $acc_lb
3836
3837         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
3838
3839         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
3840         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3841
3842         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
3843         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
3844         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
3845
3846         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
3847         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
3848
3849         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
3850         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
3851
3852         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
3853         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
3854         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
3855
3856         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
3857         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
3858         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
3859
3860         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
3861         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
3862         ldr     $rk12q, [$cc, #192]                                     @ load rk12
3863
3864         aese    $ctr0b, $rk11                                           @ AES block 0 - round 11
3865         aese    $ctr1b, $rk11                                           @ AES block 1 - round 11
3866         aese    $ctr4b, $rk11                                           @ AES block 4 - round 11
3867
3868         aese    $ctr6b, $rk11                                           @ AES block 6 - round 11
3869         aese    $ctr5b, $rk11                                           @ AES block 5 - round 11
3870         aese    $ctr7b, $rk11                                           @ AES block 7 - round 11
3871
3872         aese    $ctr2b, $rk11                                           @ AES block 2 - round 11
3873         aese    $ctr3b, $rk11                                           @ AES block 3 - round 11
3874         b.ge    .L192_dec_tail                                          @ handle tail
3875
3876         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
3877
3878         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
3879
3880         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
3881
3882         eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 1 - result
3883         eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 0 - result
3884         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
3885
3886         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
3887         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
3888
3889         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
3890         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
3891         eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 3 - result
3892
3893         eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 2 - result
3894         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
3895         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
3896
3897         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
3898         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
3899
3900         eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 4 - result
3901
3902         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
3903         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
3904
3905         eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 5 - result
3906         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
3907         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
3908
3909         eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 6 - result
3910         eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 7 - result
3911         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
3912
3913         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
3914         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
3915         b.ge    .L192_dec_prepretail                                    @ do prepretail
3916
3917 .L192_dec_main_loop:                                                    @ main loop start
3918         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
3919         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
3920         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
3921
3922         rev64   $res0b, $res0b                                          @ GHASH block 8k
3923         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
3924         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
3925
3926         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
3927         ext     $h7.16b, $h7.16b, $h7.16b, #8
3928         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
3929         ext     $h8.16b, $h8.16b, $h8.16b, #8
3930         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
3931         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
3932
3933         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
3934         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
3935         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
3936
3937         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
3938
3939         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
3940         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
3941         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
3942
3943         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
3944         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
3945         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
3946
3947         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
3948         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
3949         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
3950
3951         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
3952         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
3953         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
3954
3955         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
3956         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
3957         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
3958         ext     $h5.16b, $h5.16b, $h5.16b, #8
3959         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
3960         ext     $h6.16b, $h6.16b, $h6.16b, #8
3961
3962         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
3963         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
3964         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
3965
3966         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
3967         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
3968         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
3969
3970         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
3971         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
3972         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
3973
3974         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
3975         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
3976         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
3977         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
3978
3979         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
3980         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
3981         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
3982
3983         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
3984         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
3985         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
3986
3987         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
3988         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
3989         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
3990
3991         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
3992         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
3993         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
3994
3995         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
3996         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
3997         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
3998
3999         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4000         ext     $h3.16b, $h3.16b, $h3.16b, #8
4001         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4002         ext     $h4.16b, $h4.16b, $h4.16b, #8
4003         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
4004         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
4005
4006         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
4007         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
4008         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
4009
4010         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
4011         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
4012
4013         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
4014         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
4015         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4016
4017         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
4018         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
4019         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
4020
4021         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
4022         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
4023
4024         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
4025         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
4026         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
4027
4028         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
4029         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
4030         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
4031
4032         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
4033         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
4034         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
4035
4036         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
4037         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
4038         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
4039
4040         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
4041         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
4042         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
4043
4044         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4045         ext     $h1.16b, $h1.16b, $h1.16b, #8
4046         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4047         ext     $h2.16b, $h2.16b, $h2.16b, #8
4048         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
4049         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
4050
4051         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
4052         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
4053         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
4054
4055         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
4056         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
4057         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
4058
4059         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
4060         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
4061         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
4062
4063         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
4064         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
4065         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
4066
4067         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4068         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4069         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
4070         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
4071
4072         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
4073         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
4074         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
4075
4076         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
4077         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
4078         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
4079
4080         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
4081         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
4082         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
4083
4084         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
4085         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
4086         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
4087
4088         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
4089         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
4090         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
4091
4092         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
4093         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
4094         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
4095
4096         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
4097         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
4098         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
4099
4100         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
4101         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
4102         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
4103
4104         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
4105         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
4106         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
4107
4108         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
4109         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
4110         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
4111
4112         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
4113         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4114         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
4115
4116         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
4117         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
4118         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
4119
4120         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
4121         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
4122         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
4123
4124         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
4125         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
4126         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
4127
4128         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
4129         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
4130         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
4131
4132         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
4133         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
4134         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
4135
4136         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
4137         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
4138         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
4139
4140         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
4141         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
4142
4143         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
4144         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
4145         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
4146
4147         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
4148         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4149         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
4150
4151         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
4152         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
4153         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
4154
4155         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
4156         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
4157         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
4158
4159         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
4160         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
4161         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
4162
4163         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
4164         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
4165         ldr     $rk12q, [$cc, #192]                                     @ load rk12
4166
4167         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
4168         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
4169         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
4170
4171         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
4172         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
4173         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
4174
4175         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
4176         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
4177         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
4178
4179         eor3    $ctr0b, $res0b, $ctr0b, $rk12                           @ AES block 8k+8 - result
4180         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
4181         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
4182
4183         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
4184         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
4185         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
4186
4187         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
4188         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
4189         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4190
4191         eor3    $ctr1b, $res1b, $ctr1b, $rk12                           @ AES block 8k+9 - result
4192         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
4193         eor3    $ctr3b, $res3b, $ctr3b, $rk12                           @ AES block 8k+11 - result
4194
4195         eor3    $ctr2b, $res2b, $ctr2b, $rk12                           @ AES block 8k+10 - result
4196         eor3    $ctr7b, $res7b, $ctr7b, $rk12                           @ AES block 8k+15 - result
4197         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
4198
4199         eor3    $ctr5b, $res5b, $ctr5b, $rk12                           @ AES block 8k+13 - result
4200         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
4201         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
4202
4203         eor3    $ctr4b, $res4b, $ctr4b, $rk12                           @ AES block 8k+12 - result
4204         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
4205         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
4206
4207         eor3    $ctr6b, $res6b, $ctr6b, $rk12                           @ AES block 8k+14 - result
4208         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
4209         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
4210
4211         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
4212         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
4213
4214         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
4215         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
4216         b.lt    .L192_dec_main_loop
4217
4218 .L192_dec_prepretail:                                                   @ PREPRETAIL
4219         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
4220         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
4221         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
4222
4223         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
4224         ext     $h7.16b, $h7.16b, $h7.16b, #8
4225         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
4226         ext     $h8.16b, $h8.16b, $h8.16b, #8
4227         rev64   $res0b, $res0b                                          @ GHASH block 8k
4228         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
4229
4230         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
4231         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
4232         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
4233
4234         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
4235         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
4236         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
4237
4238         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
4239         ext     $h5.16b, $h5.16b, $h5.16b, #8
4240         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
4241         ext     $h6.16b, $h6.16b, $h6.16b, #8
4242         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
4243
4244         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
4245         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
4246         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
4247
4248         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
4249         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
4250         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
4251
4252         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
4253         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
4254         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
4255
4256         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
4257         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
4258         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
4259
4260         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
4261         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
4262         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
4263
4264         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
4265         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
4266         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
4267
4268         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
4269         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
4270         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
4271
4272         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
4273         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
4274         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
4275
4276         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
4277         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
4278         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
4279
4280         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
4281         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
4282         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
4283         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
4284
4285         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
4286         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
4287         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
4288
4289         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
4290         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
4291         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
4292
4293         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
4294         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
4295         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
4296
4297         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
4298         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
4299         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
4300
4301         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
4302         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
4303         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
4304
4305         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
4306         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
4307         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
4308
4309         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
4310         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
4311         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
4312
4313         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
4314         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4315         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
4316
4317         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4318         ext     $h3.16b, $h3.16b, $h3.16b, #8
4319         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4320         ext     $h4.16b, $h4.16b, $h4.16b, #8
4321         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
4322         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
4323
4324         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4325         ext     $h1.16b, $h1.16b, $h1.16b, #8
4326         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4327         ext     $h2.16b, $h2.16b, $h2.16b, #8
4328         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
4329         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
4330
4331         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
4332
4333         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
4334         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
4335
4336         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
4337         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
4338         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
4339
4340         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
4341         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
4342         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
4343
4344         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
4345         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
4346         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
4347
4348         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
4349         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4350         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4351         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
4352
4353         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
4354         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
4355         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
4356
4357         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
4358         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
4359         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
4360
4361         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
4362         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
4363         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
4364
4365         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
4366
4367         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
4368         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
4369         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
4370
4371         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
4372         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
4373         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
4374
4375         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
4376         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
4377         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
4378
4379         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
4380         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
4381
4382         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
4383         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
4384         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
4385
4386         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
4387         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
4388         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
4389
4390         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
4391         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
4392         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
4393
4394         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
4395         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
4396         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
4397
4398         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
4399         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
4400         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
4401
4402         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
4403         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
4404         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
4405
4406         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4407         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
4408         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
4409
4410         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
4411         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
4412         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
4413
4414         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
4415         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
4416         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
4417
4418         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
4419         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
4420
4421         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
4422         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
4423         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
4424
4425         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
4426         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
4427         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
4428
4429         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
4430         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4431         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
4432
4433         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
4434         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
4435         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
4436
4437         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
4438         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
4439         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
4440
4441         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
4442         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
4443         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
4444
4445         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
4446         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
4447         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
4448
4449         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4450         ldr     $rk12q, [$cc, #192]                                     @ load rk12
4451         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
4452
4453         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
4454         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
4455         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
4456
4457         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
4458         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
4459         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
4460
4461         aese    $ctr0b, $rk11                                           @ AES block 8k+8 - round 11
4462         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
4463         aese    $ctr5b, $rk11                                           @ AES block 8k+13 - round 11
4464
4465         aese    $ctr2b, $rk11                                           @ AES block 8k+10 - round 11
4466         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
4467         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
4468
4469         aese    $ctr6b, $rk11                                           @ AES block 8k+14 - round 11
4470         aese    $ctr4b, $rk11                                           @ AES block 8k+12 - round 11
4471         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
4472
4473         aese    $ctr3b, $rk11                                           @ AES block 8k+11 - round 11
4474         aese    $ctr1b, $rk11                                           @ AES block 8k+9 - round 11
4475         aese    $ctr7b, $rk11                                           @ AES block 8k+15 - round 11
4476
4477 .L192_dec_tail:                                                         @ TAIL
4478
4479         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
4480
4481         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h and h6k | h5k
4482         ext     $h5.16b, $h5.16b, $h5.16b, #8
4483         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
4484
4485         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k and h8l | h8h
4486         ext     $h8.16b, $h8.16b, $h8.16b, #8
4487
4488         mov     $t1.16b, $rk12
4489
4490         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h and h7l | h7h
4491         ext     $h6.16b, $h6.16b, $h6.16b, #8
4492         ext     $h7.16b, $h7.16b, $h7.16b, #8
4493         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
4494
4495         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
4496         cmp     $main_end_input_ptr, #112
4497         b.gt    .L192_dec_blocks_more_than_7
4498
4499         mov     $ctr7b, $ctr6b
4500         movi    $acc_h.8b, #0
4501         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4502
4503         mov     $ctr6b, $ctr5b
4504         mov     $ctr5b, $ctr4b
4505         mov     $ctr4b, $ctr3b
4506
4507         cmp     $main_end_input_ptr, #96
4508         movi    $acc_l.8b, #0
4509         mov     $ctr3b, $ctr2b
4510
4511         mov     $ctr2b, $ctr1b
4512         movi    $acc_m.8b, #0
4513         b.gt    .L192_dec_blocks_more_than_6
4514
4515         mov     $ctr7b, $ctr6b
4516         mov     $ctr6b, $ctr5b
4517         mov     $ctr5b, $ctr4b
4518
4519         mov     $ctr4b, $ctr3b
4520         mov     $ctr3b, $ctr1b
4521
4522         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4523         cmp     $main_end_input_ptr, #80
4524         b.gt    .L192_dec_blocks_more_than_5
4525
4526         mov     $ctr7b, $ctr6b
4527         mov     $ctr6b, $ctr5b
4528
4529         mov     $ctr5b, $ctr4b
4530         mov     $ctr4b, $ctr1b
4531         cmp     $main_end_input_ptr, #64
4532
4533         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4534         b.gt    .L192_dec_blocks_more_than_4
4535
4536         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4537         mov     $ctr7b, $ctr6b
4538         mov     $ctr6b, $ctr5b
4539
4540         mov     $ctr5b, $ctr1b
4541         cmp     $main_end_input_ptr, #48
4542         b.gt    .L192_dec_blocks_more_than_3
4543
4544         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4545         mov     $ctr7b, $ctr6b
4546         cmp     $main_end_input_ptr, #32
4547
4548         mov     $ctr6b, $ctr1b
4549         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4550         b.gt    .L192_dec_blocks_more_than_2
4551
4552         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4553
4554         mov     $ctr7b, $ctr1b
4555         cmp     $main_end_input_ptr, #16
4556         b.gt    .L192_dec_blocks_more_than_1
4557
4558         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
4559         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4560         b        .L192_dec_blocks_less_than_1
4561 .L192_dec_blocks_more_than_7:                                           @ blocks left >  7
4562         rev64   $res0b, $res1b                                          @ GHASH final-7 block
4563
4564         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
4565         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4566
4567         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
4568         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
4569         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
4570
4571         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
4572
4573         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
4574         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
4575
4576         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
4577
4578         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
4579         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4580 .L192_dec_blocks_more_than_6:                                           @ blocks left >  6
4581
4582         rev64   $res0b, $res1b                                          @ GHASH final-6 block
4583
4584         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4585
4586         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
4587         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
4588
4589         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
4590         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4591         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
4592
4593         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
4594         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
4595
4596         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
4597         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
4598         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
4599
4600         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
4601         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
4602 .L192_dec_blocks_more_than_5:                                           @ blocks left >  5
4603
4604         rev64   $res0b, $res1b                                          @ GHASH final-5 block
4605
4606         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4607
4608         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
4609
4610         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
4611
4612         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
4613         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
4614
4615         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
4616
4617         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
4618         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
4619
4620         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
4621
4622         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
4623         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4624         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
4625
4626         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
4627         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
4628 .L192_dec_blocks_more_than_4:                                           @ blocks left >  4
4629
4630         rev64   $res0b, $res1b                                          @ GHASH final-4 block
4631
4632         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4633         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4634
4635         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
4636         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
4637         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
4638
4639         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
4640
4641         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
4642
4643         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
4644         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
4645         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
4646
4647         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
4648
4649         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
4650         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
4651 .L192_dec_blocks_more_than_3:                                           @ blocks left >  3
4652
4653         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
4654         ext     $h4.16b, $h4.16b, $h4.16b, #8
4655         rev64   $res0b, $res1b                                          @ GHASH final-3 block
4656         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
4657
4658         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4659
4660         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
4661         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
4662
4663         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
4664         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4665         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
4666
4667         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
4668         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
4669         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
4670
4671         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
4672         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
4673
4674         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
4675
4676         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
4677
4678         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
4679 .L192_dec_blocks_more_than_2:                                           @ blocks left >  2
4680
4681         rev64   $res0b, $res1b                                          @ GHASH final-2 block
4682         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
4683         ext     $h3.16b, $h3.16b, $h3.16b, #8
4684
4685         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4686
4687         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
4688         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
4689
4690         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
4691
4692         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
4693
4694         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
4695         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
4696
4697         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
4698         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4699
4700         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
4701         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
4702
4703         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
4704         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
4705 .L192_dec_blocks_more_than_1:                                           @ blocks left >  1
4706
4707         rev64   $res0b, $res1b                                          @ GHASH final-1 block
4708         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
4709         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
4710         ext     $h2.16b, $h2.16b, $h2.16b, #8
4711
4712         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4713         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
4714         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
4715
4716         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
4717         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
4718         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
4719
4720         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
4721
4722         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
4723
4724         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
4725
4726         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
4727
4728         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
4729
4730         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
4731
4732         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
4733         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
4734 .L192_dec_blocks_less_than_1:                                           @ blocks left <= 1
4735
4736         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
4737         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
4738
4739         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
4740         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
4741
4742         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
4743         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
4744
4745         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
4746
4747         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
4748         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
4749         cmp     $bit_length, #64
4750
4751         csel    $temp2_x, $temp1_x, $temp0_x, lt
4752         csel    $temp3_x, $temp0_x, xzr, lt
4753         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
4754         ext     $h1.16b, $h1.16b, $h1.16b, #8
4755
4756         mov     $ctr0.d[1], $temp3_x
4757         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
4758
4759         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
4760
4761         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
4762         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
4763
4764         rev64   $res0b, $res1b                                          @ GHASH final block
4765
4766         st1     { $res4b}, [$output_ptr]                                @ store all 16B
4767
4768         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
4769
4770         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
4771         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
4772
4773         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
4774         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
4775         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
4776
4777         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
4778         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
4779
4780         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
4781         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
4782         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
4783
4784         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
4785         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
4786
4787         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
4788
4789         eor3    $acc_mb, $acc_mb, $acc_hb, $t11.16b                     @ MODULO - fold into mid
4790
4791         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
4792         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
4793
4794         eor3    $acc_lb, $acc_lb, $acc_mb, $acc_hb                      @ MODULO - fold into low
4795         ext     $acc_lb, $acc_lb, $acc_lb, #8
4796         rev64   $acc_lb, $acc_lb
4797         st1     { $acc_l.16b }, [$current_tag]
4798
4799         ldp     d10, d11, [sp, #16]
4800         ldp     d12, d13, [sp, #32]
4801         ldp     d14, d15, [sp, #48]
4802         ldp     d8, d9, [sp], #80
4803         ret
4804
4805 .L192_dec_ret:
4806         mov w0, #0x0
4807         ret
4808 .size unroll8_eor3_aes_gcm_dec_192_kernel,.-unroll8_eor3_aes_gcm_dec_192_kernel
4809 ___
4810 }
4811
4812 {
4813
# Register aliases for the unroll8_eor3_aes_gcm_enc_256_kernel code emitted
# below. Every name is a plain string that gets interpolated into the
# assembly heredoc; many names deliberately refer to the same register.
#
# General-purpose registers x4..x7 and x13..x14. Note that x4 and x5 also
# carry incoming arguments: the prologue copies them to $counter and $cc
# before $end_input_ptr / $main_end_input_ptr are first written.
4814 my ($end_input_ptr,$main_end_input_ptr,$temp0_x,$temp1_x)=map("x$_",(4..7));
4815 my ($temp2_x,$temp3_x)=map("x$_",(13..14));
# v0..v7 hold the eight counter/AES blocks, v8..v15 the eight result blocks.
# The "b" (.16b), bare (vN), "d" and "q" spellings are different views of the
# same registers, needed because instructions want different operand forms.
4816 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$ctr4b,$ctr5b,$ctr6b,$ctr7b,$res0b,$res1b,$res2b,$res3b,$res4b,$res5b,$res6b,$res7b)=map("v$_.16b",(0..15));
4817 my ($ctr0,$ctr1,$ctr2,$ctr3,$ctr4,$ctr5,$ctr6,$ctr7,$res0,$res1,$res2,$res3,$res4,$res5,$res6,$res7)=map("v$_",(0..15));
4818 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$ctr4d,$ctr5d,$ctr6d,$ctr7d)=map("d$_",(0..7));
4819 my ($ctr0q,$ctr1q,$ctr2q,$ctr3q,$ctr4q,$ctr5q,$ctr6q,$ctr7q)=map("q$_",(0..7));
4820 my ($res0q,$res1q,$res2q,$res3q,$res4q,$res5q,$res6q,$res7q)=map("q$_",(8..15));
4821
# $ctr_tN names the same registers (v8..v15) as $resN: input is loaded into
# $ctr_tN and the eor3 that produces $resN overwrites it in place.
4822 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3,$ctr_t4,$ctr_t5,$ctr_t6,$ctr_t7)=map("v$_",(8..15));
4823 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b,$ctr_t4b,$ctr_t5b,$ctr_t6b,$ctr_t7b)=map("v$_.16b",(8..15));
4824 my ($ctr_t0q,$ctr_t1q,$ctr_t2q,$ctr_t3q,$ctr_t4q,$ctr_t5q,$ctr_t6q,$ctr_t7q)=map("q$_",(8..15));
4825
# GHASH accumulator parts (high / mid / low) in v17..v19.
4826 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(17..19));
4827 my ($acc_h,$acc_m,$acc_l)=map("v$_",(17..19));
4828
# Hash-key registers v20..v25. The h1..h4 names and the h5..h8 names map to
# the SAME six registers, so only one of the two sets can be resident at a
# time; the code reloads from $current_tag when it switches sets.
4829 my ($h1,$h12k,$h2,$h3,$h34k,$h4)=map("v$_",(20..25));
4830 my ($h5,$h56k,$h6,$h7,$h78k,$h8)=map("v$_",(20..25));
4831 my ($h1q,$h12kq,$h2q,$h3q,$h34kq,$h4q)=map("q$_",(20..25));
4832 my ($h5q,$h56kq,$h6q,$h7q,$h78kq,$h8q)=map("q$_",(20..25));
4833
# Temporaries. Only $t0 (v16), $t1 (v29) and $t11 (v21) name registers
# directly; every other $tN is an alias of an existing name, so writing a
# temporary clobbers the register it aliases.
4834 my $t0="v16";
4835 my $t0d="d16";
4836
4837 my $t1="v29";
# $t2 is v9 (same as $res1 / $ctr_t1); $t3 is v29 again.
4838 my $t2=$res1;
4839 my $t3=$t1;
4840
# $t4 is v8, $t5 is v10, $t6 is v16 (same as $t0).
4841 my $t4=$res0;
4842 my $t5=$res2;
4843 my $t6=$t0;
4844
# $t7..$t9 are v11..v13.
4845 my $t7=$res3;
4846 my $t8=$res4;
4847 my $t9=$res5;
4848
# $t10 is v14; $t11 is v21, which is also $h12k / $h56k; $t12 is v29 again.
4849 my $t10=$res6;
4850 my $t11="v21";
4851 my $t12=$t1;
4852
# v30: running counter kept in rev32 (byte-reversed) form so it can be
# incremented with a vector add. v31: the per-block counter increment.
4853 my $rtmp_ctr="v30";
4854 my $rtmp_ctrq="q30";
4855 my $rctr_inc="v31";
4856 my $rctr_incd="d31";
4857
# $mod_constant shares v16/d16 with $t0 and $t6.
# NOTE(review): presumably loaded from the constant the prologue stores at
# $modulo_constant - confirm against the reduction code.
4858 my $mod_constantd=$t0d;
4859 my $mod_constant=$t0;
4860
# Round keys rotate through just three registers: $rkN lives in
# v(26 + N % 3). At most three round keys are resident at once, and the
# kernel reloads them from $cc with ldp/ldr as the AES rounds progress.
4861 my ($rk0,$rk1,$rk2)=map("v$_.16b",(26..28));
4862 my ($rk3,$rk4,$rk5)=map("v$_.16b",(26..28));
4863 my ($rk6,$rk7,$rk8)=map("v$_.16b",(26..28));
4864 my ($rk9,$rk10,$rk11)=map("v$_.16b",(26..28));
4865 my ($rk12,$rk13,$rk14)=map("v$_.16b",(26..28));
4866 my ($rk0q,$rk1q,$rk2q)=map("q$_",(26..28));
4867 my ($rk3q,$rk4q,$rk5q)=map("q$_",(26..28));
4868 my ($rk6q,$rk7q,$rk8q)=map("q$_",(26..28));
4869 my ($rk9q,$rk10q,$rk11q)=map("q$_",(26..28));
4870 my ($rk12q,$rk13q,$rk14q)=map("q$_",(26..28));
# Extra operand forms of the round-key registers: .1q views of v28 and v26,
# and the bare name of v27.
4871 my $rk2q1="v28.1q";
4872 my $rk3q1="v26.1q";
4873 my $rk4v="v27";
4874 #########################################################################################
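4875 # size_t unroll8_eor3_aes_gcm_enc_256_kernel(const unsigned char *in,
4876 #                               size_t len,             /* presumably a bit count ($bit_length) */
4877 #                               unsigned char *out,
4878 #                               u64 *Xi,                /* presumably x3 ($current_tag) - confirm */
4879 #                               unsigned char ivec[16], /* x4, copied to $counter */
4880 #                               const void *key);       /* x5, copied to $cc (round keys) */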
4881 #
4882 $code.=<<___;
4883 .global unroll8_eor3_aes_gcm_enc_256_kernel
4884 .type   unroll8_eor3_aes_gcm_enc_256_kernel,%function
4885 .align  4
4886 unroll8_eor3_aes_gcm_enc_256_kernel:
4887         AARCH64_VALID_CALL_TARGET
4888         cbz     x1, .L256_enc_ret
4889         stp     d8, d9, [sp, #-80]!
4890         mov     $counter, x4
4891         mov     $cc, x5
4892         stp     d10, d11, [sp, #16]
4893         stp     d12, d13, [sp, #32]
4894         stp     d14, d15, [sp, #48]
4895         mov     x5, #0xc200000000000000
4896         stp     x5, xzr, [sp, #64]
4897         add     $modulo_constant, sp, #64
4898
4899         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
4900
4901         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
4902
4903         mov     $constant_temp, #0x100000000                    @ set up counter increment
4904         movi    $rctr_inc.16b, #0x0
4905         mov     $rctr_inc.d[1], $constant_temp
4906         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
4907
4908         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4909
4910         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4911
4912         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
4913
4914         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
4915
4916         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
4917         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
4918
4919         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
4920         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
4921
4922         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
4923         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
4924
4925         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
4926         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
4927
4928         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
4929         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
4930         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
4931
4932         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
4933         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
4934
4935         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
4936
4937         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
4938         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
4939         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
4940
4941         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
4942         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
4943         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
4944
4945         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
4946         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
4947         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
4948
4949         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
4950         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
4951         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
4952
4953         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
4954         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
4955
4956         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
4957
4958         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
4959
4960         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
4961         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
4962         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
4963
4964         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
4965         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
4966         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
4967
4968         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
4969         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
4970         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
4971
4972         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
4973         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
4974         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
4975
4976         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
4977
4978         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
4979         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
4980         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
4981
4982         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
4983         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
4984
4985         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
4986         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
4987         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
4988
4989         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
4990         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
4991
4992         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
4993         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
4994         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
4995
4996         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
4997         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
4998         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
4999
5000         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
5001         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
5002         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
5003
5004         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
5005         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
5006         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
5007
5008         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
5009         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
5010         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
5011
5012         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
5013         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
5014         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
5015
5016         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
5017         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
5018         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5019
5020         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
5021         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
5022
5023         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
5024         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
5025         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
5026
5027         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
5028         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
5029
5030         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
5031
5032         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
5033         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
5034
5035         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
5036         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
5037         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
5038
5039         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
5040         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
5041         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
5042
5043         ld1     { $acc_lb}, [$current_tag]
5044         ext     $acc_lb, $acc_lb, $acc_lb, #8
5045         rev64   $acc_lb, $acc_lb
5046         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5047
5048         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
5049         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
5050         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
5051
5052         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
5053         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
5054         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
5055
5056         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
5057
5058         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
5059         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
5060         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
5061
5062         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
5063         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
5064         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
5065
5066         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
5067         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
5068         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
5069
5070         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
5071         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5072         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
5073
5074         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
5075         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
5076         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
5077
5078         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
5079         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
5080         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
5081
5082         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 7
5083         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5084
5085         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
5086         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
5087         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
5088
5089         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
5090         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
5091         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
5092
5093         aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
5094         aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
5095         aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
5096
5097         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
5098         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
5099
5100         aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
5101         aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
5102
5103         aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
5104         aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
5105         aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
5106
5107         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3         @ end_input_ptr
5108         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
5109         b.ge    .L256_enc_tail                                          @ handle tail
5110
5111         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 0, 1 - load plaintext
5112
5113         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 2, 3 - load plaintext
5114
5115         eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 0 - result
5116         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
5117         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
5118
5119         eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 1 - result
5120         eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 3 - result
5121
5122         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
5123         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
5124         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 4, 5 - load plaintext
5125
5126         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 6, 7 - load plaintext
5127         eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 2 - result
5128         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
5129
5130         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
5131         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
5132         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
5133
5134         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
5135
5136         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
5137         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
5138
5139         eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 4 - result
5140
5141         eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 7 - result
5142         eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 6 - result
5143         eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 5 - result
5144
5145         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
5146         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
5147
5148         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
5149         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
5150         b.ge    .L256_enc_prepretail                                    @ do prepretail
5151
5152 .L256_enc_main_loop:                                                    @ main loop start
5153         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
5154
5155         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
5156         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
5157         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
5158         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
5159
5160         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
5161         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
5162         ext     $h5.16b, $h5.16b, $h5.16b, #8
5163         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
5164         ext     $h6.16b, $h6.16b, $h6.16b, #8
5165         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
5166
5167         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
5168         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
5169         rev64   $res0b, $res0b                                          @ GHASH block 8k
5170
5171         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
5172         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
5173         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
5174         ext     $h7.16b, $h7.16b, $h7.16b, #8
5175         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
5176         ext     $h8.16b, $h8.16b, $h8.16b, #8
5177
5178         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
5179         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
5180         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
5181
5182         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
5183         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
5184         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
5185
5186         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
5187         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
5188         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
5189
5190         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
5191         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
5192         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
5193
5194         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
5195         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
5196         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
5197
5198         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
5199         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
5200         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
5201
5202         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
5203         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
5204         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
5205
5206         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
5207         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
5208         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
5209
5210         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
5211         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
5212         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
5213
5214         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
5215         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
5216         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
5217
5218         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
5219         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
5220         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
5221
5222         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
5223         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
5224         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
5225
5226         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
5227         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
5228         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
5229
5230         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
5231         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
5232         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
5233
5234         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
5235         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
5236         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
5237
5238         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
5239         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
5240         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
5241
5242         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
5243         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
5244         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5245         ext     $h3.16b, $h3.16b, $h3.16b, #8
5246         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5247         ext     $h4.16b, $h4.16b, $h4.16b, #8
5248
5249         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
5250         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
5251         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
5252
5253         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
5254         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
5255         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
5256
5257         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
5258         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
5259         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
5260
5261         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
5262         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
5263         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
5264
5265         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
5266         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
5267         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
5268
5269         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
5270         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
5271         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
5272
5273         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
5274         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
5275         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
5276
5277         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
5278         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
5279         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
5280
5281         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
5282         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
5283         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
5284
5285         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
5286         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
5287         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
5288
5289         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
5290         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
5291         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
5292
5293         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
5294         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
5295         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
5296
5297         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
5298         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
5299         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
5300
5301         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5302         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
5303         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
5304
5305         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
5306         ext     $h1.16b, $h1.16b, $h1.16b, #8
5307         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
5308         ext     $h2.16b, $h2.16b, $h2.16b, #8
5309         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
5310         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
5311
5312         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5313         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5314         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
5315         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
5316
5317         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
5318         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
5319         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
5320
5321         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
5322         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
5323         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
5324
5325         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
5326         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
5327         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
5328
5329         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
5330         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
5331         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
5332
5333         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
5334         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
5335         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
5336
5337         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
5338         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
5339         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
5340
5341         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
5342         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
5343         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
5344
5345         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
5346         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
5347         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
5348
5349         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
5350         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
5351         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
5352
5353         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
5354         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
5355         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
5356
5357         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5358         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
5359         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
5360
5361         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
5362         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
5363         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
5364
5365         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
5366         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
5367         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
5368
5369         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
5370
5371         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
5372         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
5373         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
5374
5375         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
5376         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
5377         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
5378
5379         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
5380         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
5381         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
5382
5383         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
5384         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
5385         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
5386
5387         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
5388
5389         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5390         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
5391
5392         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
5393         ldp     $ctr_t0q, $ctr_t1q, [$input_ptr], #32                   @ AES block 8k+8, 8k+9 - load plaintext
5394         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
5395
5396         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
5397         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
5398         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
5399
5400         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
5401         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
5402
5403         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
5404         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
5405
5406         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
5407         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
5408
5409         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
5410         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
5411         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
5412
5413         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
5414         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
5415         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
5416
5417         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
5418         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5419         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
5420
5421         ldp     $ctr_t2q, $ctr_t3q, [$input_ptr], #32                   @ AES block 8k+10, 8k+11 - load plaintext
5422         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
5423         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
5424
5425         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
5426         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
5427         ldp     $ctr_t4q, $ctr_t5q, [$input_ptr], #32                   @ AES block 8k+12, 8k+13 - load plaintext
5428
5429         ldp     $ctr_t6q, $ctr_t7q, [$input_ptr], #32                   @ AES block 8k+14, 8k+15 - load plaintext
5430         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
5431         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
5432
5433         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
5434         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
5435         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
5436
5437         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
5438         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
5439         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
5440
5441         eor3    $res2b, $ctr_t2b, $ctr2b, $rk14                         @ AES block 8k+10 - result
5442         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
5443         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
5444
5445         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
5446         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
5447         eor3    $res5b, $ctr_t5b, $ctr5b, $rk14                         @ AES block 8k+13 - result
5448
5449         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
5450         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
5451         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
5452
5453         eor3    $res4b, $ctr_t4b, $ctr4b, $rk14                         @ AES block 8k+12 - result
5454         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
5455         eor3    $res3b, $ctr_t3b, $ctr3b, $rk14                         @ AES block 8k+11 - result
5456
5457         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
5458         eor3    $res1b, $ctr_t1b, $ctr1b, $rk14                         @ AES block 8k+9 - result
5459         eor3    $res0b, $ctr_t0b, $ctr0b, $rk14                         @ AES block 8k+8 - result
5460
5461         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
5462         stp     $res0q, $res1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
5463         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
5464
5465         eor3    $res7b, $ctr_t7b, $ctr7b, $rk14                         @ AES block 8k+15 - result
5466         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
5467         stp     $res2q, $res3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
5468
5469         eor3    $res6b, $ctr_t6b, $ctr6b, $rk14                         @ AES block 8k+14 - result
5470         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
5471         stp     $res4q, $res5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
5472
5473         stp     $res6q, $res7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
5474         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
5475         b.lt    .L256_enc_main_loop
5476
5477 .L256_enc_prepretail:                                                   @ PREPRETAIL
5478         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
5479         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
5480         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
5481
5482         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
5483
5484         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
5485         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
5486
5487         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
5488         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
5489         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
5490
5491         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
5492
5493         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
5494         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
5495         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
5496
5497         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
5498         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
5499
5500         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
5501         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
5502         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
5503
5504         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
5505         rev64   $res0b, $res0b                                          @ GHASH block 8k
5506         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
5507
5508         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
5509         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
5510         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
5511
5512         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
5513         ext     $h7.16b, $h7.16b, $h7.16b, #8
5514         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
5515         ext     $h8.16b, $h8.16b, $h8.16b, #8
5516         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
5517
5518         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
5519         ext     $h5.16b, $h5.16b, $h5.16b, #8
5520         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
5521         ext     $h6.16b, $h6.16b, $h6.16b, #8
5522         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
5523         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
5524
5525         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
5526         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
5527
5528         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
5529         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
5530
5531         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
5532         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
5533         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
5534
5535         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
5536         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
5537         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
5538
5539         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
5540         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
5541         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
5542
5543         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
5544         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
5545         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
5546
5547         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
5548         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
5549         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
5550
5551         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
5552         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
5553         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
5554
5555         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
5556         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
5557
5558         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
5559         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
5560         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
5561
5562         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
5563         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
5564         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
5565
5566         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
5567         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
5568         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
5569
5570         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
5571         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
5572         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
5573
5574         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
5575         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
5576         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
5577
5578         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
5579         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
5580         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
5581
5582         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
5583         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
5584         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
5585
5586         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
5587         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
5588         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
5589
5590         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
5591         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
5592         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
5593
5594         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
5595         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
5596         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
5597
5598         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
5599         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
5600         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
5601
5602         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5603         ext     $h3.16b, $h3.16b, $h3.16b, #8
5604         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5605         ext     $h4.16b, $h4.16b, $h4.16b, #8
5606         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
5607         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
5608
5609         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
5610         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
5611
5612         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
5613         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
5614         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
5615
5616         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
5617         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
5618         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
5619
5620         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
5621         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
5622         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
5623
5624         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5625         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5626         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
5627         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
5628
5629         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
5630         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
5631         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
5632
5633         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
5634         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
5635         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
5636         ext     $h1.16b, $h1.16b, $h1.16b, #8
5637         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
5638         ext     $h2.16b, $h2.16b, $h2.16b, #8
5639
5640         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
5641         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
5642         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
5643
5644         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
5645         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
5646
5647         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
5648         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
5649         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
5650
5651         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
5652         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
5653         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
5654
5655         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
5656         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
5657         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
5658
5659         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
5660         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
5661         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
5662
5663         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
5664         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
5665         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
5666
5667         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
5668         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
5669         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
5670
5671         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
5672         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
5673         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
5674
5675         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
5676         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
5677         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
5678
5679         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
5680         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
5681         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
5682
5683         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
5684         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
5685         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
5686
5687         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
5688         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
5689         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
5690
5691         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
5692         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
5693         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
5694
5695         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
5696
5697         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
5698         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
5699         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
5700
5701         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
5702         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
5703
5704         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
5705         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
5706         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
5707
5708         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
5709         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
5710         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
5711
5712         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
5713         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
5714         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
5715
5716         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
5717         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
5718         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
5719
5720         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
5721         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
5722         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
5723
5724         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
5725         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
5726         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
5727
5728         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
5729         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
5730         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
5731
5732         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
5733         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
5734         ldr     $rk14q, [$cc, #224]                                     @ load rk14
5735
5736         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
5737         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
5738         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
5739
5740         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
5741         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
5742         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
5743
5744         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
5745         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
5746
5747         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
5748         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
5749         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
5750
5751         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
5752         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
5753         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
5754
5755         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
5756         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
5757         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
5758
5759         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
5760         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
5761 .L256_enc_tail:                                                         @ TAIL
5762
5763         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8l | h8h
5764         ext     $h8.16b, $h8.16b, $h8.16b, #8
5765         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
5766
5767         ldr     $ctr_t0q, [$input_ptr], #16                             @ AES block 8k+8 - load plaintext
5768
5769         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
5770         ext     $h5.16b, $h5.16b, $h5.16b, #8
5771
5772         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
5773         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
5774         ext     $h6.16b, $h6.16b, $h6.16b, #8
5775         ext     $h7.16b, $h7.16b, $h7.16b, #8
5776         mov     $t1.16b, $rk14
5777
5778         cmp     $main_end_input_ptr, #112
5779         eor3    $res1b, $ctr_t0b, $ctr0b, $t1.16b                               @ AES block 8k+8 - result
5780         b.gt    .L256_enc_blocks_more_than_7
5781
5782         movi    $acc_l.8b, #0
5783         mov     $ctr7b, $ctr6b
5784         movi    $acc_h.8b, #0
5785
5786         mov     $ctr6b, $ctr5b
5787         mov     $ctr5b, $ctr4b
5788         mov     $ctr4b, $ctr3b
5789
5790         mov     $ctr3b, $ctr2b
5791         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5792         mov     $ctr2b, $ctr1b
5793
5794         movi    $acc_m.8b, #0
5795         cmp     $main_end_input_ptr, #96
5796         b.gt    .L256_enc_blocks_more_than_6
5797
5798         mov     $ctr7b, $ctr6b
5799         mov     $ctr6b, $ctr5b
5800         cmp     $main_end_input_ptr, #80
5801
5802         mov     $ctr5b, $ctr4b
5803         mov     $ctr4b, $ctr3b
5804         mov     $ctr3b, $ctr1b
5805
5806         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5807         b.gt    .L256_enc_blocks_more_than_5
5808
5809         mov     $ctr7b, $ctr6b
5810         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5811
5812         mov     $ctr6b, $ctr5b
5813         mov     $ctr5b, $ctr4b
5814
5815         cmp     $main_end_input_ptr, #64
5816         mov     $ctr4b, $ctr1b
5817         b.gt    .L256_enc_blocks_more_than_4
5818
5819         cmp     $main_end_input_ptr, #48
5820         mov     $ctr7b, $ctr6b
5821         mov     $ctr6b, $ctr5b
5822
5823         mov     $ctr5b, $ctr1b
5824         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5825         b.gt    .L256_enc_blocks_more_than_3
5826
5827         cmp     $main_end_input_ptr, #32
5828         mov     $ctr7b, $ctr6b
5829         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5830
5831         mov     $ctr6b, $ctr1b
5832         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5833         b.gt    .L256_enc_blocks_more_than_2
5834
5835         mov     $ctr7b, $ctr1b
5836
5837         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5838         cmp     $main_end_input_ptr, #16
5839         b.gt    .L256_enc_blocks_more_than_1
5840
5841         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
5842         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
5843         b        .L256_enc_blocks_less_than_1
5844 .L256_enc_blocks_more_than_7:                                           @ blocks left >  7
5845         st1     { $res1b}, [$output_ptr], #16                           @ AES final-7 block  - store result
5846
5847         rev64   $res0b, $res1b                                          @ GHASH final-7 block
5848
5849         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5850
5851         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-6 block - load plaintext
5852
5853         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
5854         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
5855         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
5856
5857         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5858
5859         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
5860         eor3    $res1b, $ctr_t1b, $ctr1b, $t1.16b                       @ AES final-6 block - result
5861
5862         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
5863         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
5864 .L256_enc_blocks_more_than_6:                                           @ blocks left >  6
5865
5866         st1     { $res1b}, [$output_ptr], #16                           @ AES final-6 block - store result
5867
5868         rev64   $res0b, $res1b                                          @ GHASH final-6 block
5869
5870         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5871
5872         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
5873         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
5874         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
5875
5876         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-5 block - load plaintext
5877
5878         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
5879
5880         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
5881
5882         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
5883         eor3    $res1b, $ctr_t1b, $ctr2b, $t1.16b                       @ AES final-5 block - result
5884
5885         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5886
5887         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
5888         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
5889 .L256_enc_blocks_more_than_5:                                           @ blocks left >  5
5890
5891         st1     { $res1b}, [$output_ptr], #16                           @ AES final-5 block - store result
5892
5893         rev64   $res0b, $res1b                                          @ GHASH final-5 block
5894
5895         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5896
5897         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
5898
5899         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
5900
5901         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
5902         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
5903
5904         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
5905
5906         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-4 block - load plaintext
5907         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
5908
5909         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
5910         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5911         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
5912
5913         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
5914         eor3    $res1b, $ctr_t1b, $ctr3b, $t1.16b                       @ AES final-4 block - result
5915 .L256_enc_blocks_more_than_4:                                           @ blocks left >  4
5916
5917         st1     { $res1b}, [$output_ptr], #16                           @ AES final-4 block - store result
5918
5919         rev64   $res0b, $res1b                                          @ GHASH final-4 block
5920
5921         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-3 block - load plaintext
5922
5923         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5924
5925         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
5926         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
5927
5928         eor3    $res1b, $ctr_t1b, $ctr4b, $t1.16b                       @ AES final-3 block - result
5929         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
5930
5931         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
5932         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
5933
5934         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
5935
5936         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5937
5938         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
5939         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
5940 .L256_enc_blocks_more_than_3:                                           @ blocks left >  3
5941
5942         st1     { $res1b}, [$output_ptr], #16                           @ AES final-3 block - store result
5943
5944         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
5945         ext     $h4.16b, $h4.16b, $h4.16b, #8
5946         rev64   $res0b, $res1b                                          @ GHASH final-3 block
5947
5948         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5949
5950         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
5951         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
5952
5953         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
5954         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
5955         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
5956
5957         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
5958         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-2 block - load plaintext
5959
5960         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
5961         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
5962
5963         eor3    $res1b, $ctr_t1b, $ctr5b, $t1.16b                       @ AES final-2 block - result
5964         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5965
5966         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
5967         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
5968 .L256_enc_blocks_more_than_2:                                           @ blocks left >  2
5969
5970         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
5971         ext     $h3.16b, $h3.16b, $h3.16b, #8
5972
5973         st1     { $res1b}, [$output_ptr], #16                           @ AES final-2 block - store result
5974
5975         rev64   $res0b, $res1b                                          @ GHASH final-2 block
5976         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final-1 block - load plaintext
5977
5978         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
5979
5980         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
5981
5982         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
5983
5984         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
5985         eor3    $res1b, $ctr_t1b, $ctr6b, $t1.16b                       @ AES final-1 block - result
5986
5987         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
5988
5989         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
5990
5991         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
5992         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
5993
5994         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
5995         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
5996 .L256_enc_blocks_more_than_1:                                           @ blocks left >  1
5997
5998         st1     { $res1b}, [$output_ptr], #16                           @ AES final-1 block - store result
5999
6000         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6001         ext     $h2.16b, $h2.16b, $h2.16b, #8
6002         rev64   $res0b, $res1b                                          @ GHASH final-1 block
6003         ldr     $ctr_t1q, [$input_ptr], #16                             @ AES final block - load plaintext
6004
6005         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
6006         movi    $t0.8b, #0                                              @ suppress further partial tag feed in
6007
6008         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
6009         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
6010
6011         eor3    $res1b, $ctr_t1b, $ctr7b, $t1.16b                       @ AES final block - result
6012         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
6013
6014         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
6015         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
6016
6017         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6018
6019         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
6020         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
6021
6022         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
6023
6024         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
6025 .L256_enc_blocks_less_than_1:                                           @ blocks left <= 1
6026
6027         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
6028
6029         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
6030
6031         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
6032
6033         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
6034         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
6035
6036         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
6037         cmp     $bit_length, #64
6038         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
6039
6040         csel    $temp3_x, $temp0_x, xzr, lt
6041         csel    $temp2_x, $temp1_x, $temp0_x, lt
6042
6043         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
6044         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6045         ext     $h1.16b, $h1.16b, $h1.16b, #8
6046
6047         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
6048         mov     $ctr0.d[1], $temp3_x
6049
6050         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
6051
6052         rev64   $res0b, $res1b                                          @ GHASH final block
6053
6054         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
6055         bif     $res1b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
6056         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
6057
6058         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
6059         st1     { $res1b}, [$output_ptr]                                @ store all 16B
6060
6061         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
6062         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
6063         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
6064
6065         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
6066         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
6067
6068         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
6069
6070         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
6071
6072         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
6073         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6074
6075         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
6076
6077         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6078         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6079
6080         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6081
6082         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6083         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6084
6085         eor3    $acc_lb, $acc_lb, $acc_hb, $t11.16b                     @ MODULO - fold into low
6086                 ext     $acc_lb, $acc_lb, $acc_lb, #8
6087         rev64   $acc_lb, $acc_lb
6088         st1     { $acc_l.16b }, [$current_tag]
6089         lsr     x0, $bit_length, #3                                     @ return sizes
6090
6091         ldp     d10, d11, [sp, #16]
6092         ldp     d12, d13, [sp, #32]
6093         ldp     d14, d15, [sp, #48]
6094         ldp     d8, d9, [sp], #80
6095         ret
6096
6097 .L256_enc_ret:
6098         mov w0, #0x0
6099         ret
6100 .size unroll8_eor3_aes_gcm_enc_256_kernel,.-unroll8_eor3_aes_gcm_enc_256_kernel
6101 ___
6102
6103 {
6104 #########################################################################################
6105 # size_t unroll8_eor3_aes_gcm_dec_256_kernel(const unsigned char *in,
6106 #                               size_t bit_len,
6107 #                               unsigned char *out,
6108 #                               u64 *Xi,
6109 #                               unsigned char ivec[16],
6110 #                               const void *key);
6111 #
6112 $code.=<<___;
6113 .global unroll8_eor3_aes_gcm_dec_256_kernel
6114 .type   unroll8_eor3_aes_gcm_dec_256_kernel,%function
6115 .align  4
6116 unroll8_eor3_aes_gcm_dec_256_kernel:
6117         AARCH64_VALID_CALL_TARGET
6118         cbz     x1, .L256_dec_ret
6119         stp     d8, d9, [sp, #-80]!
6120         mov     $counter, x4
6121         mov     $cc, x5
6122         stp     d10, d11, [sp, #16]
6123         stp     d12, d13, [sp, #32]
6124         stp     d14, d15, [sp, #48]
6125         mov     x5, #0xc200000000000000
6126         stp     x5, xzr, [sp, #64]
6127         add     $modulo_constant, sp, #64
6128
6129         ld1     { $ctr0b}, [$counter]                                   @ CTR block 0
6130
6131         mov     $constant_temp, #0x100000000                    @ set up counter increment
6132         movi    $rctr_inc.16b, #0x0
6133         mov     $rctr_inc.d[1], $constant_temp
6134         lsr     $main_end_input_ptr, $bit_length, #3                    @ byte_len
6135
6136         sub     $main_end_input_ptr, $main_end_input_ptr, #1            @ byte_len - 1
6137
6138         rev32   $rtmp_ctr.16b, $ctr0.16b                                @ set up reversed counter
6139
6140         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 0
6141
6142         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 1
6143         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 1
6144
6145         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 2
6146         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 2
6147         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6148
6149         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 3
6150         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 3
6151
6152         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 4
6153         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 4
6154
6155         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 0
6156
6157         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 5
6158         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 5
6159
6160         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 0
6161         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 0
6162
6163         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 6
6164         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 6
6165
6166         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 7
6167         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 0
6168
6169         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 0
6170         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 0
6171
6172         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 0
6173         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 0
6174         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6175
6176         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 1
6177         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 1
6178         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 1
6179
6180         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 1
6181         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 1
6182         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 1
6183
6184         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 1
6185         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 1
6186
6187         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 2
6188         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 2
6189         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 2
6190
6191         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 2
6192         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 2
6193         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 2
6194
6195         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 2
6196         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 2
6197         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6198
6199         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 3
6200         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 3
6201
6202         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 3
6203         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 3
6204
6205         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 3
6206         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 3
6207         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 3
6208
6209         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 3
6210
6211         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 4
6212         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 4
6213
6214         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 4
6215         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 4
6216         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 4
6217
6218         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 4
6219         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 4
6220         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 4
6221
6222         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 5
6223         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 5
6224
6225         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6226         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 5
6227         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 5
6228
6229         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 5
6230
6231         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 5
6232         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 5
6233
6234         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 5
6235
6236         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 6
6237         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 6
6238         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 6
6239
6240         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 6
6241         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 6
6242         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 6
6243
6244         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 6
6245         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 6
6246         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6247
6248         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 7
6249         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 7
6250
6251         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 7
6252         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 7
6253         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 7
6254
6255         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 7
6256         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 7
6257         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 7
6258
6259         and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffff80 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
6260         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 8
6261         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 8
6262
6263         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 8
6264         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 8
6265         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 8
6266
6267         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 8
6268         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 8
6269         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 8
6270
6271         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 9
6272
6273         ld1     { $acc_lb}, [$current_tag]
6274         ext     $acc_lb, $acc_lb, $acc_lb, #8
6275         rev64   $acc_lb, $acc_lb
6276         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6277         add     $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
6278         add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
6279
6280         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 9
6281         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 9
6282
6283         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 9
6284         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 9
6285
6286         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 9
6287
6288         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 9
6289         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 9
6290
6291         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 10
6292         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 10
6293         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 10
6294
6295         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 10
6296         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 10
6297         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 10
6298
6299         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 10
6300         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 10
6301         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6302
6303         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 11
6304         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s @ CTR block 7
6305
6306         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 11
6307         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 11
6308         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 11
6309
6310         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 11
6311         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 11
6312         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 11
6313
6314         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 11
6315         ldr     $rk14q, [$cc, #224]                                     @ load rk14
6316
6317         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 1 - round 12
6318         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 4 - round 12
6319         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 5 - round 12
6320
6321         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
6322         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 3 - round 12
6323         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 2 - round 12
6324
6325         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 6 - round 12
6326         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 0 - round 12
6327         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 7 - round 12
6328
6329         aese    $ctr5b, $rk13                                           @ AES block 5 - round 13
6330         aese    $ctr1b, $rk13                                           @ AES block 1 - round 13
6331         aese    $ctr2b, $rk13                                           @ AES block 2 - round 13
6332
6333         aese    $ctr0b, $rk13                                           @ AES block 0 - round 13
6334         aese    $ctr4b, $rk13                                           @ AES block 4 - round 13
6335         aese    $ctr6b, $rk13                                           @ AES block 6 - round 13
6336
6337         aese    $ctr3b, $rk13                                           @ AES block 3 - round 13
6338         aese    $ctr7b, $rk13                                           @ AES block 7 - round 13
6339         b.ge    .L256_dec_tail                                          @ handle tail
6340
6341         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 0, 1 - load ciphertext
6342
6343         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 2, 3 - load ciphertext
6344
6345         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 4, 5 - load ciphertext
6346
6347         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 6, 7 - load ciphertext
6348         cmp     $input_ptr, $main_end_input_ptr                         @ check if we have <= 8 blocks
6349
6350         eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 1 - result
6351         eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 0 - result
6352         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 0, 1 - store result
6353
6354         rev32   $ctr0.16b, $rtmp_ctr.16b                                @ CTR block 8
6355         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8
6356         eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 3 - result
6357
6358         eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 5 - result
6359
6360         eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 4 - result
6361         rev32   $ctr1.16b, $rtmp_ctr.16b                                @ CTR block 9
6362         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 9
6363
6364         eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 2 - result
6365         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 2, 3 - store result
6366
6367         rev32   $ctr2.16b, $rtmp_ctr.16b                                @ CTR block 10
6368         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 10
6369
6370         eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 6 - result
6371
6372         rev32   $ctr3.16b, $rtmp_ctr.16b                                @ CTR block 11
6373         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 11
6374         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 4, 5 - store result
6375
6376         eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 7 - result
6377         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 6, 7 - store result
6378
6379         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 12
6380         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 12
6381         b.ge    .L256_dec_prepretail                                    @ do prepretail
6382
6383 .L256_dec_main_loop:                                                    @ main loop start
6384         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
6385         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6386         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
6387
6388         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
6389         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
6390         ext     $h7.16b, $h7.16b, $h7.16b, #8
6391         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
6392         ext     $h8.16b, $h8.16b, $h8.16b, #8
6393
6394         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
6395         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
6396         rev64   $res0b, $res0b                                          @ GHASH block 8k
6397
6398         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0
6399         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
6400         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
6401
6402         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
6403         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
6404
6405         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
6406         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
6407         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
6408
6409         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
6410         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
6411         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
6412
6413         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
6414         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
6415         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6416
6417         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1
6418         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
6419         ext     $h5.16b, $h5.16b, $h5.16b, #8
6420         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
6421         ext     $h6.16b, $h6.16b, $h6.16b, #8
6422         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
6423
6424         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
6425         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
6426         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
6427
6428         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
6429         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
6430         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
6431
6432         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
6433         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
6434         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
6435
6436         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
6437         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
6438         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
6439
6440         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
6441         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
6442         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
6443
6444         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
6445         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
6446         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
6447
6448         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6449         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
6450         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
6451
6452         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
6453         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
6454         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low
6455
6456         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
6457         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
6458         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
6459
6460         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
6461         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
6462         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
6463
6464         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
6465         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
6466         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
6467
6468         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
6469         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
6470         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
6471
6472         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
6473         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
6474         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
6475
6476         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
6477         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
6478         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
6479
6480         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
6481         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
6482         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
6483         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
6484
6485         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6486         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
6487         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
6488
6489         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
6490         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
6491         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
6492
6493         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
6494         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
6495         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
6496
6497         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
6498         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
6499         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
6500
6501         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k        - mid
6502         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
6503         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
6504
6505         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
6506         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
6507         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
6508
6509         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
6510         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
6511         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
6512
6513         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
6514         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
6515         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
6516
6517         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
6518         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
6519         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
6520
6521         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
6522         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
6523         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
6524
6525         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
6526         ext     $h3.16b, $h3.16b, $h3.16b, #8
6527         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
6528         ext     $h4.16b, $h4.16b, $h4.16b, #8
6529         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
6530         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
6531
6532         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
6533         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
6534         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6535
6536         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6537         ext     $h1.16b, $h1.16b, $h1.16b, #8
6538         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6539         ext     $h2.16b, $h2.16b, $h2.16b, #8
6540         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
6541         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
6542
6543         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
6544         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
6545         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
6546
6547         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6548         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
6549         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
6550         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
6551
6552         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
6553         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
6554         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
6555
6556         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
6557         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
6558         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
6559
6560         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
6561         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
6562         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
6563
6564         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
6565         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
6566         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
6567
6568         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
6569         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
6570         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
6571
6572         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6573         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
6574         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
6575
6576         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
6577         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
6578         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
6579
6580         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
6581         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
6582         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
6583
6584         ldp     $res0q, $res1q, [$input_ptr], #32                       @ AES block 8k+8, 8k+9 - load ciphertext
6585         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
6586         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
6587
6588         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
6589         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
6590         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
6591
6592         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
6593         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
6594         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
6595
6596         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
6597         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
6598         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
6599
6600         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
6601         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
6602         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
6603
6604         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
6605         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
6606         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
6607
6608         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
6609         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
6610         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
6611
6612         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
6613         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
6614         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
6615
6616         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
6617         rev32   $h1.16b, $rtmp_ctr.16b                                  @ CTR block 8k+16
6618         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6619
6620         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+16
6621         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
6622         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6623
6624         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
6625         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
6626
6627         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
6628         rev32   $h2.16b, $rtmp_ctr.16b                                  @ CTR block 8k+17
6629         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
6630
6631         ldp     $res2q, $res3q, [$input_ptr], #32                       @ AES block 8k+10, 8k+11 - load ciphertext
6632         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
6633         ext     $t11.16b, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
6634
6635         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
6636         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+17
6637         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
6638
6639         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
6640         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
6641         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
6642
6643         rev32   $h3.16b, $rtmp_ctr.16b                                  @ CTR block 8k+18
6644         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+18
6645         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6646
6647         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6648         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
6649         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
6650
6651         ldr     $rk14q, [$cc, #224]                                     @ load rk14
6652         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
6653         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
6654
6655         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6656         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
6657         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
6658
6659         ldp     $res4q, $res5q, [$input_ptr], #32                       @ AES block 8k+12, 8k+13 - load ciphertext
6660         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
6661         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
6662
6663         ldp     $res6q, $res7q, [$input_ptr], #32                       @ AES block 8k+14, 8k+15 - load ciphertext
6664         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
6665         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
6666
6667         rev32   $h4.16b, $rtmp_ctr.16b                                  @ CTR block 8k+19
6668         eor3    $ctr2b, $res2b, $ctr2b, $rk14                           @ AES block 8k+10 - result
6669         eor3    $ctr1b, $res1b, $ctr1b, $rk14                           @ AES block 8k+9 - result
6670
6671         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6672         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
6673
6674         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+19
6675         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6676         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
6677
6678         eor3    $ctr5b, $res5b, $ctr5b, $rk14                           @ AES block 8k+13 - result
6679         eor3    $ctr0b, $res0b, $ctr0b, $rk14                           @ AES block 8k+8 - result
6680         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13
6681
6682         stp     $ctr0q, $ctr1q, [$output_ptr], #32                      @ AES block 8k+8, 8k+9 - store result
6683         mov     $ctr0.16b, $h1.16b                                      @ CTR block 8k+16
6684         eor3    $ctr4b, $res4b, $ctr4b, $rk14                           @ AES block 8k+12 - result
6685
6686         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
6687         eor3    $ctr3b, $res3b, $ctr3b, $rk14                           @ AES block 8k+11 - result
6688         stp     $ctr2q, $ctr3q, [$output_ptr], #32                      @ AES block 8k+10, 8k+11 - store result
6689
6690         mov     $ctr3.16b, $h4.16b                                      @ CTR block 8k+19
6691         mov     $ctr2.16b, $h3.16b                                      @ CTR block 8k+18
6692         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
6693
6694         mov     $ctr1.16b, $h2.16b                                      @ CTR block 8k+17
6695         stp     $ctr4q, $ctr5q, [$output_ptr], #32                      @ AES block 8k+12, 8k+13 - store result
6696         eor3    $ctr7b, $res7b, $ctr7b, $rk14                           @ AES block 8k+15 - result
6697
6698         eor3    $ctr6b, $res6b, $ctr6b, $rk14                           @ AES block 8k+14 - result
6699         rev32   $ctr4.16b, $rtmp_ctr.16b                                @ CTR block 8k+20
6700         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+20
6701
6702         cmp     $input_ptr, $main_end_input_ptr                         @ LOOP CONTROL
6703         stp     $ctr6q, $ctr7q, [$output_ptr], #32                      @ AES block 8k+14, 8k+15 - store result
6704         b.lt    .L256_dec_main_loop
6705
6706 .L256_dec_prepretail:                                                   @ PREPRETAIL - GHASH blocks 8k..8k+7 held in res0-res7; AES rounds 0-13 on CTR blocks 8k+8..8k+15; no input loads or output stores here
6707         ldp     $rk0q, $rk1q, [$cc, #0]                                 @ load rk0, rk1
6708         rev32   $ctr5.16b, $rtmp_ctr.16b                                @ CTR block 8k+13
6709         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+13
6710
6711         rev64   $res4b, $res4b                                          @ GHASH block 8k+4
6712         ldr     $h56kq, [$current_tag, #144]                            @ load h6k | h5k
6713         ldr     $h78kq, [$current_tag, #192]                            @ load h8k | h7k
6714
6715         rev32   $ctr6.16b, $rtmp_ctr.16b                                @ CTR block 8k+14
6716         rev64   $res0b, $res0b                                          @ GHASH block 8k
6717         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+14
6718
6719         ext     $acc_lb, $acc_lb, $acc_lb, #8                           @ PRE 0 - swap the 64-bit halves of the running hash
6720         ldr     $h7q, [$current_tag, #176]                              @ load h7l | h7h
6721         ext     $h7.16b, $h7.16b, $h7.16b, #8                           @ swap 64-bit halves of h7
6722         ldr     $h8q, [$current_tag, #208]                              @ load h8l | h8h
6723         ext     $h8.16b, $h8.16b, $h8.16b, #8                           @ swap 64-bit halves of h8
6724         rev64   $res1b, $res1b                                          @ GHASH block 8k+1
6725
6726         rev32   $ctr7.16b, $rtmp_ctr.16b                                @ CTR block 8k+15
6727         rev64   $res2b, $res2b                                          @ GHASH block 8k+2
6728         ldr     $h5q, [$current_tag, #128]                              @ load h5l | h5h
6729         ext     $h5.16b, $h5.16b, $h5.16b, #8                           @ swap 64-bit halves of h5
6730         ldr     $h6q, [$current_tag, #160]                              @ load h6l | h6h
6731         ext     $h6.16b, $h6.16b, $h6.16b, #8                           @ swap 64-bit halves of h6
6732
6733         aese    $ctr0b, $rk0  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 0
6734         aese    $ctr1b, $rk0  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 0
6735         aese    $ctr4b, $rk0  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 0
6736
6737         aese    $ctr3b, $rk0  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 0
6738         aese    $ctr5b, $rk0  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 0
6739         aese    $ctr6b, $rk0  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 0
6740
6741         aese    $ctr4b, $rk1  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 1
6742         aese    $ctr7b, $rk0  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 0
6743         aese    $ctr2b, $rk0  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 0
6744
6745         ldp     $rk2q, $rk3q, [$cc, #32]                                @ load rk2, rk3
6746         aese    $ctr0b, $rk1  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 1
6747         eor     $res0b, $res0b, $acc_lb                                 @ PRE 1 - fold the running hash into GHASH block 8k
6748
6749         aese    $ctr7b, $rk1  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 1
6750         aese    $ctr6b, $rk1  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 1
6751         aese    $ctr2b, $rk1  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 1
6752
6753         aese    $ctr3b, $rk1  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 1
6754         aese    $ctr1b, $rk1  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 1
6755         aese    $ctr5b, $rk1  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 1
6756
6757         pmull2  $t0.1q, $res1.2d, $h7.2d                                @ GHASH block 8k+1 - high
6758         trn1    $acc_m.2d, $res1.2d, $res0.2d                           @ GHASH block 8k, 8k+1 - mid
6759         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH block 8k - low
6760
6761         rev64   $res3b, $res3b                                          @ GHASH block 8k+3
6762         pmull   $h7.1q, $res1.1d, $h7.1d                                @ GHASH block 8k+1 - low (product overwrites h7, as for h1-h6 and the h*k keys below)
6763
6764         aese    $ctr5b, $rk2  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 2
6765         aese    $ctr7b, $rk2  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 2
6766         aese    $ctr1b, $rk2  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 2
6767
6768         aese    $ctr3b, $rk2  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 2
6769         aese    $ctr6b, $rk2  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 2
6770         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH block 8k - high
6771
6772         aese    $ctr0b, $rk2  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 2
6773         aese    $ctr7b, $rk3  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 3
6774
6775         aese    $ctr5b, $rk3  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 3
6776         rev64   $res6b, $res6b                                          @ GHASH block 8k+6
6777
6778         aese    $ctr0b, $rk3  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 3
6779         aese    $ctr2b, $rk2  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 2
6780         aese    $ctr6b, $rk3  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 3
6781
6782         pmull2  $t1.1q, $res2.2d, $h6.2d                                @ GHASH block 8k+2 - high
6783         trn2    $res0.2d, $res1.2d, $res0.2d                            @ GHASH block 8k, 8k+1 - mid
6784         aese    $ctr4b, $rk2  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 2
6785
6786         ldp     $rk4q, $rk5q, [$cc, #64]                                @ load rk4, rk5
6787         aese    $ctr1b, $rk3  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 3
6788         pmull2  $t2.1q, $res3.2d, $h5.2d                                @ GHASH block 8k+3 - high
6789
6790         aese    $ctr2b, $rk3  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 3
6791         eor     $acc_hb, $acc_hb, $t0.16b                               @ GHASH block 8k+1 - high
6792         eor     $res0.16b, $res0.16b, $acc_m.16b                        @ GHASH block 8k, 8k+1 - mid
6793
6794         aese    $ctr4b, $rk3  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 3
6795         pmull   $h6.1q, $res2.1d, $h6.1d                                @ GHASH block 8k+2 - low
6796         aese    $ctr3b, $rk3  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 3
6797
6798         eor3    $acc_hb, $acc_hb, $t1.16b, $t2.16b                      @ GHASH block 8k+2, 8k+3 - high
6799         trn1    $t3.2d, $res3.2d, $res2.2d                              @ GHASH block 8k+2, 8k+3 - mid
6800         trn2    $res2.2d, $res3.2d, $res2.2d                            @ GHASH block 8k+2, 8k+3 - mid
6801
6802         pmull2  $acc_m.1q, $res0.2d, $h78k.2d                           @ GHASH block 8k - mid
6803         pmull   $h5.1q, $res3.1d, $h5.1d                                @ GHASH block 8k+3 - low
6804         eor     $acc_lb, $acc_lb, $h7.16b                               @ GHASH block 8k+1 - low
6805
6806         pmull   $h78k.1q, $res0.1d, $h78k.1d                            @ GHASH block 8k+1 - mid
6807         aese    $ctr5b, $rk4  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 4
6808         aese    $ctr0b, $rk4  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 4
6809
6810         eor3    $acc_lb, $acc_lb, $h6.16b, $h5.16b                      @ GHASH block 8k+2, 8k+3 - low
6811         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
6812         ext     $h1.16b, $h1.16b, $h1.16b, #8                           @ swap 64-bit halves of h1
6813         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
6814         ext     $h2.16b, $h2.16b, $h2.16b, #8                           @ swap 64-bit halves of h2
6815         aese    $ctr7b, $rk4  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 4
6816
6817         aese    $ctr2b, $rk4  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 4
6818         aese    $ctr6b, $rk4  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 4
6819         eor     $acc_mb, $acc_mb, $h78k.16b                             @ GHASH block 8k+1 - mid
6820
6821         eor     $res2.16b, $res2.16b, $t3.16b                           @ GHASH block 8k+2, 8k+3 - mid
6822         aese    $ctr7b, $rk5  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 5
6823         aese    $ctr1b, $rk4  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 4
6824
6825         aese    $ctr2b, $rk5  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 5
6826         aese    $ctr3b, $rk4  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 4
6827         aese    $ctr4b, $rk4  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 4
6828
6829         aese    $ctr1b, $rk5  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 5
6830         pmull2  $t3.1q, $res2.2d, $h56k.2d                              @ GHASH block 8k+2 - mid
6831         aese    $ctr6b, $rk5  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 5
6832
6833         aese    $ctr4b, $rk5  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 5
6834         aese    $ctr3b, $rk5  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 5
6835         pmull   $h56k.1q, $res2.1d, $h56k.1d                            @ GHASH block 8k+3 - mid
6836
6837         aese    $ctr0b, $rk5  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 5
6838         aese    $ctr5b, $rk5  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 5
6839         ldp     $rk6q, $rk7q, [$cc, #96]                                @ load rk6, rk7
6840
6841         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
6842         ext     $h3.16b, $h3.16b, $h3.16b, #8                           @ swap 64-bit halves of h3
6843         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
6844         ext     $h4.16b, $h4.16b, $h4.16b, #8                           @ swap 64-bit halves of h4
6845         rev64   $res7b, $res7b                                          @ GHASH block 8k+7
6846         rev64   $res5b, $res5b                                          @ GHASH block 8k+5
6847
6848         eor3    $acc_mb, $acc_mb, $h56k.16b, $t3.16b                    @ GHASH block 8k+2, 8k+3 - mid
6849
6850         trn1    $t6.2d, $res5.2d, $res4.2d                              @ GHASH block 8k+4, 8k+5 - mid
6851
6852         aese    $ctr0b, $rk6  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 6
6853         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
6854         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
6855         aese    $ctr6b, $rk6  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 6
6856
6857         aese    $ctr5b, $rk6  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 6
6858         aese    $ctr7b, $rk6  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 6
6859
6860         pmull2  $t4.1q, $res4.2d, $h4.2d                                @ GHASH block 8k+4 - high
6861         pmull2  $t5.1q, $res5.2d, $h3.2d                                @ GHASH block 8k+5 - high
6862         pmull   $h4.1q, $res4.1d, $h4.1d                                @ GHASH block 8k+4 - low
6863
6864         trn2    $res4.2d, $res5.2d, $res4.2d                            @ GHASH block 8k+4, 8k+5 - mid
6865         pmull   $h3.1q, $res5.1d, $h3.1d                                @ GHASH block 8k+5 - low
6866         trn1    $t9.2d, $res7.2d, $res6.2d                              @ GHASH block 8k+6, 8k+7 - mid
6867
6868         aese    $ctr7b, $rk7  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 7
6869         pmull2  $t7.1q, $res6.2d, $h2.2d                                @ GHASH block 8k+6 - high
6870         aese    $ctr1b, $rk6  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 6
6871
6872         aese    $ctr2b, $rk6  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 6
6873         aese    $ctr3b, $rk6  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 6
6874         aese    $ctr4b, $rk6  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 6
6875
6876         ldp     $rk8q, $rk9q, [$cc, #128]                               @ load rk8, rk9
6877         pmull   $h2.1q, $res6.1d, $h2.1d                                @ GHASH block 8k+6 - low
6878         aese    $ctr5b, $rk7  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 7
6879
6880         aese    $ctr1b, $rk7  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 7
6881         aese    $ctr4b, $rk7  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 7
6882
6883         aese    $ctr6b, $rk7  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 7
6884         aese    $ctr2b, $rk7  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 7
6885         eor3    $acc_hb, $acc_hb, $t4.16b, $t5.16b                      @ GHASH block 8k+4, 8k+5 - high
6886
6887         aese    $ctr0b, $rk7  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 7
6888         trn2    $res6.2d, $res7.2d, $res6.2d                            @ GHASH block 8k+6, 8k+7 - mid
6889         aese    $ctr3b, $rk7  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 7
6890
6891         aese    $ctr0b, $rk8  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 8
6892         aese    $ctr7b, $rk8  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 8
6893         aese    $ctr4b, $rk8  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 8
6894
6895         aese    $ctr1b, $rk8  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 8
6896         aese    $ctr5b, $rk8  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 8
6897         aese    $ctr6b, $rk8  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 8
6898
6899         aese    $ctr3b, $rk8  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 8
6900         aese    $ctr4b, $rk9  \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 9
6901         eor     $res4.16b, $res4.16b, $t6.16b                           @ GHASH block 8k+4, 8k+5 - mid
6902
6903         aese    $ctr0b, $rk9  \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 9
6904         aese    $ctr1b, $rk9  \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 9
6905         eor     $res6.16b, $res6.16b, $t9.16b                           @ GHASH block 8k+6, 8k+7 - mid
6906
6907         aese    $ctr6b, $rk9  \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 9
6908         aese    $ctr7b, $rk9  \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 9
6909         pmull2  $t6.1q, $res4.2d, $h34k.2d                              @ GHASH block 8k+4 - mid
6910
6911         aese    $ctr2b, $rk8  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 8
6912         pmull   $h34k.1q, $res4.1d, $h34k.1d                            @ GHASH block 8k+5 - mid
6913         pmull2  $t8.1q, $res7.2d, $h1.2d                                @ GHASH block 8k+7 - high
6914
6915         pmull2  $t9.1q, $res6.2d, $h12k.2d                              @ GHASH block 8k+6 - mid
6916         pmull   $h12k.1q, $res6.1d, $h12k.1d                            @ GHASH block 8k+7 - mid
6917         pmull   $h1.1q, $res7.1d, $h1.1d                                @ GHASH block 8k+7 - low
6918
6919         ldp     $rk10q, $rk11q, [$cc, #160]                             @ load rk10, rk11
6920         eor3    $acc_lb, $acc_lb, $h4.16b, $h3.16b                      @ GHASH block 8k+4, 8k+5 - low
6921         eor3    $acc_mb, $acc_mb, $h34k.16b, $t6.16b                    @ GHASH block 8k+4, 8k+5 - mid
6922
6923         aese    $ctr2b, $rk9  \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 9
6924         aese    $ctr3b, $rk9  \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 9
6925         aese    $ctr5b, $rk9  \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 9
6926
6927         eor3    $acc_hb, $acc_hb, $t7.16b, $t8.16b                      @ GHASH block 8k+6, 8k+7 - high
6928         eor3    $acc_lb, $acc_lb, $h2.16b, $h1.16b                      @ GHASH block 8k+6, 8k+7 - low
6929         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
6930
6931         eor3    $acc_mb, $acc_mb, $h12k.16b, $t9.16b                    @ GHASH block 8k+6, 8k+7 - mid
6932
6933         aese    $ctr4b, $rk10 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 10
6934         aese    $ctr6b, $rk10 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 10
6935         aese    $ctr5b, $rk10 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 10
6936
6937         aese    $ctr0b, $rk10 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 10
6938         aese    $ctr2b, $rk10 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 10
6939         aese    $ctr3b, $rk10 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 10
6940
6941         eor3    $acc_mb, $acc_mb, $acc_hb, $acc_lb                      @ MODULO - karatsuba tidy up
6942
6943         aese    $ctr7b, $rk10 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 10
6944         aese    $ctr1b, $rk10 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 10
6945         ldp     $rk12q, $rk13q, [$cc, #192]                             @ load rk12, rk13
6946
6947         ext     $t11.16b, $acc_hb, $acc_hb, #8                          @ MODULO - other top alignment
6948
6949         aese    $ctr2b, $rk11 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 11
6950         aese    $ctr1b, $rk11 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 11
6951         aese    $ctr0b, $rk11 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 11
6952
6953         pmull   $t12.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
6954         aese    $ctr3b, $rk11 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 11
6955
6956         aese    $ctr7b, $rk11 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 11
6957         aese    $ctr6b, $rk11 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 11
6958         aese    $ctr4b, $rk11 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 11
6959
6960         aese    $ctr5b, $rk11 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 11
6961         aese    $ctr3b, $rk12 \n  aesmc $ctr3b, $ctr3b                  @ AES block 8k+11 - round 12
6962
6963         eor3    $acc_mb, $acc_mb, $t12.16b, $t11.16b                    @ MODULO - fold into mid
6964
6965         aese    $ctr3b, $rk13                                           @ AES block 8k+11 - round 13 (last aese: no aesmc follows, here or for the other blocks)
6966         aese    $ctr2b, $rk12 \n  aesmc $ctr2b, $ctr2b                  @ AES block 8k+10 - round 12
6967         aese    $ctr6b, $rk12 \n  aesmc $ctr6b, $ctr6b                  @ AES block 8k+14 - round 12
6968
6969         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
6970         aese    $ctr4b, $rk12 \n  aesmc $ctr4b, $ctr4b                  @ AES block 8k+12 - round 12
6971         aese    $ctr7b, $rk12 \n  aesmc $ctr7b, $ctr7b                  @ AES block 8k+15 - round 12
6972
6973         aese    $ctr0b, $rk12 \n  aesmc $ctr0b, $ctr0b                  @ AES block 8k+8 - round 12
6974         ldr     $rk14q, [$cc, #224]                                     @ load rk14 (not consumed in this section; the tail XORs it into the result)
6975         aese    $ctr1b, $rk12 \n  aesmc $ctr1b, $ctr1b                  @ AES block 8k+9 - round 12
6976
6977         aese    $ctr4b, $rk13                                           @ AES block 8k+12 - round 13
6978         ext     $t11.16b, $acc_mb, $acc_mb, #8                          @ MODULO - other mid alignment
6979         aese    $ctr5b, $rk12 \n  aesmc $ctr5b, $ctr5b                  @ AES block 8k+13 - round 12
6980
6981         aese    $ctr6b, $rk13                                           @ AES block 8k+14 - round 13
6982         aese    $ctr2b, $rk13                                           @ AES block 8k+10 - round 13
6983         aese    $ctr1b, $rk13                                           @ AES block 8k+9 - round 13
6984
6985         aese    $ctr5b, $rk13                                           @ AES block 8k+13 - round 13
6986         eor3    $acc_lb, $acc_lb, $t11.16b, $acc_hb                     @ MODULO - fold into low
6987         add     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s                @ CTR block 8k+15
6988
6989         aese    $ctr7b, $rk13                                           @ AES block 8k+15 - round 13
6990         aese    $ctr0b, $rk13                                           @ AES block 8k+8 - round 13
6991 .L256_dec_tail:                                                         @ TAIL
6992
6993         ext     $t0.16b, $acc_lb, $acc_lb, #8                           @ prepare final partial tag
6994         sub     $main_end_input_ptr, $end_input_ptr, $input_ptr         @ main_end_input_ptr is number of bytes left to process
6995         cmp     $main_end_input_ptr, #112
6996
6997         ldr     $res1q, [$input_ptr], #16                               @ AES block 8k+8 - load ciphertext
6998
6999         ldp     $h78kq, $h8q, [$current_tag, #192]                      @ load h8k | h7k
7000         ext     $h8.16b, $h8.16b, $h8.16b, #8
7001         mov     $t1.16b, $rk14
7002
7003         ldp     $h5q, $h56kq, [$current_tag, #128]                      @ load h5l | h5h
7004         ext     $h5.16b, $h5.16b, $h5.16b, #8
7005
7006         eor3    $res4b, $res1b, $ctr0b, $t1.16b                         @ AES block 8k+8 - result
7007         ldp     $h6q, $h7q, [$current_tag, #160]                        @ load h6l | h6h
7008         ext     $h6.16b, $h6.16b, $h6.16b, #8
7009         ext     $h7.16b, $h7.16b, $h7.16b, #8
7010         b.gt    .L256_dec_blocks_more_than_7
7011
7012         mov     $ctr7b, $ctr6b
7013         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7014         mov     $ctr6b, $ctr5b
7015
7016         mov     $ctr5b, $ctr4b
7017         mov     $ctr4b, $ctr3b
7018         movi    $acc_l.8b, #0
7019
7020         movi    $acc_h.8b, #0
7021         movi    $acc_m.8b, #0
7022         mov     $ctr3b, $ctr2b
7023
7024         cmp     $main_end_input_ptr, #96
7025         mov     $ctr2b, $ctr1b
7026         b.gt    .L256_dec_blocks_more_than_6
7027
7028         mov     $ctr7b, $ctr6b
7029         mov     $ctr6b, $ctr5b
7030
7031         mov     $ctr5b, $ctr4b
7032         cmp     $main_end_input_ptr, #80
7033         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7034
7035         mov     $ctr4b, $ctr3b
7036         mov     $ctr3b, $ctr1b
7037         b.gt    .L256_dec_blocks_more_than_5
7038
7039         cmp     $main_end_input_ptr, #64
7040         mov     $ctr7b, $ctr6b
7041         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7042
7043         mov     $ctr6b, $ctr5b
7044
7045         mov     $ctr5b, $ctr4b
7046         mov     $ctr4b, $ctr1b
7047         b.gt    .L256_dec_blocks_more_than_4
7048
7049         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7050         mov     $ctr7b, $ctr6b
7051         cmp     $main_end_input_ptr, #48
7052
7053         mov     $ctr6b, $ctr5b
7054         mov     $ctr5b, $ctr1b
7055         b.gt    .L256_dec_blocks_more_than_3
7056
7057         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
7058         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7059         mov     $ctr7b, $ctr6b
7060
7061         cmp     $main_end_input_ptr, #32
7062         mov     $ctr6b, $ctr1b
7063         b.gt    .L256_dec_blocks_more_than_2
7064
7065         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7066
7067         mov     $ctr7b, $ctr1b
7068         cmp     $main_end_input_ptr, #16
7069         b.gt    .L256_dec_blocks_more_than_1
7070
7071         sub     $rtmp_ctr.4s, $rtmp_ctr.4s, $rctr_inc.4s
7072         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
7073         b        .L256_dec_blocks_less_than_1
7074 .L256_dec_blocks_more_than_7:                                           @ blocks left >  7
7075         rev64   $res0b, $res1b                                          @ GHASH final-7 block
7076         ldr     $res1q, [$input_ptr], #16                               @ AES final-6 block - load ciphertext
7077         st1     { $res4b}, [$output_ptr], #16                           @ AES final-7 block  - store result
7078
7079         ins     $acc_m.d[0], $h78k.d[1]                                 @ GHASH final-7 block - mid
7080
7081         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7082
7083         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-7 block - mid
7084         eor3    $res4b, $res1b, $ctr1b, $t1.16b                         @ AES final-6 block - result
7085
7086         pmull2  $acc_h.1q, $res0.2d, $h8.2d                             @ GHASH final-7 block - high
7087
7088         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-7 block - mid
7089         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7090
7091         pmull   $acc_l.1q, $res0.1d, $h8.1d                             @ GHASH final-7 block - low
7092         pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                          @ GHASH final-7 block - mid
7093 .L256_dec_blocks_more_than_6:                                           @ blocks left >  6
7094
7095         rev64   $res0b, $res1b                                          @ GHASH final-6 block
7096
7097         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7098         ldr     $res1q, [$input_ptr], #16                               @ AES final-5 block - load ciphertext
7099         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7100
7101         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-6 block - mid
7102         st1     { $res4b}, [$output_ptr], #16                           @ AES final-6 block - store result
7103         pmull2  $rk2q1, $res0.2d, $h7.2d                                @ GHASH final-6 block - high
7104
7105         pmull   $rk3q1, $res0.1d, $h7.1d                                @ GHASH final-6 block - low
7106
7107         eor3    $res4b, $res1b, $ctr2b, $t1.16b                         @ AES final-5 block - result
7108         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-6 block - low
7109         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-6 block - mid
7110
7111         pmull   $rk4v.1q, $rk4v.1d, $h78k.1d                            @ GHASH final-6 block - mid
7112
7113         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-6 block - mid
7114         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-6 block - high
7115 .L256_dec_blocks_more_than_5:                                           @ blocks left >  5
7116
7117         rev64   $res0b, $res1b                                          @ GHASH final-5 block
7118
7119         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7120
7121         pmull2  $rk2q1, $res0.2d, $h6.2d                                @ GHASH final-5 block - high
7122         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-5 block - mid
7123
7124         ldr     $res1q, [$input_ptr], #16                               @ AES final-4 block - load ciphertext
7125
7126         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-5 block - mid
7127         st1     { $res4b}, [$output_ptr], #16                           @ AES final-5 block - store result
7128
7129         pmull   $rk3q1, $res0.1d, $h6.1d                                @ GHASH final-5 block - low
7130         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-5 block - mid
7131
7132         pmull2  $rk4v.1q, $rk4v.2d, $h56k.2d                            @ GHASH final-5 block - mid
7133
7134         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-5 block - high
7135         eor3    $res4b, $res1b, $ctr3b, $t1.16b                         @ AES final-4 block - result
7136         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-5 block - low
7137
7138         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-5 block - mid
7139         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7140 .L256_dec_blocks_more_than_4:                                           @ blocks left >  4
7141
7142         rev64   $res0b, $res1b                                          @ GHASH final-4 block
7143
7144         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7145
7146         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-4 block - mid
7147         ldr     $res1q, [$input_ptr], #16                               @ AES final-3 block - load ciphertext
7148
7149         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7150
7151         pmull   $rk3q1, $res0.1d, $h5.1d                                @ GHASH final-4 block - low
7152         pmull2  $rk2q1, $res0.2d, $h5.2d                                @ GHASH final-4 block - high
7153
7154         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-4 block - mid
7155
7156         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-4 block - high
7157
7158         pmull   $rk4v.1q, $rk4v.1d, $h56k.1d                            @ GHASH final-4 block - mid
7159
7160         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-4 block - low
7161         st1     { $res4b}, [$output_ptr], #16                           @ AES final-4 block - store result
7162
7163         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-4 block - mid
7164         eor3    $res4b, $res1b, $ctr4b, $t1.16b                         @ AES final-3 block - result
7165 .L256_dec_blocks_more_than_3:                                           @ blocks left >  3
7166
7167         ldr     $h4q, [$current_tag, #112]                              @ load h4l | h4h
7168         ext     $h4.16b, $h4.16b, $h4.16b, #8
7169         rev64   $res0b, $res1b                                          @ GHASH final-3 block
7170
7171         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7172         ldr     $res1q, [$input_ptr], #16                               @ AES final-2 block - load ciphertext
7173         ldr     $h34kq, [$current_tag, #96]                             @ load h4k | h3k
7174
7175         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-3 block - mid
7176         st1     { $res4b}, [$output_ptr], #16                           @ AES final-3 block - store result
7177
7178         eor3    $res4b, $res1b, $ctr5b, $t1.16b                         @ AES final-2 block - result
7179
7180         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-3 block - mid
7181
7182         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-3 block - mid
7183         pmull   $rk3q1, $res0.1d, $h4.1d                                @ GHASH final-3 block - low
7184         pmull2  $rk2q1, $res0.2d, $h4.2d                                @ GHASH final-3 block - high
7185
7186         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7187         pmull2  $rk4v.1q, $rk4v.2d, $h34k.2d                            @ GHASH final-3 block - mid
7188         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-3 block - low
7189
7190         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-3 block - high
7191
7192         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-3 block - mid
7193 .L256_dec_blocks_more_than_2:                                           @ blocks left >  2
7194
7195         rev64   $res0b, $res1b                                          @ GHASH final-2 block
7196
7197         ldr     $h3q, [$current_tag, #80]                               @ load h3l | h3h
7198         ext     $h3.16b, $h3.16b, $h3.16b, #8
7199         ldr     $res1q, [$input_ptr], #16                               @ AES final-1 block - load ciphertext
7200
7201         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7202
7203         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-2 block - mid
7204
7205         pmull   $rk3q1, $res0.1d, $h3.1d                                @ GHASH final-2 block - low
7206         st1     { $res4b}, [$output_ptr], #16                           @ AES final-2 block - store result
7207         eor3    $res4b, $res1b, $ctr6b, $t1.16b                         @ AES final-1 block - result
7208
7209         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-2 block - mid
7210         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-2 block - low
7211         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7212
7213         pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                            @ GHASH final-2 block - mid
7214         pmull2  $rk2q1, $res0.2d, $h3.2d                                @ GHASH final-2 block - high
7215
7216         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-2 block - mid
7217         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-2 block - high
7218 .L256_dec_blocks_more_than_1:                                           @ blocks left >  1
7219
7220         rev64   $res0b, $res1b                                          @ GHASH final-1 block
7221
7222         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7223
7224         ins     $rk4v.d[0], $res0.d[1]                                  @ GHASH final-1 block - mid
7225         ldr     $h2q, [$current_tag, #64]                               @ load h2l | h2h
7226         ext     $h2.16b, $h2.16b, $h2.16b, #8
7227
7228         eor     $rk4v.8b, $rk4v.8b, $res0.8b                            @ GHASH final-1 block - mid
7229         ldr     $res1q, [$input_ptr], #16                               @ AES final block - load ciphertext
7230         st1     { $res4b}, [$output_ptr], #16                           @ AES final-1 block - store result
7231
7232         ldr     $h12kq, [$current_tag, #48]                             @ load h2k | h1k
7233         pmull   $rk3q1, $res0.1d, $h2.1d                                @ GHASH final-1 block - low
7234
7235         ins     $rk4v.d[1], $rk4v.d[0]                                  @ GHASH final-1 block - mid
7236
7237         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final-1 block - low
7238
7239         eor3    $res4b, $res1b, $ctr7b, $t1.16b                         @ AES final block - result
7240         pmull2  $rk2q1, $res0.2d, $h2.2d                                @ GHASH final-1 block - high
7241
7242         pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                            @ GHASH final-1 block - mid
7243
7244         movi    $t0.8b, #0                                              @ supress further partial tag feed in
7245         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final-1 block - high
7246
7247         eor     $acc_mb, $acc_mb, $rk4v.16b                             @ GHASH final-1 block - mid
7248 .L256_dec_blocks_less_than_1:                                           @ blocks left <= 1
7249
7250         ld1     { $rk0}, [$output_ptr]                                  @ load existing bytes where the possibly partial last block is to be stored
7251         mvn     $temp0_x, xzr                                           @ temp0_x = 0xffffffffffffffff
7252         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
7253
7254         sub     $bit_length, $bit_length, #128                          @ bit_length -= 128
7255         rev32   $rtmp_ctr.16b, $rtmp_ctr.16b
7256         str     $rtmp_ctrq, [$counter]                                  @ store the updated counter
7257
7258         neg     $bit_length, $bit_length                                @ bit_length = 128 - #bits in input (in range [1,128])
7259
7260         and     $bit_length, $bit_length, #127                          @ bit_length %= 128
7261
7262         lsr     $temp0_x, $temp0_x, $bit_length                         @ temp0_x is mask for top 64b of last block
7263         cmp     $bit_length, #64
7264         mvn     $temp1_x, xzr                                           @ temp1_x = 0xffffffffffffffff
7265
7266         csel    $temp3_x, $temp0_x, xzr, lt
7267         csel    $temp2_x, $temp1_x, $temp0_x, lt
7268
7269         mov     $ctr0.d[0], $temp2_x                                    @ ctr0b is mask for last block
7270         mov     $ctr0.d[1], $temp3_x
7271
7272         and     $res1b, $res1b, $ctr0b                                  @ possibly partial last block has zeroes in highest bits
7273         ldr     $h1q, [$current_tag, #32]                               @ load h1l | h1h
7274         ext     $h1.16b, $h1.16b, $h1.16b, #8
7275         bif     $res4b, $rk0, $ctr0b                                    @ insert existing bytes in top end of result before storing
7276
7277         rev64   $res0b, $res1b                                          @ GHASH final block
7278
7279         eor     $res0b, $res0b, $t0.16b                                 @ feed in partial tag
7280
7281         ins     $t0.d[0], $res0.d[1]                                    @ GHASH final block - mid
7282         pmull2  $rk2q1, $res0.2d, $h1.2d                                @ GHASH final block - high
7283
7284         eor     $t0.8b, $t0.8b, $res0.8b                                @ GHASH final block - mid
7285
7286         pmull   $rk3q1, $res0.1d, $h1.1d                                @ GHASH final block - low
7287         eor     $acc_hb, $acc_hb, $rk2                                  @ GHASH final block - high
7288
7289         pmull   $t0.1q, $t0.1d, $h12k.1d                                @ GHASH final block - mid
7290
7291         eor     $acc_mb, $acc_mb, $t0.16b                               @ GHASH final block - mid
7292         ldr     $mod_constantd, [$modulo_constant]                      @ MODULO - load modulo constant
7293         eor     $acc_lb, $acc_lb, $rk3                                  @ GHASH final block - low
7294
7295         pmull   $t11.1q, $acc_h.1d, $mod_constant.1d                    @ MODULO - top 64b align with mid
7296         eor     $t10.16b, $acc_hb, $acc_lb                              @ MODULO - karatsuba tidy up
7297
7298         ext     $acc_hb, $acc_hb, $acc_hb, #8                           @ MODULO - other top alignment
7299         st1     { $res4b}, [$output_ptr]                                @ store all 16B
7300
7301         eor     $acc_mb, $acc_mb, $t10.16b                              @ MODULO - karatsuba tidy up
7302
7303         eor     $t11.16b, $acc_hb, $t11.16b                             @ MODULO - fold into mid
7304         eor     $acc_mb, $acc_mb, $t11.16b                              @ MODULO - fold into mid
7305
7306         pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d                  @ MODULO - mid 64b align with low
7307
7308         ext     $acc_mb, $acc_mb, $acc_mb, #8                           @ MODULO - other mid alignment
7309         eor     $acc_lb, $acc_lb, $acc_hb                               @ MODULO - fold into low
7310
7311         eor     $acc_lb, $acc_lb, $acc_mb                               @ MODULO - fold into low
7312         ext     $acc_lb, $acc_lb, $acc_lb, #8
7313         rev64   $acc_lb, $acc_lb
7314         st1     { $acc_l.16b }, [$current_tag]
7315         lsr     x0, $bit_length, #3                                     @ return sizes
7316
7317         ldp     d10, d11, [sp, #16]
7318         ldp     d12, d13, [sp, #32]
7319         ldp     d14, d15, [sp, #48]
7320         ldp     d8, d9, [sp], #80
7321         ret
7322
7323 .L256_dec_ret:
7324         mov w0, #0x0
7325         ret
7326 .size unroll8_eor3_aes_gcm_dec_256_kernel,.-unroll8_eor3_aes_gcm_dec_256_kernel
7327 ___
7328 }
7329 }
7330
# Append the module trailer to the generated assembly text in $code: an
# identification string (.asciz), an alignment directive, and an #endif that
# presumably closes a preprocessor conditional emitted earlier in the output
# (the matching #if is not visible in this part of the file -- confirm).
# The '@' of the e-mail address is backslash-escaped so that the interpolating
# heredoc does not treat it as the start of an array variable.
7331 $code.=<<___;
7332 .asciz  "AES GCM module for ARMv8, SPDX BSD-3-Clause by <xiaokang.qian\@arm.com>"
7333 .align  2
7334 #endif
7335 ___
7336
# Post-processing pass: rewrite the perlasm text accumulated in $code line by
# line and print the result.  The opcode table and the helper subs live in
# this bare block so that the lexical %opcode is private to them.
{
    # Base encodings of the SHA3-extension instructions that unsha3() emits as
    # raw words; the register / immediate fields are OR-ed in by unsha3().
    my %opcode = (
        "rax1" => 0xce608c00,
        "eor3" => 0xce000000,
        "bcax" => 0xce200000,
        "xar"  => 0xce800000,
    );

    # unsha3(MNEMONIC, OPERANDS)
    #
    # Hand-encode one instruction from %opcode as a ".inst" directive.
    #
    #   MNEMONIC  key into %opcode ("eor3", "rax1", "bcax" or "xar").
    #   OPERANDS  operand text following the mnemonic: two to four
    #             comma-separated q/v registers, where the fourth operand may
    #             instead be a '#' immediate (digits and '-' only, e.g. "#64-8").
    #
    # Field placement: operand 1 -> bits 0+, operand 2 -> bits 5+,
    # operand 3 -> bits 16+, operand 4 -> bits 10+.  Missing optional operands
    # contribute 0.
    #
    # Returns the ".inst" line with the original instruction kept as a trailing
    # comment, or the empty string (false) when OPERANDS cannot be parsed.
    sub unsha3 {
        my ($mnemonic, $arg) = @_;

        my ($rd, $rn, $rm, $ra) = $arg =~
            m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
            or return "";

        $rm = 0 unless defined $rm;

        # The fourth operand may be a small arithmetic expression such as
        # "64-8".  The character class in the pattern above restricts it to
        # digits and '-', so the string eval can only perform integer
        # subtraction.  A malformed expression evaluates to undef and is
        # treated as 0.
        $ra = defined $ra ? eval($ra) : 0;
        $ra = 0 unless defined $ra;

        return sprintf ".inst\t0x%08x\t//%s %s",
            $opcode{$mnemonic} | $rd | ($rn << 5) | ($rm << 16) | ($ra << 10),
            $mnemonic, $arg;
    }

    # unvmov(OPERANDS)
    #
    # Translate a "qN#lo|hi, qM#lo|hi" operand pair into an
    # "ins vX.d[i],vY.d[j]" instruction.  "lo" selects lane 0 and "hi" lane 1;
    # register numbers below 8 are kept, numbers from 8 upwards are shifted up
    # by 8 (q8 -> v16, ...).
    #
    # Returns the instruction text, or the empty string (false) when OPERANDS
    # does not have the expected form.
    #
    # NOTE(review): not called by the loop below; kept because it is a
    # package-level sub that code outside this scope may reference.
    sub unvmov {
        my $arg = shift;

        my ($dst, $dst_half, $src, $src_half) =
            $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/
            or return "";

        return sprintf "ins    v%d.d[%d],v%d.d[%d]",
            ($dst < 8 ? $dst : $dst + 8), ($dst_half eq "lo" ? 0 : 1),
            ($src < 8 ? $src : $src + 8), ($src_half eq "lo" ? 0 : 1);
    }

    # $_ is deliberately kept as the loop variable: the back-tick expansion
    # below string-evals text taken from $code, and this keeps the environment
    # that text sees unchanged.
    foreach (split("\n", $code)) {
        s/@\s/\/\//;                    # old->new style commentary
        s/\`([^\`]*)\`/eval($1)/ge;     # expand `expr` at generation time

        # On ld1r lines rewrite the ".16b" arrangement to ".2d".  Every other
        # line (and an ld1r line on which nothing was rewritten) is checked
        # for SHA3-extension mnemonics, which are replaced by ".inst" words.
        unless (m/\bld1r\b/ and s/\.16b/.2d/g) {
            s{\b((eor3|rax1|xar|bcax)\s+(v.*))}{
                # Copy the captures before calling unsha3(), which runs its
                # own match.  If the operands cannot be encoded, keep the
                # instruction text as written instead of replacing it with an
                # empty string, so the assembler still sees (and can reject)
                # it rather than the instruction silently disappearing.
                my ($insn, $mnemonic, $operands) = ($1, $2, $3);
                unsha3($mnemonic, $operands) || $insn;
            }ge;
        }
        print $_, "\n";
    }
}
7368
# Close STDOUT explicitly and die if that fails, so an error while flushing
# the generated assembly is reported instead of going unnoticed.
7369 close STDOUT or die "error closing STDOUT: $!"; # enforce flush