2 # Copyright 2019-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 #========================================================================
11 # Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
12 # derived from https://github.com/ARM-software/AArch64cryptolib, original
13 # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14 # licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15 # obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16 #========================================================================
18 # Approach - assume we don't want to reload constants, so reserve ~half of the vector register file for constants
20 # The main loop acts on 4 16B blocks per iteration, and then does the modulo reduction of the accumulated intermediate hashes from the 4 blocks
22 # ____________________________________________________
25 # |____________________________________________________|
27 # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28 # |________________|________________|__________________|
30 # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31 # |________________|________________|__________________|
33 # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34 # |________________|________________|__________________|
36 # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37 # |________________|____(mostly)____|__________________|
40 # |____________________________________________________|
43 # Ensure previous generated intermediate hash is aligned and merged with result for GHASH 4k+0
44 # EXT low_acc, low_acc, low_acc, #8
45 # EOR res_curr (4k+0), res_curr (4k+0), low_acc
48 # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49 # REV ctr32, rev_ctr32
50 # ORR ctr64, constctr96_top32, ctr32, LSL #32
51 # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
52 # INS ctr_next.d[1], ctr64X
56 # Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key is taken as the example below.
57 # A small trick is used here: the input is loaded into scalar registers, EORed with the last round key and then transferred to the SIMD registers.
58 # Given that we are very constrained in our ASIMD registers, this is quite important.
61 # LDR input_low, [ input_ptr ], #8
62 # LDR input_high, [ input_ptr ], #8
63 # EOR input_low, k14_low
64 # EOR input_high, k14_high
65 # INS res_curr.d[0], input_low
66 # INS res_curr.d[1], input_high
67 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
68 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
69 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
70 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
71 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
72 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
73 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
74 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
75 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
76 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
77 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
78 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
79 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
81 # EOR res_curr, res_curr, ctr_curr
82 # ST1 { res_curr.16b }, [ output_ptr ], #16
85 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
86 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
87 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
88 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
89 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
90 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
91 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
92 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
93 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
94 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
95 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
96 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
97 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
99 # LDR res_curr, [ input_ptr ], #16
100 # EOR res_curr, res_curr, ctr_curr
101 # MOV output_low, res_curr.d[0]
102 # MOV output_high, res_curr.d[1]
103 # EOR output_low, k14_low
104 # EOR output_high, k14_high
105 # STP output_low, output_high, [ output_ptr ], #16
108 # do 128b karatsuba polynomial multiplication on block
109 # We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
112 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
114 # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
117 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118 # multiplying with "twisted" powers of H
120 # Note: We can PMULL directly into the acc_x in first GHASH of the loop
121 # Note: For scheduling on big cores we want to split the processing so that it happens over two loop iterations - otherwise the critical
122 # path latency dominates the performance
124 # This has a knock-on effect on register pressure, so we have to be a bit more clever with our temporary registers
125 # than indicated here
126 # REV64 res_curr, res_curr
127 # INS t_m.d[0], res_curr.d[1]
128 # EOR t_m.8B, t_m.8B, res_curr.8B
129 # PMULL2 t_h, res_curr, HX
130 # PMULL t_l, res_curr, HX
131 # PMULL t_m, t_m, HX_k
132 # EOR acc_h, acc_h, t_h
133 # EOR acc_l, acc_l, t_l
134 # EOR acc_m, acc_m, t_m
136 # MODULO: take the partial accumulators (~representing the sum of the 256b multiplication results) from GHASH and do a modulo reduction on them
137 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138 # with a reversed constant
139 # EOR acc_m, acc_m, acc_h
140 # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
141 # PMULL t_mod, acc_h, mod_constant
142 # EXT acc_h, acc_h, acc_h, #8
143 # EOR acc_m, acc_m, acc_h
144 # EOR acc_m, acc_m, t_mod
145 # PMULL acc_h, acc_m, mod_constant
146 # EXT acc_m, acc_m, acc_m, #8
147 # EOR acc_l, acc_l, acc_h
148 # EOR acc_l, acc_l, acc_m
150 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; # last argument is taken as the output file when it ends in ".<ext>"
151 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; # first argument is taken as the flavour when it contains no "."
153 $dir = $0 =~ m/(.*[\/\\])[^\/\\]+$/ ? $1 : ""; # directory part of this script's path; empty (never a stale $1) when $0 has no path separator
154 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or # look for the translator next to this script first...
155 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or # ...then two levels up, under perlasm/
156 die "can't locate arm-xlate.pl";
158 open OUT,"| \"$^X\" \"$xlate\" $flavour $output" or die "can't call $xlate: $!"; # pipe the generated code through arm-xlate.pl; its path is quoted because it may contain spaces, and a failed pipe open is now fatal instead of silent
161 $input_ptr="x0"; #argument block: x0 is used below as the base address of the input loads
169 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); # scalar registers x4-x7
170 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); # x19-x24: low/high 64-bit halves of input blocks 1-3
171 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); # aliases the same x19-x24 as the input_l1..input_h3 names above
172 my ($output_l0,$output_h0)=map("x$_",(6..7)); # x6-x7, aliasing input_l0/input_h0
175 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15)); # x9-x15: counter pieces, the two 64-bit halves of round key 10, and the byte length
176 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); # w11-w12: 32-bit views of ctr96_t32x (x11) and rctr32x (x12)
178 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); # v0-v7 as 16-byte vectors: ctr0-3 in v0-v3, res0-3 in v4-v7
179 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); # same v0-v7, bare names so an arrangement suffix (.2d, .d[1], ...) can be appended
180 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); # same registers as 64-bit scalars d0-d7
181 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); # res0-3 as 128-bit scalars q4-q7
183 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); # GHASH accumulators (high/mid/low) in v9-v11, as 16-byte vectors
184 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); # same accumulators, bare names
185 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); # same accumulators as 64-bit scalars d9-d11
187 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); # v12-v17: h1-h4 plus the combined h12k/h34k values built with trn1/trn2/eor below
188 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); # h1-h4 as 128-bit scalars (used for ldr)
189 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); # h1-h4 as 16-byte vectors (used for ext)
194 my ($t1,$t2,$t3)=map("v$_",(28..30)); # temporaries in v28-v30
195 my ($t1d,$t2d,$t3d)=map("d$_",(28..30)); # same temporaries as 64-bit scalars
211 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); # v4-v7 again: ctr_t0-3 share registers with res0-3
212 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); # ctr_t0-3 as 64-bit scalars d4-d7
213 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); # ctr_t0-3 as 16-byte vectors
215 my $mod_constantd="d8"; # reduction constant, 64-bit scalar view (built with movi #0xc2 / shl #56 below)
216 my $mod_constant="v8"; # reduction constant, vector view of the same register
219 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27)); # round keys 0-9 in v18-v27, as 16-byte vectors (aese operands)
220 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27)); # round keys 0-9 as 128-bit scalars (used for ldr)
227 #include "arm_arch.h"
229 #if __ARM_MAX_ARCH__>=8
231 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/); # 64-bit flavours only: emit the .arch directive naming armv8-a+crypto, then switch to the .text section
232 $code.=<<___ if ($flavour !~ /64/);
237 # define INST(a,b,c,d) $_byte c,0xef,a,b
240 # define INST(a,b,c,d) $_byte a,b,c,0xf2
246 #########################################################################################
247 # size_t aes_gcm_enc_128_kernel(const unsigned char *in,
249 # unsigned char *out,
251 # unsigned char ivec[16],
255 .global aes_gcm_enc_128_kernel
256 .type aes_gcm_enc_128_kernel,%function
258 aes_gcm_enc_128_kernel:
259 AARCH64_VALID_CALL_TARGET
260 cbz x1, .L128_enc_ret
261 stp x19, x20, [sp, #-112]!
264 stp x21, x22, [sp, #16]
265 stp x23, x24, [sp, #32]
266 stp d8, d9, [sp, #48]
267 stp d10, d11, [sp, #64]
268 stp d12, d13, [sp, #80]
269 stp d14, d15, [sp, #96]
271 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
272 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
274 ld1 {$acc_lb}, [$current_tag]
275 ext $acc_lb, $acc_lb, $acc_lb, #8
276 rev64 $acc_lb, $acc_lb
277 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
278 mov $len, $main_end_input_ptr
280 ldr $rk9q, [$cc, #144] @ load rk9
281 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
282 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
284 lsr $rctr32x, $ctr96_t32x, #32
285 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
286 ext $h4b, $h4b, $h4b, #8
288 fmov $ctr1d, $ctr96_b64x @ CTR block 1
289 rev $rctr32w, $rctr32w @ rev_ctr32
291 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
292 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
293 ldr $rk0q, [$cc, #0] @ load rk0
295 rev $ctr32w, $rctr32w @ CTR block 1
296 add $rctr32w, $rctr32w, #1 @ CTR block 1
297 fmov $ctr3d, $ctr96_b64x @ CTR block 3
299 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
300 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
302 fmov $ctr1.d[1], $ctr32x @ CTR block 1
303 rev $ctr32w, $rctr32w @ CTR block 2
305 fmov $ctr2d, $ctr96_b64x @ CTR block 2
306 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
307 add $rctr32w, $rctr32w, #1 @ CTR block 2
309 fmov $ctr2.d[1], $ctr32x @ CTR block 2
310 rev $ctr32w, $rctr32w @ CTR block 3
312 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
313 ldr $rk1q, [$cc, #16] @ load rk1
315 add $rctr32w, $rctr32w, #1 @ CTR block 3
316 fmov $ctr3.d[1], $ctr32x @ CTR block 3
318 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
319 ext $h3b, $h3b, $h3b, #8
321 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
322 ldr $rk2q, [$cc, #32] @ load rk2
324 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
325 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
326 ext $h1b, $h1b, $h1b, #8
328 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
329 ldr $rk8q, [$cc, #128] @ load rk8
331 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
332 ldr $rk3q, [$cc, #48] @ load rk3
334 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
335 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
337 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
338 ldr $rk6q, [$cc, #96] @ load rk6
340 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
341 ldr $rk7q, [$cc, #112] @ load rk7
343 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
344 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
346 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
347 ldr $rk5q, [$cc, #80] @ load rk5
349 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
350 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
351 ext $h2b, $h2b, $h2b, #8
353 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
355 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
356 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
358 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
360 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
362 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
363 ldr $rk4q, [$cc, #64] @ load rk4
365 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
367 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
368 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
370 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
371 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
373 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
374 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
376 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
378 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
380 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
382 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
384 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
386 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
388 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
389 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
391 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
393 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
395 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
397 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
399 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
401 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
403 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
405 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
407 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
409 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
411 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
413 aese $ctr2b, $rk9 @ AES block 2 - round 9
415 aese $ctr0b, $rk9 @ AES block 0 - round 9
417 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
419 aese $ctr1b, $rk9 @ AES block 1 - round 9
421 aese $ctr3b, $rk9 @ AES block 3 - round 9
422 b.ge .L128_enc_tail @ handle tail
424 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
426 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
428 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
430 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
432 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
433 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
435 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
436 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
438 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
439 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
440 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
442 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
443 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
445 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
446 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
448 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
449 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
450 rev $ctr32w, $rctr32w @ CTR block 4
452 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
453 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
455 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
456 fmov $ctr0d, $ctr96_b64x @ CTR block 4
457 add $rctr32w, $rctr32w, #1 @ CTR block 4
459 fmov $ctr0.d[1], $ctr32x @ CTR block 4
460 rev $ctr32w, $rctr32w @ CTR block 5
462 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
463 fmov $ctr1d, $ctr96_b64x @ CTR block 5
464 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
466 add $rctr32w, $rctr32w, #1 @ CTR block 5
467 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
468 fmov $ctr1.d[1], $ctr32x @ CTR block 5
470 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
471 rev $ctr32w, $rctr32w @ CTR block 6
472 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
474 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
475 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
477 add $rctr32w, $rctr32w, #1 @ CTR block 6
478 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
479 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
481 fmov $ctr2d, $ctr96_b64x @ CTR block 6
482 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
484 fmov $ctr2.d[1], $ctr32x @ CTR block 6
485 rev $ctr32w, $rctr32w @ CTR block 7
486 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
488 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
490 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
491 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
492 b.ge .L128_enc_prepretail @ do prepretail
494 .L128_enc_main_loop: @ main loop start
495 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
496 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
497 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
499 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
500 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
502 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
503 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
505 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
506 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
507 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
509 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
510 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
512 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
513 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
515 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
516 eor $res0b, $res0b, $acc_lb @ PRE 1
518 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
519 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
521 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
522 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
523 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
525 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
526 rev $ctr32w, $rctr32w @ CTR block 4k+8
528 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
529 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
530 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
532 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
533 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
534 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
536 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
538 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
539 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
541 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
543 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
544 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
546 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
548 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
549 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
551 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
553 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
554 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
556 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
557 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
559 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
560 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
562 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
563 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
565 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
566 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
568 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
569 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
571 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
573 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
574 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
576 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
578 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
579 movi $mod_constant.8b, #0xc2
581 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
582 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
584 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
586 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
587 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
589 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
590 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
592 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
593 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
595 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
596 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
598 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
599 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
601 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
602 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
604 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
605 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
607 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
608 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
610 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
611 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
613 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
614 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
616 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
617 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
618 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
620 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
621 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
622 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
624 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
625 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
627 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
628 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
630 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
631 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
633 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
634 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
636 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
637 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
639 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
640 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
642 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
643 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
645 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
646 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
647 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
649 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
650 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
652 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
653 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
655 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
656 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
658 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
659 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
661 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
662 rev $ctr32w, $rctr32w @ CTR block 4k+9
663 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
665 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
666 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
668 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
669 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
670 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
672 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
673 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
674 rev $ctr32w, $rctr32w @ CTR block 4k+10
676 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
677 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
678 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
679 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
681 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
682 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
683 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
684 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
686 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
687 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
689 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
690 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
691 rev $ctr32w, $rctr32w @ CTR block 4k+11
693 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
694 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
696 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
697 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
698 b.lt .L128_enc_main_loop
700 .L128_enc_prepretail: @ PREPRETAIL
701 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
702 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
703 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
705 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
706 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
707 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
709 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
710 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
712 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
714 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
715 eor $res0b, $res0b, $acc_lb @ PRE 1
717 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
719 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
720 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
722 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
723 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
725 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
726 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
728 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
729 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
731 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
733 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
734 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
736 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
738 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
739 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
741 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
743 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
744 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
746 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
748 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
749 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
751 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
752 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
754 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
756 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
757 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
759 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
761 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
763 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
764 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
766 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
768 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
769 movi $mod_constant.8b, #0xc2
771 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
772 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
774 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
776 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
777 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
779 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
781 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
782 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
784 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
786 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
787 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
789 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
790 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
792 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
794 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
795 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
797 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
799 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
800 ext $acc_hb, $acc_hb, $acc_hb, #8
802 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
804 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
805 eor $acc_mb, $acc_mb, $acc_lb
807 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
809 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
811 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
813 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
814 eor $acc_mb, $acc_mb, $t1.16b
816 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
818 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
820 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
822 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
823 eor $acc_mb, $acc_mb, $acc_hb
825 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
827 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
829 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
831 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
833 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
834 ext $acc_mb, $acc_mb, $acc_mb, #8
836 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
838 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
839 eor $acc_lb, $acc_lb, $t1.16b
841 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
843 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
845 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
847 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
849 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
850 eor $acc_lb, $acc_lb, $acc_mb
852 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
853 .L128_enc_tail: @ TAIL
@ Tail entry for encryption. Computes how many bytes are still to be
@ processed, encrypts the first tail block into res1b and then dispatches
@ on the remaining byte count (more than 48, more than 32, more than 16,
@ otherwise directly to the final-block code).
855 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
856 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
@ The flags set by this cmp are consumed by the b.gt further down; none of
@ the ext/eor/fmov instructions in between modify the condition flags.
858 cmp $main_end_input_ptr, #48
860 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
@ The round 10 key is XORed into the plaintext while it is still in the
@ general-purpose registers; the result is then moved to a vector register
@ and XORed with the AES state in ctr0b to give the ciphertext block.
861 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
862 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
864 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
866 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
868 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
870 b.gt .L128_enc_blocks_more_than_3
@ Fewer than four blocks remain: rctr32w is decremented once for every
@ block-handling section that is skipped (presumably to undo counter
@ increments made for blocks that will not be used - confirm against the
@ counter setup earlier in the kernel).
872 sub $rctr32w, $rctr32w, #1
876 cmp $main_end_input_ptr, #32
881 b.gt .L128_enc_blocks_more_than_2
@ The sub below does not set flags, so b.gt still tests the cmp against 16.
884 cmp $main_end_input_ptr, #16
886 sub $rctr32w, $rctr32w, #1
887 b.gt .L128_enc_blocks_more_than_1
889 sub $rctr32w, $rctr32w, #1
890 b .L128_enc_blocks_less_than_1
891 .L128_enc_blocks_more_than_3: @ blocks left > 3
@ More than three blocks remain. Stores the ciphertext block already held in
@ res1b, starts GHASH on it (byte-reversed copy in res0, with the pending
@ partial tag in t0 XORed in), and encrypts the next plaintext block with
@ the keystream in ctr1b.
@ The three products here are written directly into acc_l, acc_h and acc_m
@ with pmull/pmull2 (no eor accumulate), i.e. this section starts the
@ accumulators. The multiplier is h4, and the middle term uses the value
@ taken from the upper half of h34k.
@ rk4d/rk4v are used as scratch for the middle term in this section.
892 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
894 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
896 rev64 $res0b, $res1b @ GHASH final-3 block
898 eor $res0b, $res0b, $t0.16b @ feed in partial tag
899 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
900 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
902 fmov $res1d, $input_l0 @ AES final-2 block - mov low
904 movi $t0.8b, #0 @ suppress further partial tag feed in
905 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
907 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
908 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
910 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
912 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
914 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
915 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
917 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
918 .L128_enc_blocks_more_than_2: @ blocks left > 2
@ More than two blocks remain. Stores the ciphertext block held in res1b,
@ folds it into the GHASH accumulators using h3 (middle term multiplied by
@ the low half of h34k), and encrypts the next plaintext block with the
@ keystream in ctr2b.
@ Unlike the final-3 section, the products are computed into scratch
@ registers (the rk2/rk3/rk4 registers are reused for this) and then XORed
@ into acc_h, acc_l and acc_m.
@ t0 is XORed into the block first and then zeroed, so the partial tag is
@ fed in only once regardless of which section the tail was entered at.
920 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
922 rev64 $res0b, $res1b @ GHASH final-2 block
923 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
925 eor $res0b, $res0b, $t0.16b @ feed in partial tag
927 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
929 fmov $res1d, $input_l0 @ AES final-1 block - mov low
930 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
932 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
933 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
935 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
937 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
939 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
941 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
943 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
945 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
947 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
949 movi $t0.8b, #0 @ suppress further partial tag feed in
951 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
952 .L128_enc_blocks_more_than_1: @ blocks left > 1
@ More than one block remains. Stores the ciphertext block held in res1b,
@ folds it into the GHASH accumulators using h2, and encrypts the last
@ plaintext block with the keystream in ctr3b (left in res1b for the
@ final-block code that follows).
@ The middle term is multiplied by the upper half of h12k: the XOR of the
@ two data halves is duplicated into the upper lane with ins so that pmull2
@ can be used.
954 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
956 rev64 $res0b, $res1b @ GHASH final-1 block
957 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
959 eor $res0b, $res0b, $t0.16b @ feed in partial tag
961 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
962 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
964 fmov $res1d, $input_l0 @ AES final block - mov low
966 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
967 fmov $res1.d[1], $input_h0 @ AES final block - mov high
969 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
971 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
973 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
975 eor $res1b, $res1b, $ctr3b @ AES final block - result
977 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
979 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
981 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
983 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
985 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
986 movi $t0.8b, #0 @ suppress further partial tag feed in
987 .L128_enc_blocks_less_than_1: @ blocks left <= 1
@ Final (possibly partial) block of the encryption. Steps, in order:
@  1. build a 128-bit byte mask for the valid part of the last block,
@  2. zero the unused part of the ciphertext block and GHASH it with h1,
@  3. reduce the accumulated acc_h:acc_m:acc_l product to 128 bits,
@  4. merge the block with the bytes already present at output_ptr and
@     store all 16 bytes,
@  5. store the updated 32-bit counter word and the new tag,
@  6. restore the callee-saved registers.
@ rk10_l/rk10_h, ctr0 and rk0 are no longer needed for AES here and are
@ reused as mask and scratch registers.
989 and $bit_length, $bit_length, #127 @ bit_length %= 128
990 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
992 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
993 sub $bit_length, $bit_length, #128 @ bit_length -= 128
995 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
997 and $bit_length, $bit_length, #127 @ bit_length %= 128
@ bit_length now holds the number of unused bits in the last block (0..127).
@ A register-specified lsr uses the shift amount modulo 64, so the shifted
@ value is usable both below and above 64; the cmp/csel pair then selects
@ which 64-bit half receives it:
@   unused bits <  64: low half = all ones,      high half = shifted mask
@   unused bits >= 64: low half = shifted mask,  high half = 0
999 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1000 cmp $bit_length, #64
1002 csel $input_l0, $rk10_l, $rk10_h, lt
1003 csel $input_h0, $rk10_h, xzr, lt
1005 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
1007 fmov $ctr0.d[1], $input_h0
1009 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1011 rev64 $res0b, $res1b @ GHASH final block
1013 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1015 mov $t0d, $res0.d[1] @ GHASH final block - mid
1017 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1018 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1020 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
@ Byte-reverse the running counter word again before it is stored at
@ counter + 12 below.
1022 rev $ctr32w, $rctr32w
1024 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1026 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1028 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1030 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1032 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
@ Reduction constant: 0xc2 replicated into each byte, then the 64-bit lane
@ shifted left by 56, giving 0xc200000000000000.
1033 movi $mod_constant.8b, #0xc2
1035 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1037 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1039 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1041 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1043 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1045 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1047 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
@ acc_h has been fully folded into acc_m at this point, so it is reused as
@ the destination of the second reduction multiply.
1049 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1051 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
@ bif copies bits from rk0 (the bytes previously at output_ptr) into res1b
@ wherever the mask in ctr0b is zero, so bytes beyond the end of the
@ message are written back unchanged by the 16-byte store.
1053 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1055 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
1056 st1 { $res1b}, [$output_ptr] @ store all 16B
1058 str $ctr32w, [$counter, #12] @ store the updated counter
1060 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
@ Swap the 64-bit halves and byte-reverse each half before storing the
@ result at current_tag.
1061 ext $acc_lb, $acc_lb, $acc_lb, #8
1062 rev64 $acc_lb, $acc_lb
1064 st1 { $acc_l.16b }, [$current_tag]
@ Restore x19-x24 and d8-d15; the last ldp uses post-indexing to release
@ the 112-byte frame.
1065 ldp x21, x22, [sp, #16]
1066 ldp x23, x24, [sp, #32]
1067 ldp d8, d9, [sp, #48]
1068 ldp d10, d11, [sp, #64]
1069 ldp d12, d13, [sp, #80]
1070 ldp d14, d15, [sp, #96]
1071 ldp x19, x20, [sp], #112
1077 .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1080 #########################################################################################
1081 # size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1083 # unsigned char *out,
1085 # unsigned char ivec[16],
1089 .global aes_gcm_dec_128_kernel
1090 .type aes_gcm_dec_128_kernel,%function
1092 aes_gcm_dec_128_kernel:
@ Entry and prologue of the AES-128-GCM decryption kernel.
@ This section:
@  - branches to .L128_dec_ret (defined outside this section) when x1 is 0,
@  - saves x19-x24 and d8-d15 in a 112-byte stack frame,
@  - loads round keys rk0..rk9 as vectors from cc + 0..144 and rk10 as two
@    64-bit scalars from cc + 160,
@  - loads h1..h4 from current_tag + 32/64/80/112 (each with its 64-bit
@    halves swapped) and derives h12k / h34k as the XOR of the trn1 and
@    trn2 combinations of each pair,
@  - loads the running hash from current_tag into acc_l,
@  - builds counter blocks 0..3 and runs AES rounds 0..9 on them,
@  - if at least one 64-byte group lies before the tail, loads the first
@    four ciphertext blocks, decrypts and stores blocks 0 and 1, prepares
@    counter blocks 4..6 and falls into the main loop or branches to the
@    prepretail; otherwise branches straight to the tail.
@ Loads, counter arithmetic and AES rounds are interleaved; the order of
@ instructions is part of the scheduling and should not be changed lightly.
1093 AARCH64_VALID_CALL_TARGET
1094 cbz x1, .L128_dec_ret
1095 stp x19, x20, [sp, #-112]!
1098 stp x21, x22, [sp, #16]
1099 stp x23, x24, [sp, #32]
1100 stp d8, d9, [sp, #48]
1101 stp d10, d11, [sp, #64]
1102 stp d12, d13, [sp, #80]
1103 stp d14, d15, [sp, #96]
@ main_end_input_ptr is built in steps: byte_len, byte_len - 1, rounded
@ down to a multiple of 64, and finally (further down) offset by input_ptr.
@ Subtracting 1 first guarantees the tail always has at least one byte.
1105 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1106 mov $len, $main_end_input_ptr
1107 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1109 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1110 ldr $rk0q, [$cc, #0] @ load rk0
1112 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1113 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
1115 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1116 ext $h2b, $h2b, $h2b, #8
@ The upper 32 bits of the second counter doubleword are extracted into
@ rctr32 and byte-reversed so they can be incremented with plain adds.
1118 lsr $rctr32x, $ctr96_t32x, #32
1119 fmov $ctr2d, $ctr96_b64x @ CTR block 2
1121 ldr $rk1q, [$cc, #16] @ load rk1
@ Writing a 32-bit register clears bits 63:32 of the corresponding 64-bit
@ register, so this orr keeps only the low word; the counter word is merged
@ back in by the "orr ... lsl #32" instructions below. (Assumes ctr96_t32w
@ and ctr96_t32x name the same register - confirm against the register map.)
1122 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1123 rev $rctr32w, $rctr32w @ rev_ctr32
1125 fmov $ctr1d, $ctr96_b64x @ CTR block 1
1126 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
1128 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1129 rev $ctr32w, $rctr32w @ CTR block 1
1131 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
1132 ldr $rk2q, [$cc, #32] @ load rk2
1133 add $rctr32w, $rctr32w, #1 @ CTR block 1
1135 fmov $ctr1.d[1], $ctr32x @ CTR block 1
1136 rev $ctr32w, $rctr32w @ CTR block 2
1137 add $rctr32w, $rctr32w, #1 @ CTR block 2
1139 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1140 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
1142 fmov $ctr2.d[1], $ctr32x @ CTR block 2
1143 rev $ctr32w, $rctr32w @ CTR block 3
1145 fmov $ctr3d, $ctr96_b64x @ CTR block 3
1146 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
1147 add $rctr32w, $rctr32w, #1 @ CTR block 3
1149 fmov $ctr3.d[1], $ctr32x @ CTR block 3
1150 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1152 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1153 ldr $rk3q, [$cc, #48] @ load rk3
1155 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1156 ldr $rk6q, [$cc, #96] @ load rk6
1158 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1159 ldr $rk7q, [$cc, #112] @ load rk7
1161 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1162 ldr $rk4q, [$cc, #64] @ load rk4
1164 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1166 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1168 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
@ rk10 is kept in two general-purpose registers, not a vector register:
@ the last AddRoundKey is applied with scalar eor on the output halves.
1169 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
1171 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
@ Load the 16 bytes at current_tag into acc_l, swap the 64-bit halves and
@ byte-reverse each half.
1172 ld1 { $acc_lb}, [$current_tag]
1173 ext $acc_lb, $acc_lb, $acc_lb, #8
1174 rev64 $acc_lb, $acc_lb
1176 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1177 ldr $rk5q, [$cc, #80] @ load rk5
1179 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1181 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1183 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1184 ldr $rk9q, [$cc, #144] @ load rk9
1186 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1188 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1190 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1191 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1192 ext $h3b, $h3b, $h3b, #8
1194 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1195 ldr $rk8q, [$cc, #128] @ load rk8
1197 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1199 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1201 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1203 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1205 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1206 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1207 ext $h1b, $h1b, $h1b, #8
1209 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1211 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1213 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1215 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1217 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
@ t0 is used as a temporary for one half of the h12k computation.
1218 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
1220 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1221 ext $h4b, $h4b, $h4b, #8
1222 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
@ Turn the main-loop byte count into an end pointer for the loop compares.
1223 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1225 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1227 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1229 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1230 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
1232 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1234 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1235 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
1237 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1239 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1241 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
@ acc_h is used as a temporary for one half of the h34k computation.
1242 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
@ Round 9 is aese without aesmc (no MixColumns before the final key add).
1244 aese $ctr2b, $rk9 @ AES block 2 - round 9
1246 aese $ctr3b, $rk9 @ AES block 3 - round 9
1248 aese $ctr0b, $rk9 @ AES block 0 - round 9
1249 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
1251 aese $ctr1b, $rk9 @ AES block 1 - round 9
1252 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
1253 b.ge .L128_dec_tail @ handle tail
@ At least one 64-byte group precedes the tail: load its four ciphertext
@ blocks. The ciphertext stays in res0..res3 for GHASH, while the XOR with
@ the AES state produces the plaintext (before the round 10 key is applied
@ in the general-purpose registers).
1255 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
1257 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
1259 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
1260 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
1262 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
1263 rev64 $res0b, $res0b @ GHASH block 0
1264 rev $ctr32w, $rctr32w @ CTR block 4
1266 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
1267 add $rctr32w, $rctr32w, #1 @ CTR block 4
1268 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
1270 rev64 $res1b, $res1b @ GHASH block 1
1271 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1272 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
1274 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
1276 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
@ The flags from this cmp are consumed by the b.ge at the end of the
@ section; nothing in between modifies the condition flags.
1277 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1279 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
1281 fmov $ctr0d, $ctr96_b64x @ CTR block 4
1283 fmov $ctr0.d[1], $ctr32x @ CTR block 4
1284 rev $ctr32w, $rctr32w @ CTR block 5
1285 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
1287 fmov $ctr1d, $ctr96_b64x @ CTR block 5
1288 add $rctr32w, $rctr32w, #1 @ CTR block 5
1289 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
1291 fmov $ctr1.d[1], $ctr32x @ CTR block 5
1292 rev $ctr32w, $rctr32w @ CTR block 6
1293 add $rctr32w, $rctr32w, #1 @ CTR block 6
1295 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
1297 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
1298 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
1299 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
1301 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
1302 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
1304 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
1305 b.ge .L128_dec_prepretail @ do prepretail
1307 .L128_dec_main_loop: @ main loop start
@ Main decryption loop, one 64-byte group per iteration. On entry the
@ ciphertext of blocks 4k..4k+3 is held in res0..res3 (res0/res1 already
@ byte-reversed for GHASH), blocks 4k and 4k+1 have been stored, ctr2b
@ holds the partially finished result of block 4k+2, ctr3b the AES state
@ for block 4k+3, and ctr0b/ctr1b the counter blocks 4k+4/4k+5.
@ Each iteration:
@  - finishes and stores plaintext blocks 4k+2 and 4k+3,
@  - GHASHes ciphertext blocks 4k..4k+3 with h4, h3, h2, h1 (middle terms
@    use h34k / h12k) and reduces the result into acc_l,
@  - runs AES rounds 0..9 on counter blocks 4k+4..4k+7,
@  - loads the next four ciphertext blocks, decrypts and stores blocks
@    4k+4 and 4k+5, and prepares counter blocks 4k+8..4k+10.
@ AES, GHASH, counter and load/store instructions are deliberately
@ interleaved; keep the instruction order when editing.
1308 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1309 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1310 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1312 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1313 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1315 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1316 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1318 rev64 $res2b, $res2b @ GHASH block 4k+2
1319 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1320 rev $ctr32w, $rctr32w @ CTR block 4k+7
1322 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1323 eor $res0b, $res0b, $acc_lb @ PRE 1
1324 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1326 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1327 rev64 $res3b, $res3b @ GHASH block 4k+3
1329 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1330 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1331 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
@ Block 4k starts the accumulators: acc_l, acc_h and acc_m are written by
@ pmull/pmull2 here and the other three blocks are XORed in afterwards.
1333 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1334 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1335 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1337 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1338 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1340 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1341 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1343 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1344 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1346 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1348 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1349 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1351 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1352 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1354 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1356 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1357 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1359 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1360 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1362 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1363 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1364 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1366 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1367 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1369 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1371 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1372 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1374 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1376 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1377 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1379 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1381 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1382 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1384 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1386 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1387 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1389 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1390 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1392 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1393 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1395 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1396 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1398 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1399 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1401 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
@ The reduction constant is rebuilt every iteration (movi here, shl below)
@ because the register holding it is overwritten by the second reduction
@ multiply further down.
1402 movi $mod_constant.8b, #0xc2
1404 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1405 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1407 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1409 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1410 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1412 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1413 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1415 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1416 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1417 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
1419 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1420 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1422 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1423 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1425 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1426 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1428 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1429 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1431 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1432 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1434 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1435 rev $ctr32w, $rctr32w @ CTR block 4k+8
1437 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1438 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
1439 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1441 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1442 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
1444 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1445 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1447 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1449 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1450 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
1452 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1453 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
1455 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
1456 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1457 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
1459 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1460 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
1462 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1463 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1465 rev64 $res1b, $res1b @ GHASH block 4k+5
1466 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1467 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1469 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1470 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1472 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1473 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
1475 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1476 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
1477 rev $ctr32w, $rctr32w @ CTR block 4k+9
1479 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1480 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
1481 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1483 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1484 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1486 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1487 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
1488 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1490 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
1491 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
1492 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
1494 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1495 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
@ The flags from this cmp are consumed by the b.lt that closes the loop;
@ none of the instructions in between modify the condition flags.
1496 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
1498 rev64 $res0b, $res0b @ GHASH block 4k+4
1499 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1500 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
1502 rev $ctr32w, $rctr32w @ CTR block 4k+10
1503 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
1505 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
1506 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
1508 eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
1509 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
1511 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
@ Loop back while input_ptr is below main_end_input_ptr. The target is
@ spelled with the leading dot so that it matches the label exactly as it
@ is defined at the top of the loop.
1512 b.lt .L128_dec_main_loop
1514 .L128_dec_prepretail: @ PREPRETAIL
@ Drains the pipeline after the last full main-loop group. No new
@ ciphertext is loaded here. This section:
@  - finishes and stores plaintext blocks 4k+2 and 4k+3,
@  - GHASHes the four ciphertext blocks held in res0..res3 with h4, h3, h2,
@    h1 (middle terms use h34k / h12k) and reduces the result into acc_l,
@  - runs AES rounds 0..9 on counter blocks 4k+4..4k+7 so that their
@    keystream is ready for the tail code that follows.
@ res0 and res1 are used as they arrive; res2 and res3 are byte-reversed
@ here before being multiplied.
1515 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1516 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1517 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1519 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1520 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1522 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1523 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1525 eor $res0b, $res0b, $acc_lb @ PRE 1
1526 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1527 rev64 $res2b, $res2b @ GHASH block 4k+2
1529 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1530 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1532 rev $ctr32w, $rctr32w @ CTR block 4k+7
1533 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1534 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
@ Block 4k starts the accumulators: acc_l, acc_h and acc_m are written by
@ pmull/pmull2 and the other three blocks are XORed in afterwards.
1536 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1537 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1538 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1540 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1541 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1543 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1544 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1546 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1547 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1548 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1550 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1551 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1553 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1554 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1556 rev64 $res3b, $res3b @ GHASH block 4k+3
1558 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1559 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1561 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1563 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1564 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1566 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1568 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1569 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1571 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1573 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1574 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1576 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1578 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1580 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1581 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1583 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1584 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1586 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1588 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
@ Reduction constant: 0xc2 in every byte here, shifted left by 56 further
@ down to give 0xc200000000000000 in the 64-bit lane.
1589 movi $mod_constant.8b, #0xc2
1591 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1592 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1594 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1596 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1597 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1599 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1600 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1602 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1603 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1604 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1606 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1608 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1609 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1611 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1613 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1614 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1616 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1618 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1619 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1621 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1623 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1625 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1627 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1628 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1630 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1632 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1633 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1635 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1637 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1638 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1640 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1642 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1644 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1646 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1647 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1649 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1651 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1653 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
@ The second reduction multiply overwrites the register that held the
@ reduction constant; the constant is not needed again in this section.
1655 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1656 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1658 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1659 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1661 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1663 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1664 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1666 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1668 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1669 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1671 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1672 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1674 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1675 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1676 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1678 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1679 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1680 .L128_dec_tail: @ TAIL
@ Tail entry for decryption. Computes how many bytes are still to be
@ processed, decrypts the first tail block into output_l0/output_h0 (not
@ yet stored) while keeping its ciphertext in res1b for GHASH, and then
@ dispatches on the remaining byte count (more than 48, more than 32, more
@ than 16, otherwise directly to the final-block code).
1682 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1683 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
@ The ciphertext is XORed with the AES state first; the round 10 key is
@ then applied to the two halves in the general-purpose registers.
1685 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
1687 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1689 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
@ The flags set by this cmp are consumed by the b.gt below; the eor and
@ ext instructions in between do not modify the condition flags.
1691 cmp $main_end_input_ptr, #48
1693 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1695 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1696 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1697 b.gt .L128_dec_blocks_more_than_3
@ Fewer than four blocks remain: rctr32w is decremented once for every
@ block-handling section that is skipped (presumably to undo counter
@ increments made for blocks that will not be used - confirm against the
@ counter setup earlier in the kernel).
1700 sub $rctr32w, $rctr32w, #1
1707 cmp $main_end_input_ptr, #32
1708 b.gt .L128_dec_blocks_more_than_2
@ The sub below does not set flags, so b.gt still tests the cmp against 16.
1710 cmp $main_end_input_ptr, #16
1713 sub $rctr32w, $rctr32w, #1
1714 b.gt .L128_dec_blocks_more_than_1
1716 sub $rctr32w, $rctr32w, #1
1717 b .L128_dec_blocks_less_than_1
1718 .L128_dec_blocks_more_than_3: @ blocks left > 3
@ More than three blocks remain. Starts GHASH on the ciphertext block held
@ in res1b (byte-reversed copy in res0, with the pending partial tag in t0
@ XORed in), stores the plaintext already computed in output_l0/output_h0,
@ then loads the next ciphertext block and decrypts it with the keystream
@ in ctr1b, leaving the new plaintext in output_l0/output_h0.
@ The three products are written directly into acc_l, acc_h and acc_m with
@ pmull/pmull2 (no eor accumulate), i.e. this section starts the
@ accumulators. The multiplier is h4, and the middle term uses the value
@ taken from the upper half of h34k.
@ rk4d/rk4v are used as scratch for the middle term, and ctr0b is reused
@ as the destination of the ciphertext XOR.
1719 rev64 $res0b, $res1b @ GHASH final-3 block
1720 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
1722 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1724 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
1725 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
1726 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
1728 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
1729 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
1731 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
1732 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
1734 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
1736 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1738 movi $t0.8b, #0 @ suppress further partial tag feed in
1739 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
1741 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
1742 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
1743 .L128_dec_blocks_more_than_2: @ blocks left > 2
1745 rev64 $res0b, $res1b @ GHASH final-2 block
1746 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
1748 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1750 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
1751 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
1753 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
1755 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1757 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1758 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
1760 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
1761 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1763 movi $t0.8b, #0 @ suppress further partial tag feed in
1765 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1767 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
1768 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1770 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1772 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1773 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
1774 .L128_dec_blocks_more_than_1: @ blocks left > 1
1776 rev64 $res0b, $res1b @ GHASH final-1 block
1778 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
1779 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1781 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1783 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
1785 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1787 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
1788 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
1790 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
1791 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1793 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1795 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1797 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1798 movi $t0.8b, #0 @ suppress further partial tag feed in
1800 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1802 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1803 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
1805 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
1806 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1807 .L128_dec_blocks_less_than_1: @ blocks left <= 1
1809 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1810 and $bit_length, $bit_length, #127 @ bit_length %= 128
1812 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1813 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1815 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1817 and $bit_length, $bit_length, #127 @ bit_length %= 128
1819 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1820 cmp $bit_length, #64
1822 csel $ctr96_b64x, $rk10_h, xzr, lt
1823 csel $ctr32x, $rk10_l, $rk10_h, lt
1825 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
1827 mov $ctr0.d[1], $ctr96_b64x
1829 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1831 rev64 $res0b, $res1b @ GHASH final block
1833 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1835 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1837 and $output_h0, $output_h0, $ctr96_b64x
1839 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1840 mov $t0d, $res0.d[1] @ GHASH final block - mid
1842 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1843 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1845 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1847 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1848 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
1849 and $output_l0, $output_l0, $ctr32x
1851 rev $ctr32w, $rctr32w
1853 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1854 movi $mod_constant.8b, #0xc2
1856 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1858 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
1859 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1861 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1863 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1865 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1867 orr $output_l0, $output_l0, $end_input_ptr
1868 str $ctr32w, [$counter, #12] @ store the updated counter
1870 orr $output_h0, $output_h0, $main_end_input_ptr
1871 stp $output_l0, $output_h0, [$output_ptr]
1872 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1874 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1876 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1878 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1879 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1881 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1883 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1884 ext $acc_lb, $acc_lb, $acc_lb, #8
1885 rev64 $acc_lb, $acc_lb
1887 st1 { $acc_l.16b }, [$current_tag]
1889 ldp x21, x22, [sp, #16]
1890 ldp x23, x24, [sp, #32]
1891 ldp d8, d9, [sp, #48]
1892 ldp d10, d11, [sp, #64]
1893 ldp d12, d13, [sp, #80]
1894 ldp d14, d15, [sp, #96]
1895 ldp x19, x20, [sp], #112
1901 .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
# Register allocation for the 192-bit kernel template that follows.
# Each lexical below holds the textual name of an AArch64 register; the
# assembly template refers to registers only through these names.
# Several names deliberately share one physical register (noted below),
# so changing any range here changes every instruction that uses it.
# Names used by the kernel but not declared in this run are expected to
# come from elsewhere in the file.

# General-purpose registers: input bounds and 64-bit halves of data blocks.
# x4 = end_input_ptr, x5 = main_end_input_ptr, x6/x7 = low/high of block 0.
1906 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24 = low/high halves of blocks 1..3. The kernel prologue stores
# x19-x24 to the stack before they are used.
1907 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# output_* are second names for the same registers as input_* above
# (x19..x24 here, x6/x7 on the next line).
1908 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
1909 my ($output_l0,$output_h0)=map("x$_",(6..7));
# x9..x15: counter scratch (ctr32x), the two 64-bit halves loaded from the
# counter block (ctr96_b64x, ctr96_t32x), the running 32-bit counter
# (rctr32x), the final round key rk12 held as two GPRs, and the byte length.
1912 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
# 32-bit (w) views of x11 and x12, i.e. of ctr96_t32x and rctr32x.
1913 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# SIMD registers v0..v3 = counter/keystream blocks, v4..v7 = result blocks.
# The three lines give the .16b, bare vN and dN spellings of the same
# registers; the following line adds the qN spelling for v4..v7 only.
1915 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
1916 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
1917 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
1918 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# v9..v11 = GHASH accumulators (high, mid, low), in .16b / vN / dN spellings.
1920 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
1921 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
1922 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# v12..v15 = hash key powers h1..h4 loaded relative to current_tag;
# v16/v17 = h12k/h34k, which the kernel builds from them with trn/eor
# ("h2k | h1k" and "h4k | h3k" in the kernel's own comments).
1924 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
1925 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
1926 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# v30/v31 = GHASH temporaries t1 and t2.
1933 my ($t1,$t2)=map("v$_",(30..31));
1934 my ($t1d,$t2d)=map("d$_",(30..31));
# ctr_t0..ctr_t3 name v4..v7, the SAME registers as res0..res3: the input
# block XORed with rk12 is moved in under the ctr_t name and the result is
# then produced in place under the res name.
1950 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
1951 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
1952 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# v8 = GHASH reduction constant (the kernel builds it as 0xc2 << 56).
# d8 is among d8-d15, which the kernel prologue stores to the stack.
1954 my $mod_constantd="d8";
1955 my $mod_constant="v8";
# v18..v29 = AES round keys rk0..rk11, in .16b and qN spellings
# (rk12 is kept in x13/x14 above rather than in a vector register).
1958 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
1959 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
1965 #########################################################################################
1966 # size_t aes_gcm_enc_192_kernel(const unsigned char *in,
1968 # unsigned char *out,
1970 # unsigned char ivec[16],
1974 .global aes_gcm_enc_192_kernel
1975 .type aes_gcm_enc_192_kernel,%function
1977 aes_gcm_enc_192_kernel:
1978 AARCH64_VALID_CALL_TARGET
1979 cbz x1, .L192_enc_ret
1980 stp x19, x20, [sp, #-112]!
1983 stp x21, x22, [sp, #16]
1984 stp x23, x24, [sp, #32]
1985 stp d8, d9, [sp, #48]
1986 stp d10, d11, [sp, #64]
1987 stp d12, d13, [sp, #80]
1988 stp d14, d15, [sp, #96]
1990 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1992 ldr $rk5q, [$cc, #80] @ load rk5
1994 ldr $rk4q, [$cc, #64] @ load rk4
1996 ldr $rk8q, [$cc, #128] @ load rk8
1998 lsr $rctr32x, $ctr96_t32x, #32
1999 ldr $rk6q, [$cc, #96] @ load rk6
2000 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2002 ldr $rk7q, [$cc, #112] @ load rk7
2003 rev $rctr32w, $rctr32w @ rev_ctr32
2005 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2006 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2008 rev $ctr32w, $rctr32w @ CTR block 1
2009 add $rctr32w, $rctr32w, #1 @ CTR block 1
2010 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2012 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2013 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2015 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2016 rev $ctr32w, $rctr32w @ CTR block 2
2017 add $rctr32w, $rctr32w, #1 @ CTR block 2
2019 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2020 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2022 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2023 rev $ctr32w, $rctr32w @ CTR block 3
2025 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2026 ldr $rk0q, [$cc, #0] @ load rk0
2028 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2030 ldr $rk3q, [$cc, #48] @ load rk3
2032 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2034 ldr $rk1q, [$cc, #16] @ load rk1
2036 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2037 ld1 { $acc_lb}, [$current_tag]
2038 ext $acc_lb, $acc_lb, $acc_lb, #8
2039 rev64 $acc_lb, $acc_lb
2041 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2042 ldr $rk11q, [$cc, #176] @ load rk11
2044 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2045 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2046 ext $h4b, $h4b, $h4b, #8
2048 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2049 ldr $rk2q, [$cc, #32] @ load rk2
2051 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2052 ldr $rk10q, [$cc, #160] @ load rk10
2054 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2055 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2056 ext $h1b, $h1b, $h1b, #8
2058 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2059 ldr $rk9q, [$cc, #144] @ load rk9
2061 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2062 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2063 ext $h3b, $h3b, $h3b, #8
2065 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2067 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2069 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2071 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2072 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2074 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2076 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2077 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2079 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2081 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2083 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2085 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2087 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2089 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2091 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2093 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2095 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2097 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2099 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2101 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2102 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2103 ext $h2b, $h2b, $h2b, #8
2105 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2107 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2109 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2111 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2112 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2114 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2116 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2118 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2119 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
2121 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2123 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2125 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2127 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2129 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2131 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2133 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2135 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2137 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2139 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2140 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2141 mov $len, $main_end_input_ptr
2143 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2144 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2146 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
2147 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2149 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
2151 aese $ctr2b, $rk11 @ AES block 2 - round 11
2152 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2153 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2155 aese $ctr1b, $rk11 @ AES block 1 - round 11
2156 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
2158 aese $ctr0b, $rk11 @ AES block 0 - round 11
2159 add $rctr32w, $rctr32w, #1 @ CTR block 3
2161 aese $ctr3b, $rk11 @ AES block 3 - round 11
2162 b.ge .L192_enc_tail @ handle tail
2164 rev $ctr32w, $rctr32w @ CTR block 4
2165 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
2167 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
2168 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
2170 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
2172 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
2173 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2174 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2176 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
2178 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
2179 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
2180 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
2182 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
2183 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
2185 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
2186 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
2188 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
2189 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
2191 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
2193 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
2194 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
2196 add $rctr32w, $rctr32w, #1 @ CTR block 4
2197 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
2198 fmov $ctr0d, $ctr96_b64x @ CTR block 4
2200 fmov $ctr0.d[1], $ctr32x @ CTR block 4
2201 rev $ctr32w, $rctr32w @ CTR block 5
2203 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
2204 add $rctr32w, $rctr32w, #1 @ CTR block 5
2206 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
2207 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
2209 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
2211 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
2212 fmov $ctr1d, $ctr96_b64x @ CTR block 5
2213 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
2215 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
2217 fmov $ctr1.d[1], $ctr32x @ CTR block 5
2218 rev $ctr32w, $rctr32w @ CTR block 6
2220 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
2222 add $rctr32w, $rctr32w, #1 @ CTR block 6
2223 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
2224 fmov $ctr2d, $ctr96_b64x @ CTR block 6
2226 fmov $ctr2.d[1], $ctr32x @ CTR block 6
2227 rev $ctr32w, $rctr32w @ CTR block 7
2229 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
2230 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
2232 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
2233 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
2234 b.ge .L192_enc_prepretail @ do prepretail
2236 .L192_enc_main_loop: @ main loop start
2237 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2238 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2240 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2241 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
2243 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2244 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2245 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2247 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2248 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2250 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2251 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2252 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
2254 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2255 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
2257 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2258 eor $res0b, $res0b, $acc_lb @ PRE 1
2260 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2262 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2263 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2265 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2266 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
2268 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2269 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2271 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2273 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2274 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
2276 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2277 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2279 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2280 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
2282 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2283 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2285 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2286 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2288 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2290 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2292 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2293 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2295 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2296 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2298 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2300 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2301 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2303 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2305 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2306 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
2307 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2309 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2310 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2312 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2313 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2315 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2316 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
2318 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2319 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
2320 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2322 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2323 rev $ctr32w, $rctr32w @ CTR block 4k+8
2325 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2326 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
2328 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2329 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2331 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2332 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
2334 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2335 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2337 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2338 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2340 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2341 movi $mod_constant.8b, #0xc2
2343 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2344 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2345 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2347 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2348 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2350 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2351 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2353 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2354 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2356 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2357 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
2359 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2360 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2362 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2363 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
2365 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2366 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2368 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2369 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
2370 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2372 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2373 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2375 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2376 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
2378 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2379 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2380 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
2382 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2383 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
2385 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2386 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2387 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
2389 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2391 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2392 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2394 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2396 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2398 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2400 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2401 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2403 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2405 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2407 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2409 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2410 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2412 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2414 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2415 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
2417 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2418 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
2419 rev $ctr32w, $rctr32w @ CTR block 4k+9
2421 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2422 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
2423 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
2425 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2426 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
2428 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
2429 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
2430 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
2432 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2433 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
2434 rev $ctr32w, $rctr32w @ CTR block 4k+10
2436 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
2437 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2438 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
2440 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
2441 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2443 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2444 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
2445 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
2447 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
2448 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
2449 rev $ctr32w, $rctr32w @ CTR block 4k+11
2451 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2452 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
2454 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
2455 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
2456 b.lt .L192_enc_main_loop
2458 .L192_enc_prepretail: @ PREPRETAIL
2459 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2460 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2462 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2463 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2464 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2466 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2467 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2469 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2471 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2472 eor $res0b, $res0b, $acc_lb @ PRE 1
2473 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2475 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2476 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2478 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2480 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2481 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2483 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2484 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2486 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2488 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2489 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2491 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2492 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2494 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2495 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2497 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2499 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2500 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2502 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2504 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2505 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2507 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2509 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2510 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2512 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2513 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2515 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2517 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2518 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2520 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2522 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2524 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2526 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2527 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2529 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2531 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2532 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2534 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2536 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2537 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2539 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2541 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2542 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2544 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2546 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2547 movi $mod_constant.8b, #0xc2
2549 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2551 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2553 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2554 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2556 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2558 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2560 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2561 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2563 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2565 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2566 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
2568 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2570 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2571 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2573 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2575 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2576 eor $acc_mb, $acc_mb, $acc_lb
2578 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2580 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
2582 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2583 ext $acc_hb, $acc_hb, $acc_hb, #8
2585 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2587 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2588 eor $acc_mb, $acc_mb, $t1.16b
2590 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2592 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2594 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2596 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2597 eor $acc_mb, $acc_mb, $acc_hb
2599 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2601 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2603 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2605 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
2607 ext $acc_mb, $acc_mb, $acc_mb, #8
2609 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2611 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2613 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2615 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2616 eor $acc_lb, $acc_lb, $t1.16b
2618 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2620 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2622 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2624 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2625 eor $acc_lb, $acc_lb, $acc_mb
2626 .L192_enc_tail: @ TAIL
2628 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2629 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
2631 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2632 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2634 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2636 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2637 cmp $main_end_input_ptr, #48
2639 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2641 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2642 b.gt .L192_enc_blocks_more_than_3
2644 sub $rctr32w, $rctr32w, #1
2649 cmp $main_end_input_ptr, #32
2653 b.gt .L192_enc_blocks_more_than_2
2655 sub $rctr32w, $rctr32w, #1
2658 cmp $main_end_input_ptr, #16
2659 b.gt .L192_enc_blocks_more_than_1
2661 sub $rctr32w, $rctr32w, #1
2662 b .L192_enc_blocks_less_than_1
2663 .L192_enc_blocks_more_than_3: @ blocks left > 3
2664 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
2666 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
2668 rev64 $res0b, $res1b @ GHASH final-3 block
2670 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
2671 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2673 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
2674 fmov $res1d, $input_l0 @ AES final-2 block - mov low
2676 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
2678 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
2680 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
2682 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
2684 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2686 movi $t0.8b, #0 @ suppress further partial tag feed in
2688 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
2690 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
2691 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
2692 .L192_enc_blocks_more_than_2: @ blocks left > 2
2694 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
2696 rev64 $res0b, $res1b @ GHASH final-2 block
2697 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
2699 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2701 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
2703 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2704 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
2706 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2707 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
2709 fmov $res1d, $input_l0 @ AES final-1 block - mov low
2711 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
2712 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2713 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2715 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2717 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2719 movi $t0.8b, #0 @ suppress further partial tag feed in
2721 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
2723 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2724 .L192_enc_blocks_more_than_1: @ blocks left > 1
2726 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
2728 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
2730 rev64 $res0b, $res1b @ GHASH final-1 block
2732 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
2733 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2734 movi $t0.8b, #0 @ suppress further partial tag feed in
2736 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
2738 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2739 eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
2740 fmov $res1d, $input_l0 @ AES final block - mov low
2742 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2743 fmov $res1.d[1], $input_h0 @ AES final block - mov high
2745 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2747 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2749 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2751 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2753 eor $res1b, $res1b, $ctr3b @ AES final block - result
2755 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2757 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2758 .L192_enc_blocks_less_than_1: @ blocks left <= 1
2760 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2761 rev $ctr32w, $rctr32w
2762 and $bit_length, $bit_length, #127 @ bit_length %= 128
2764 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2765 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
2767 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2768 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
2770 and $bit_length, $bit_length, #127 @ bit_length %= 128
2772 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
2773 cmp $bit_length, #64
2775 csel $input_l0, $rk12_l, $rk12_h, lt
2776 csel $input_h0, $rk12_h, xzr, lt
2778 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
2780 fmov $ctr0.d[1], $input_h0
2782 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2784 rev64 $res0b, $res1b @ GHASH final block
2786 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2788 mov $t0d, $res0.d[1] @ GHASH final block - mid
2790 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2792 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2794 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2796 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2798 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2800 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2802 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2803 movi $mod_constant.8b, #0xc2
2805 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2807 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2809 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2811 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2813 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2815 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2817 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2819 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2821 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2823 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2825 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2826 str $ctr32w, [$counter, #12] @ store the updated counter
2828 st1 { $res1b}, [$output_ptr] @ store all 16B
2830 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2831 ext $acc_lb, $acc_lb, $acc_lb, #8
2832 rev64 $acc_lb, $acc_lb
2834 st1 { $acc_l.16b }, [$current_tag]
2836 ldp x21, x22, [sp, #16]
2837 ldp x23, x24, [sp, #32]
2838 ldp d8, d9, [sp, #48]
2839 ldp d10, d11, [sp, #64]
2840 ldp d12, d13, [sp, #80]
2841 ldp d14, d15, [sp, #96]
2842 ldp x19, x20, [sp], #112
2848 .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
2851 #########################################################################################
2852 # size_t aes_gcm_dec_192_kernel(const unsigned char *in,
2854 # unsigned char *out,
2856 # unsigned char ivec[16],
2860 .global aes_gcm_dec_192_kernel
2861 .type aes_gcm_dec_192_kernel,%function
2863 aes_gcm_dec_192_kernel:
2864 AARCH64_VALID_CALL_TARGET
2865 cbz x1, .L192_dec_ret
2866 stp x19, x20, [sp, #-112]!
2869 stp x21, x22, [sp, #16]
2870 stp x23, x24, [sp, #32]
2871 stp d8, d9, [sp, #48]
2872 stp d10, d11, [sp, #64]
2873 stp d12, d13, [sp, #80]
2874 stp d14, d15, [sp, #96]
2876 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2877 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
2879 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2881 ldr $rk0q, [$cc, #0] @ load rk0
2883 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2884 mov $len, $main_end_input_ptr
2885 ldr $rk2q, [$cc, #32] @ load rk2
2887 lsr $rctr32x, $ctr96_t32x, #32
2888 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2889 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2891 rev $rctr32w, $rctr32w @ rev_ctr32
2892 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2894 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2895 ldr $rk1q, [$cc, #16] @ load rk1
2897 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2898 rev $ctr32w, $rctr32w @ CTR block 1
2900 add $rctr32w, $rctr32w, #1 @ CTR block 1
2901 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2902 ldr $rk3q, [$cc, #48] @ load rk3
2904 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2905 rev $ctr32w, $rctr32w @ CTR block 2
2906 add $rctr32w, $rctr32w, #1 @ CTR block 2
2908 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2909 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2911 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2912 rev $ctr32w, $rctr32w @ CTR block 3
2914 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2915 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2917 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2919 ldr $rk8q, [$cc, #128] @ load rk8
2921 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2923 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2924 ldr $rk11q, [$cc, #176] @ load rk11
2926 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2927 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2928 ext $h4b, $h4b, $h4b, #8
2930 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2931 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2932 ext $h2b, $h2b, $h2b, #8
2934 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2935 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2936 ext $h3b, $h3b, $h3b, #8
2938 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2939 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2941 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2942 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2943 ext $h1b, $h1b, $h1b, #8
2945 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2946 ldr $rk10q, [$cc, #160] @ load rk10
2948 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2949 ldr $rk9q, [$cc, #144] @ load rk9
2951 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2952 ldr $rk7q, [$cc, #112] @ load rk7
2954 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2955 ldr $rk4q, [$cc, #64] @ load rk4
2957 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2958 ld1 { $acc_lb}, [$current_tag]
2959 ext $acc_lb, $acc_lb, $acc_lb, #8
2960 rev64 $acc_lb, $acc_lb
2962 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2963 add $rctr32w, $rctr32w, #1 @ CTR block 3
2965 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2966 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2968 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2969 ldr $rk5q, [$cc, #80] @ load rk5
2971 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2972 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2974 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2976 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2977 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2979 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2980 ldr $rk6q, [$cc, #96] @ load rk6
2982 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2984 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2986 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2988 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2990 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2992 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2994 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2996 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2998 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3000 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3002 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3004 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3006 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3008 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3010 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3012 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3013 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3015 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3016 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3018 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3019 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3021 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3022 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3024 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3025 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3027 aese $ctr3b, $rk11 @ AES block 3 - round 11
3029 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3031 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3033 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3034 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3036 aese $ctr2b, $rk11 @ AES block 2 - round 11
3038 aese $ctr1b, $rk11 @ AES block 1 - round 11
3039 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3041 aese $ctr0b, $rk11 @ AES block 0 - round 11
3042 b.ge .L192_dec_tail @ handle tail
3044 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
3046 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
3048 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
3050 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
3051 rev $ctr32w, $rctr32w @ CTR block 4
3052 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
3054 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
3056 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
3058 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
3060 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
3061 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
3062 add $rctr32w, $rctr32w, #1 @ CTR block 4
3064 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
3065 rev64 $res0b, $res0b @ GHASH block 0
3066 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3068 fmov $ctr0d, $ctr96_b64x @ CTR block 4
3069 rev64 $res1b, $res1b @ GHASH block 1
3070 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3072 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
3073 fmov $ctr0.d[1], $ctr32x @ CTR block 4
3074 rev $ctr32w, $rctr32w @ CTR block 5
3076 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
3077 fmov $ctr1d, $ctr96_b64x @ CTR block 5
3078 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
3080 add $rctr32w, $rctr32w, #1 @ CTR block 5
3081 fmov $ctr1.d[1], $ctr32x @ CTR block 5
3082 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
3084 rev $ctr32w, $rctr32w @ CTR block 6
3085 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
3087 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
3088 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
3090 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
3092 add $rctr32w, $rctr32w, #1 @ CTR block 6
3093 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
3094 b.ge .L192_dec_prepretail @ do prepretail
3096 .L192_dec_main_loop: @ main loop start
3097 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3098 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3100 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3101 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3103 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3104 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3105 rev64 $res3b, $res3b @ GHASH block 4k+3
3107 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3108 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3110 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3111 eor $res0b, $res0b, $acc_lb @ PRE 1
3113 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3114 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3116 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3117 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3119 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3120 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3122 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3123 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3124 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3126 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3127 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3128 rev $ctr32w, $rctr32w @ CTR block 4k+7
3130 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3131 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3133 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3134 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3135 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3137 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3139 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3140 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3142 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3143 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3145 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3147 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3148 rev64 $res2b, $res2b @ GHASH block 4k+2
3150 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3152 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3153 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3154 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3156 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3158 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3160 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3161 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3163 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3164 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3166 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3168 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3169 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3171 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3173 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3175 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3176 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3178 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3180 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3182 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3183 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3185 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3187 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3188 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3190 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3192 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3193 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3195 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3197 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3198 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3200 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3202 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3203 movi $mod_constant.8b, #0xc2
3205 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3207 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3208 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3210 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3212 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3213 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3215 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3217 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3218 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3220 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3222 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3223 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3225 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3227 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3228 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3230 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3232 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3233 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
3235 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3236 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3238 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3239 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
3240 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3242 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3243 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3245 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
3246 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3248 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3249 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3251 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3252 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
3254 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
3255 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
3256 rev $ctr32w, $rctr32w @ CTR block 4k+8
3258 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3259 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3261 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3262 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3264 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3265 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3267 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
3268 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3269 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
3271 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
3274 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3276 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3277 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
3279 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3280 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3281 rev64 $res1b, $res1b @ GHASH block 4k+5
3283 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
3284 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3286 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3287 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
3289 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
3290 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
3291 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3293 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
3294 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
3295 rev $ctr32w, $rctr32w @ CTR block 4k+9
3297 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
3299 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3301 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
3302 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
3303 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
3305 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
3306 rev $ctr32w, $rctr32w @ CTR block 4k+10
3307 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
3309 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3310 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
3311 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3313 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
3314 rev64 $res0b, $res0b @ GHASH block 4k+4
3315 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
3317 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
3318 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
3319 b.lt .L192_dec_main_loop
3321 .L192_dec_prepretail: @ PREPRETAIL
3322 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3323 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3324 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3326 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3327 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3329 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3330 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3332 eor $res0b, $res0b, $acc_lb @ PRE 1
3333 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3335 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3336 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3338 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3339 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3341 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3342 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3343 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3345 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3346 rev64 $res2b, $res2b @ GHASH block 4k+2
3348 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3349 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3350 rev $ctr32w, $rctr32w @ CTR block 4k+7
3352 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3353 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3354 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3356 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3357 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3358 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3360 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3361 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3363 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3364 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3365 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3367 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3368 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3369 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3371 rev64 $res3b, $res3b @ GHASH block 4k+3
3372 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3374 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3375 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3377 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3378 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3380 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3381 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3383 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3385 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3386 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3388 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3390 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3391 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3393 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3395 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3397 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3398 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3400 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3401 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3403 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3405 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3406 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3408 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3410 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3411 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3413 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3415 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3416 movi $mod_constant.8b, #0xc2
3418 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3420 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3422 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3423 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3425 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3426 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3428 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3430 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3431 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3433 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3435 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3436 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3438 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3440 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3441 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3443 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3445 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3446 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3448 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3450 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3451 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3453 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3455 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3457 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3459 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3461 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3462 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3464 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3466 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3468 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3470 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3471 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3473 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3475 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3477 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3479 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3481 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3483 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3485 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3487 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3489 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3490 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3492 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3495 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3503 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
@ TAIL of the AES-192 GCM decrypt kernel.
@ Works out how many bytes are still unprocessed, decrypts the next
@ ciphertext block into the output_l0/output_h0 scalar pair, and then
@ dispatches on the remaining byte count: more than 48, more than 32,
@ more than 16, otherwise directly to the final-block handling.
3504 .L192_dec_tail: @ TAIL
3506 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3507 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3509 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
@ The result is moved to general-purpose registers so that the last round key,
@ held as the rk12_l/rk12_h scalar pair, can be applied with integer XORs.
3511 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3513 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
@ t0 receives the running hash with its two 64-bit halves swapped; the first
@ GHASH step executed in the tail XORs it into the ciphertext block.
3515 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
@ The flags set by this cmp are consumed by the b.gt three lines below; the
@ two eor instructions in between do not modify the flags.
3517 cmp $main_end_input_ptr, #48
3519 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3521 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3522 b.gt .L192_dec_blocks_more_than_3
@ NOTE(review): each fall-through step below takes one off rctr32w, presumably
@ rewinding the counter for keystream blocks that were prepared but will not
@ be consumed - confirm against the counter set-up outside this excerpt.
3529 sub $rctr32w, $rctr32w, #1
3532 cmp $main_end_input_ptr, #32
3533 b.gt .L192_dec_blocks_more_than_2
3536 cmp $main_end_input_ptr, #16
3537 sub $rctr32w, $rctr32w, #1
3539 b.gt .L192_dec_blocks_more_than_1
3541 sub $rctr32w, $rctr32w, #1
3542 b .L192_dec_blocks_less_than_1
@ Entered when more than 48 bytes remain.
@ Hashes the ciphertext block that is already loaded (final-3) with h4, stores
@ its plaintext, then loads the next ciphertext block (final-2) and decrypts
@ it with the ctr1 keystream into output_l0/output_h0.
@ The three pmull/pmull2 results are written straight into acc_l, acc_h and
@ acc_m, so on this path the accumulators start from these products.
3543 .L192_dec_blocks_more_than_3: @ blocks left > 3
3544 rev64 $res0b, $res1b @ GHASH final-3 block
3545 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
3547 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
3549 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3551 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
3553 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
3554 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
@ NOTE(review): the rk4 names are used as scratch for the Karatsuba middle
@ term from here on; presumably round key 4 is no longer needed once the tail
@ is reached - confirm against the alias declarations outside this excerpt.
3555 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
3557 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
@ acc_m temporarily holds lane 1 of h34k; it is the multiplier of the mid
@ pmull below, which then overwrites acc_m with the product.
3559 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
3560 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3562 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
3564 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
@ The previous hash value has been folded into this block, so t0 is cleared
@ and the eor with t0 in the following steps becomes a no-op.
3565 movi $t0.8b, #0 @ suppress further partial tag feed in
3567 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
3568 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
@ Entered when more than 32 bytes remain, or by falling through from the
@ block above.
@ Hashes the loaded ciphertext block (final-2) with h3, stores the plaintext
@ computed for it, then loads the next ciphertext block (final-1) and decrypts
@ it with the ctr2 keystream. Unlike the final-3 step, the products are XORed
@ into acc_l, acc_h and acc_m rather than replacing them.
3569 .L192_dec_blocks_more_than_2: @ blocks left > 2
3571 rev64 $res0b, $res1b @ GHASH final-2 block
3572 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
@ t0 is either the pending hash value (direct entry) or zero (fall-through).
3574 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3576 movi $t0.8b, #0 @ suppress further partial tag feed in
3578 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
@ NOTE(review): the rk2, rk3 and rk4 names are used as scratch for the GHASH
@ partial products here; presumably those round keys are no longer needed in
@ the tail - confirm against the alias declarations outside this excerpt.
3580 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
3582 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3584 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
3586 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3587 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
3589 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3590 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
3592 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
@ The mid term for this block uses lane 0 of h34k (the final-3 step used
@ lane 1).
3594 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3596 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3597 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
3599 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
3600 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
@ Entered when more than 16 bytes remain, or by falling through from the
@ block above.
@ Hashes the loaded ciphertext block (final-1) with h2, stores the plaintext
@ computed for it, then loads the last ciphertext block and decrypts it with
@ the ctr3 keystream into output_l0/output_h0. The last block is not stored
@ here; that happens in the final-block code that follows.
3601 .L192_dec_blocks_more_than_1: @ blocks left > 1
3603 rev64 $res0b, $res1b @ GHASH final-1 block
@ t0 is either the pending hash value (direct entry) or zero (fall-through).
3605 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3606 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
3608 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
3610 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3612 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
3613 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
3615 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3617 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3619 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3620 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
@ pmull2 multiplies the upper 64-bit lanes, so the folded value is copied
@ from lane 0 to lane 1 before it is multiplied by lane 1 of h12k.
3622 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3623 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
3625 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3627 movi $t0.8b, #0 @ suppress further partial tag feed in
3628 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3629 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
3631 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
3633 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
@ Final block (possibly partial) and register restore.
@  - builds a 128-bit mask from bit_length: low half in ctr32x, high half in
@    ctr96_b64x, and the complete mask in ctr0,
@  - merges the decrypted bytes with the 16 bytes already present at
@    output_ptr, so bytes past the end of the message are written back
@    unchanged,
@  - zeroes the unused part of the ciphertext block before hashing it,
@  - stores the updated 32-bit counter word at offset 12 of the counter block,
@  - folds the block into the GHASH accumulators with h1, reduces them, and
@    stores the resulting hash value to current_tag,
@  - reloads x19-x24 and d8-d15 from the stack and pops the 112-byte frame.
@ rk12_l and rk12_h are no longer needed as key material here and are reused
@ to build the masks.
3634 .L192_dec_blocks_less_than_1: @ blocks left <= 1
3636 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
@ end_input_ptr and main_end_input_ptr are reused as scratch for the 16 bytes
@ currently stored at the output position.
3637 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
@ After the next four instructions bit_length holds
@ (128 - (bit_length mod 128)) mod 128, i.e. the number of unused bits in the
@ last block; it is 0 when the block is full.
3638 and $bit_length, $bit_length, #127 @ bit_length %= 128
3640 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3642 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3644 and $bit_length, $bit_length, #127 @ bit_length %= 128
3645 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
@ A register-controlled LSR uses the shift amount modulo 64, so rk12_h becomes
@ all-ones shifted right by (bit_length mod 64).
3647 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
@ Fewer than 64 unused bits: low mask = all ones, high mask = rk12_h.
@ 64 or more unused bits:    low mask = rk12_h,   high mask = 0.
3648 cmp $bit_length, #64
3650 csel $ctr32x, $rk12_l, $rk12_h, lt
3651 csel $ctr96_b64x, $rk12_h, xzr, lt
3653 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
@ Low 64 bits: keep the decrypted bytes under the mask and the existing
@ output bytes outside it.
3654 and $output_l0, $output_l0, $ctr32x
3655 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
3657 orr $output_l0, $output_l0, $end_input_ptr
3658 mov $ctr0.d[1], $ctr96_b64x
@ rctr32w is byte-reversed back before being stored into the counter block.
@ NOTE(review): ctr32w is presumably the 32-bit view of ctr32x, which still
@ holds the low mask used above; the masks were already copied into ctr0 and
@ applied to output_l0 - confirm the alias outside this excerpt.
3660 rev $ctr32w, $rctr32w
3662 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3663 str $ctr32w, [$counter, #12] @ store the updated counter
3665 rev64 $res0b, $res1b @ GHASH final block
3667 eor $res0b, $res0b, $t0.16b @ feed in partial tag
@ High 64 bits: same merge as for the low half, completed by the orr just
@ before the store further down.
3668 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3670 and $output_h0, $output_h0, $ctr96_b64x
3672 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
@ From here on t0 is reused as scratch for the Karatsuba middle term.
3673 mov $t0d, $res0.d[1] @ GHASH final block - mid
3675 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3677 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3679 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3681 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3683 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3685 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
@ movi puts 0xc2 in each of the low eight bytes; the shl by 56 below leaves
@ 0xc200000000000000 in the low doubleword, the constant used by both
@ reduction multiplies.
3686 movi $mod_constant.8b, #0xc2
3688 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3690 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3692 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3694 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3695 orr $output_h0, $output_h0, $main_end_input_ptr
@ No post-increment on this store: it is the last write to the output buffer.
3696 stp $output_l0, $output_h0, [$output_ptr]
3698 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3700 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3702 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
@ mod_constant is overwritten by its own product; it is not needed afterwards.
3704 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3706 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3708 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3710 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
@ Swapping the halves and reversing the bytes within each half reverses all
@ 16 bytes of the reduced value before it is stored.
3711 ext $acc_lb, $acc_lb, $acc_lb, #8
3712 rev64 $acc_lb, $acc_lb
3714 st1 { $acc_l.16b }, [$current_tag]
@ Reload the saved registers; the last ldp also releases the stack frame
@ through its post-index of 112.
3716 ldp x21, x22, [sp, #16]
3717 ldp x23, x24, [sp, #32]
3718 ldp d8, d9, [sp, #48]
3719 ldp d10, d11, [sp, #64]
3720 ldp d12, d13, [sp, #80]
3721 ldp d14, d15, [sp, #96]
3722 ldp x19, x20, [sp], #112
3728 .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
# Register aliases used by the assembly templates that follow (the names
# include rk13 and rk14_l/rk14_h, and aes_gcm_enc_256_kernel comes next).
# Each name is bound to one fixed AArch64 register. Several names deliberately
# map to the same register; presumably their uses never overlap in time -
# verify before reusing any of them in new code.
#
# General-purpose registers.
# x4/x5: end-of-input pointers; x6/x7: low/high halves of block 0.
3733 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19-x24: low/high halves of blocks 1-3. The input_* and output_* names are
# two views of the same six registers.
3734 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
3735 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# output_l0/output_h0 share x6/x7 with input_l0/input_h0.
3736 my ($output_l0,$output_h0)=map("x$_",(6..7));
# x9-x15 in order: ctr32x=x9, ctr96_b64x=x10, ctr96_t32x=x11, rctr32x=x12,
# rk14_l=x13, rk14_h=x14, len=x15.
3739 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11/x12, i.e. of ctr96_t32x and rctr32x.
3740 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# Vector registers v0-v7: counter blocks in v0-v3, result blocks in v4-v7.
# The same eight registers are named three times: with a .16b arrangement,
# bare (so an arrangement or lane can be appended in the template), and as
# the d-register view of the low 64 bits.
3742 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3743 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3744 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
# q-register view of the result blocks only (q4-q7).
3745 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# GHASH accumulators: high, middle and low parts in v9, v10 and v11.
3747 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3748 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3749 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# Hash key material: h1-h4 in v12-v15, h12k in v16, h34k in v17.
3751 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3752 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3753 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# ctr_t0-ctr_t3 use v4-v7, the same registers as res0-res3.
3776 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
3777 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
3778 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# v8 holds the constant used by the GHASH reduction steps.
3780 my $mod_constantd="d8";
3781 my $mod_constant="v8";
# Round keys rk0-rk13 occupy v18-v31: .16b views for use as instruction
# operands, q views for use as ldr destinations.
3784 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
3785 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
3791 #########################################################################################
3792 # size_t aes_gcm_enc_256_kernel(const unsigned char *in,
3794 # unsigned char *out,
3796 # unsigned char ivec[16],
@ aes_gcm_enc_256_kernel - entry and set-up.
@ Saves x19-x24 and d8-d15 in a 112-byte stack frame, derives the input end
@ pointers, builds counter blocks 0-3, loads round keys rk0-rk13 into vector
@ registers and rk14 into a scalar pair, loads and pre-arranges the hash keys
@ and the current hash value, and runs AES rounds 0-13 on the first four
@ counter blocks. Branches to the tail when the message is at most 64 bytes.
3800 .global aes_gcm_enc_256_kernel
3801 .type aes_gcm_enc_256_kernel,%function
3803 aes_gcm_enc_256_kernel:
3804 AARCH64_VALID_CALL_TARGET
@ Early exit before any register is saved when x1 is zero.
@ NOTE(review): x1 is presumably the bit-length argument; the mapping of
@ arguments to registers is declared outside this excerpt.
3805 cbz x1, .L256_enc_ret
3806 stp x19, x20, [sp, #-112]!
3809 stp x21, x22, [sp, #16]
3810 stp x23, x24, [sp, #32]
3811 stp d8, d9, [sp, #48]
3812 stp d10, d11, [sp, #64]
3813 stp d12, d13, [sp, #80]
3814 stp d14, d15, [sp, #96]
3816 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3817 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
@ len keeps a copy of the byte length; main_end_input_ptr is modified below.
3818 mov $len, $main_end_input_ptr
3819 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
3821 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
@ main_end_input_ptr = input_ptr + ((byte_len - 1) rounded down to a multiple
@ of 64).
3822 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3824 ldr $rk0q, [$cc, #0] @ load rk0
3825 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3827 ldr $rk7q, [$cc, #112] @ load rk7
3828 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
@ The upper 32 bits of the second counter doubleword are the 32-bit block
@ counter; they are extracted here and byte-reversed a few lines below.
3830 lsr $rctr32x, $ctr96_t32x, #32
3831 fmov $ctr2d, $ctr96_b64x @ CTR block 2
@ Writing a w register clears bits 63:32 of the x register, so this orr leaves
@ only the low 32 bits in ctr96_t32x. The counter word is later placed in the
@ upper half by the orr instructions with lsl 32.
3832 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3834 rev $rctr32w, $rctr32w @ rev_ctr32
@ The flags from this cmp are consumed by the b.ge at the end of this block;
@ none of the instructions in between modifies the flags.
3835 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3836 fmov $ctr1d, $ctr96_b64x @ CTR block 1
3838 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3839 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
@ NOTE(review): ctr32w is presumably the 32-bit view of ctr32x; its
@ declaration is outside this excerpt.
3841 rev $ctr32w, $rctr32w @ CTR block 1
3842 fmov $ctr3d, $ctr96_b64x @ CTR block 3
3844 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
3845 add $rctr32w, $rctr32w, #1 @ CTR block 1
3846 ldr $rk1q, [$cc, #16] @ load rk1
3848 fmov $ctr1.d[1], $ctr32x @ CTR block 1
3849 rev $ctr32w, $rctr32w @ CTR block 2
3850 add $rctr32w, $rctr32w, #1 @ CTR block 2
3852 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
3853 ldr $rk2q, [$cc, #32] @ load rk2
3855 fmov $ctr2.d[1], $ctr32x @ CTR block 2
3856 rev $ctr32w, $rctr32w @ CTR block 3
3858 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3859 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
3861 fmov $ctr3.d[1], $ctr32x @ CTR block 3
3863 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3864 ldr $rk3q, [$cc, #48] @ load rk3
3866 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3867 ldr $rk6q, [$cc, #96] @ load rk6
3869 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3870 ldr $rk5q, [$cc, #80] @ load rk5
3872 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
@ Each hash key is loaded from a fixed offset relative to current_tag and has
@ its two 64-bit halves swapped by the ext that follows the load.
3873 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3874 ext $h3b, $h3b, $h3b, #8
3876 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3877 ldr $rk13q, [$cc, #208] @ load rk13
3879 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3880 ldr $rk4q, [$cc, #64] @ load rk4
3882 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3883 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3884 ext $h2b, $h2b, $h2b, #8
3886 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3887 ldr $rk12q, [$cc, #192] @ load rk12
3889 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3890 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3891 ext $h4b, $h4b, $h4b, #8
3893 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3894 ldr $rk11q, [$cc, #176] @ load rk11
3896 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3897 ldr $rk8q, [$cc, #128] @ load rk8
3899 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3900 add $rctr32w, $rctr32w, #1 @ CTR block 3
3902 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
@ The last round key goes into two general-purpose registers instead of a
@ vector register; it is applied with scalar XORs outside this block.
3903 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
3905 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
@ Load the current hash value; the ext and rev64 together reverse all 16
@ bytes of it.
3906 ld1 { $acc_lb}, [$current_tag]
3907 ext $acc_lb, $acc_lb, $acc_lb, #8
3908 rev64 $acc_lb, $acc_lb
3910 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3912 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3914 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3916 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3918 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3920 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3922 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3924 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3926 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
@ h34k is built in two steps: trn2 gathers one half of h3 and h4 here, and the
@ eor further down adds the other half (gathered by trn1 into acc_h), giving
@ the per-key XOR of both halves used for the Karatsuba middle products.
3927 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
3929 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3930 ldr $rk9q, [$cc, #144] @ load rk9
3932 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3933 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3934 ext $h1b, $h1b, $h1b, #8
3936 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3937 ldr $rk10q, [$cc, #160] @ load rk10
3939 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
@ acc_h serves as a temporary here; no GHASH product has been accumulated yet.
3940 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
3942 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3944 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3946 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3947 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
3949 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3951 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3953 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3955 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3957 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3959 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3961 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3963 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3965 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3967 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3969 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3971 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
3973 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
3975 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3977 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
3979 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
3981 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
3982 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3984 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
@ Round 13 is an aese without aesmc for every block; rk14 is not applied to
@ the counter blocks anywhere in this set-up code.
3986 aese $ctr2b, $rk13 @ AES block 2 - round 13
@ t0 serves as a temporary for the second half of the h12k construction.
3987 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3989 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
3991 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
3993 aese $ctr1b, $rk13 @ AES block 1 - round 13
3995 aese $ctr0b, $rk13 @ AES block 0 - round 13
3997 aese $ctr3b, $rk13 @ AES block 3 - round 13
3998 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
@ Taken when input_ptr >= main_end_input_ptr (flags from the cmp near the top
@ of this block), i.e. when the whole message fits in the tail.
3999 b.ge .L256_enc_tail @ handle tail
@ First four blocks (0-3), executed once before the main loop.
@ Loads 64 bytes of plaintext as scalar pairs, XORs each half with the rk14
@ scalar pair, moves the halves into ctr_t0-ctr_t3, XORs them with the
@ counter blocks that already went through rounds 0-13, and stores the four
@ ciphertext blocks. In parallel it prepares counter blocks 4-6 in
@ ctr0-ctr2 and leaves the upper half of counter block 7 in ctr32x.
@ Applying rk14 to the plaintext instead of the keystream gives the same
@ ciphertext because XOR is associative, and it moves that work onto the
@ integer registers.
@ Ends by branching to the pre-tail when no further 64-byte chunk is left for
@ the main loop.
4001 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
4003 rev $ctr32w, $rctr32w @ CTR block 4
4004 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
4006 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
4008 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
@ All four loads above use the old input_ptr; it is advanced only now.
4009 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4011 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
4012 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
4014 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
4015 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
4017 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
4018 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
4019 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
@ The flags from this cmp are consumed by the b.ge at the end of this block;
@ none of the instructions in between modifies the flags.
4021 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4022 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
4023 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
4025 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
4026 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
4028 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
4029 add $rctr32w, $rctr32w, #1 @ CTR block 4
4031 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4032 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
4033 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
4035 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
@ ctr_t0 and res0 name the same register, so the ciphertext replaces the
@ masked plaintext in place; ctr0 is free for the next counter block
@ immediately afterwards. The same holds for blocks 1-3 below.
4037 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
4038 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4040 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4041 rev $ctr32w, $rctr32w @ CTR block 5
4042 add $rctr32w, $rctr32w, #1 @ CTR block 5
4044 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
4045 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4046 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4048 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4049 rev $ctr32w, $rctr32w @ CTR block 6
4050 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
4052 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
4053 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4054 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
4056 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
4058 add $rctr32w, $rctr32w, #1 @ CTR block 6
4059 fmov $ctr2d, $ctr96_b64x @ CTR block 6
4061 fmov $ctr2.d[1], $ctr32x @ CTR block 6
4062 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
4063 rev $ctr32w, $rctr32w @ CTR block 7
@ Counter block 7 is only prepared in ctr32x here; it is moved into ctr3 by
@ the code that runs next, once the block 3 result has been computed from
@ the old ctr3.
4065 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
4067 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
4068 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
@ The target carries the leading dot so that it matches the label definition
@ and the other local-label branches in this kernel.
4069 b.ge .L256_enc_prepretail @ do prepretail
4071 .L256_enc_main_loop: @ main loop start
4072 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4073 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4075 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4076 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4078 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4079 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4081 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4082 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4084 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4085 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
4087 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4088 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
4090 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4091 eor $res0b, $res0b, $acc_lb @ PRE 1
4093 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4095 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4096 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
4098 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4099 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4101 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4102 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
4103 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4105 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4106 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4108 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4110 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4111 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4113 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4115 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4116 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4118 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4120 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4121 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4123 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4125 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4126 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4128 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4130 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4131 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4133 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4135 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4136 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4138 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4139 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4141 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4143 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4144 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4146 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4148 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4150 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4152 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4153 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4155 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4157 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4159 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4161 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4162 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4164 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4166 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4168 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4170 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4171 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4173 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4174 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
4176 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4177 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4179 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4180 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4182 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4184 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4185 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4187 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4188 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
4190 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4191 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4193 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4194 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
4196 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4197 movi $mod_constant.8b, #0xc2
4199 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4200 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4201 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
4203 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4204 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
4206 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4207 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4209 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4210 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4212 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4214 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4215 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4217 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4218 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4220 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4221 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4223 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4224 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4226 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4227 rev $ctr32w, $rctr32w @ CTR block 4k+8
4228 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4230 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4231 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4233 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4234 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4236 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4237 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4239 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4240 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
4241 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
4243 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4244 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
4246 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4247 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
4249 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4250 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
4252 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4253 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4254 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4256 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4257 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
4259 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4260 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
4262 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
4263 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4265 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
4267 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4268 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4269 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
4271 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
4272 rev $ctr32w, $rctr32w @ CTR block 4k+9
4273 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
4275 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
4276 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
4277 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
4279 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4280 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
4282 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4283 rev $ctr32w, $rctr32w @ CTR block 4k+10
4284 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
4286 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
4287 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4288 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
4290 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4291 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
4292 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
4294 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4295 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
4296 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
4298 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
4299 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
4300 rev $ctr32w, $rctr32w @ CTR block 4k+11
4302 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4303 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
4305 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
4306 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
4307 b.lt .L256_enc_main_loop @ LOOP CONTROL - iterate again while input_ptr < main_end_input_ptr (flags set by the LOOP CONTROL cmp earlier in this iteration)
4309 .L256_enc_prepretail: @ PREPRETAIL
4310 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4311 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4313 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4314 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4316 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4317 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4319 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4320 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4322 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4324 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4326 eor $res0b, $res0b, $acc_lb @ PRE 1
4327 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4329 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4331 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4332 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4334 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4336 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4337 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4339 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4341 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4343 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4344 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4346 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4348 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4350 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4352 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4354 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4356 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4358 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4360 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4361 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4363 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4364 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4366 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4368 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4369 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4371 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4372 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4374 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4376 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4377 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4378 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4380 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4382 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4384 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4385 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4387 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4389 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4390 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4392 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4394 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4395 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4397 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4399 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4401 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4403 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4405 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4407 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4408 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4410 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4412 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4414 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4416 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4417 movi $mod_constant.8b, #0xc2
4419 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4421 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4422 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4424 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4426 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4427 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4429 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4430 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4432 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4434 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4436 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4438 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4439 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4441 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4443 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
4445 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
4446 ext $acc_hb, $acc_hb, $acc_hb, #8
4448 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4450 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4451 eor $acc_mb, $acc_mb, $acc_lb
4453 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4455 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4457 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4459 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4460 eor $acc_mb, $acc_mb, $t1.16b
4462 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4464 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4466 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4468 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4469 eor $acc_mb, $acc_mb, $acc_hb
4471 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4473 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4475 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4477 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
4479 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4480 ext $acc_mb, $acc_mb, $acc_mb, #8
4482 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4484 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4485 eor $acc_lb, $acc_lb, $t1.16b
4487 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4489 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4491 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4493 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4494 eor $acc_lb, $acc_lb, $acc_mb
4495 .L256_enc_tail: @ TAIL
4497 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4498 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4499 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
4501 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4502 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4504 cmp $main_end_input_ptr, #48
4505 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4507 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4509 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4510 b.gt .L256_enc_blocks_more_than_3
4512 cmp $main_end_input_ptr, #32
4517 sub $rctr32w, $rctr32w, #1
4521 b.gt .L256_enc_blocks_more_than_2
4524 sub $rctr32w, $rctr32w, #1
4525 cmp $main_end_input_ptr, #16
4527 b.gt .L256_enc_blocks_more_than_1
4529 sub $rctr32w, $rctr32w, #1
4530 b .L256_enc_blocks_less_than_1
4531 .L256_enc_blocks_more_than_3: @ blocks left > 3
4532 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
4534 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
4536 rev64 $res0b, $res1b @ GHASH final-3 block
4538 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
4539 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4541 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
4543 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
4544 fmov $res1d, $input_l0 @ AES final-2 block - mov low
4546 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
4548 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4549 movi $t0.8b, #0 @ suppress further partial tag feed in
4551 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
4553 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
4555 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
4557 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
4558 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
4559 .L256_enc_blocks_more_than_2: @ blocks left > 2
4561 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
4563 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
4565 rev64 $res0b, $res1b @ GHASH final-2 block
4567 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
4568 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4570 fmov $res1d, $input_l0 @ AES final-1 block - mov low
4571 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
4573 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
4575 movi $t0.8b, #0 @ suppress further partial tag feed in
4577 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4578 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
4580 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4582 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4584 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
4586 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4588 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4590 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4592 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4593 .L256_enc_blocks_more_than_1: @ blocks left > 1
4595 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
4597 rev64 $res0b, $res1b @ GHASH final-1 block
4599 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
4601 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4603 movi $t0.8b, #0 @ suppress further partial tag feed in
4605 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
4606 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
4608 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4609 eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
4611 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4613 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4615 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4616 fmov $res1d, $input_l0 @ AES final block - mov low
4618 fmov $res1.d[1], $input_h0 @ AES final block - mov high
4620 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4622 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4624 eor $res1b, $res1b, $ctr3b @ AES final block - result
4625 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4627 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4628 .L256_enc_blocks_less_than_1: @ blocks left <= 1
4630 and $bit_length, $bit_length, #127 @ bit_length %= 128
4632 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
4633 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4635 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4636 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4638 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
4639 and $bit_length, $bit_length, #127 @ bit_length %= 128
4641 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
4642 cmp $bit_length, #64
4644 csel $input_l0, $rk14_l, $rk14_h, lt
4645 csel $input_h0, $rk14_h, xzr, lt
4647 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
4649 fmov $ctr0.d[1], $input_h0
4651 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4653 rev64 $res0b, $res1b @ GHASH final block
4655 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4657 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4659 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4660 mov $t0d, $res0.d[1] @ GHASH final block - mid
4661 rev $ctr32w, $rctr32w
4663 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4665 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4666 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4668 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4670 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4672 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4673 movi $mod_constant.8b, #0xc2
4675 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4677 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4679 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4681 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4683 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4685 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4687 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4689 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4691 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4693 str $ctr32w, [$counter, #12] @ store the updated counter
4695 st1 { $res1b}, [$output_ptr] @ store all 16B
4696 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4698 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4699 ext $acc_lb, $acc_lb, $acc_lb, #8
4700 rev64 $acc_lb, $acc_lb
4702 st1 { $acc_l.16b }, [$current_tag]
4704 ldp x21, x22, [sp, #16]
4705 ldp x23, x24, [sp, #32]
4706 ldp d8, d9, [sp, #48]
4707 ldp d10, d11, [sp, #64]
4708 ldp d12, d13, [sp, #80]
4709 ldp d14, d15, [sp, #96]
4710 ldp x19, x20, [sp], #112
4716 .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
4724 #########################################################################################
4725 # size_t aes_gcm_dec_256_kernel(const unsigned char *in,
4727 # unsigned char *out,
4729 # unsigned char ivec[16],
4733 .global aes_gcm_dec_256_kernel
4734 .type aes_gcm_dec_256_kernel,%function
4736 aes_gcm_dec_256_kernel:
4737 AARCH64_VALID_CALL_TARGET
4738 cbz x1, .L256_dec_ret
4739 stp x19, x20, [sp, #-112]!
4742 stp x21, x22, [sp, #16]
4743 stp x23, x24, [sp, #32]
4744 stp d8, d9, [sp, #48]
4745 stp d10, d11, [sp, #64]
4746 stp d12, d13, [sp, #80]
4747 stp d14, d15, [sp, #96]
4749 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4750 mov $len, $main_end_input_ptr
4751 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
4753 ldr $rk8q, [$cc, #128] @ load rk8
4754 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4756 ldr $rk7q, [$cc, #112] @ load rk7
4757 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4759 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
4760 ldr $rk6q, [$cc, #96] @ load rk6
4762 lsr $rctr32x, $ctr96_t32x, #32
4763 ldr $rk5q, [$cc, #80] @ load rk5
4764 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4766 ldr $rk3q, [$cc, #48] @ load rk3
4767 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4768 rev $rctr32w, $rctr32w @ rev_ctr32
4770 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
4771 fmov $ctr3d, $ctr96_b64x @ CTR block 3
4773 rev $ctr32w, $rctr32w @ CTR block 1
4774 add $rctr32w, $rctr32w, #1 @ CTR block 1
4775 fmov $ctr1d, $ctr96_b64x @ CTR block 1
4777 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
4778 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
4780 fmov $ctr1.d[1], $ctr32x @ CTR block 1
4781 rev $ctr32w, $rctr32w @ CTR block 2
4782 add $rctr32w, $rctr32w, #1 @ CTR block 2
4784 fmov $ctr2d, $ctr96_b64x @ CTR block 2
4785 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
4787 fmov $ctr2.d[1], $ctr32x @ CTR block 2
4788 rev $ctr32w, $rctr32w @ CTR block 3
4790 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
4791 ldr $rk0q, [$cc, #0] @ load rk0
4793 fmov $ctr3.d[1], $ctr32x @ CTR block 3
4794 add $rctr32w, $rctr32w, #1 @ CTR block 3
4796 ldr $rk4q, [$cc, #64] @ load rk4
4798 ldr $rk13q, [$cc, #208] @ load rk13
4800 ldr $rk1q, [$cc, #16] @ load rk1
4802 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4803 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4804 ext $h3b, $h3b, $h3b, #8
4806 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4807 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4808 ext $h4b, $h4b, $h4b, #8
4810 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4811 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4812 ext $h2b, $h2b, $h2b, #8
4814 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4815 ldr $rk2q, [$cc, #32] @ load rk2
4817 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4818 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
4820 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4821 ld1 { $acc_lb}, [$current_tag]
4822 ext $acc_lb, $acc_lb, $acc_lb, #8
4823 rev64 $acc_lb, $acc_lb
4825 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4826 ldr $rk9q, [$cc, #144] @ load rk9
4828 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4829 ldr $rk12q, [$cc, #192] @ load rk12
4831 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4832 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4833 ext $h1b, $h1b, $h1b, #8
4835 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4836 ldr $rk10q, [$cc, #160] @ load rk10
4838 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4840 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4842 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4844 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4846 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4847 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
4849 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4851 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4853 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4855 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4857 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4859 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
4861 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4863 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
4865 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4867 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
4869 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
4871 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
4873 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
4875 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
4877 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
4879 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
4881 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
4883 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
4885 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
4887 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
4889 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
4891 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
4892 ldr $rk11q, [$cc, #176] @ load rk11
4894 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
4896 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
4898 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
4900 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
4902 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
4904 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
4906 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
4908 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
4910 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
4912 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
4914 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
4916 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
4918 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
4920 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
4921 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
4923 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
4925 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
4927 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
4929 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
4930 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
4932 aese $ctr1b, $rk13 @ AES block 1 - round 13
4934 aese $ctr2b, $rk13 @ AES block 2 - round 13
4935 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
4937 aese $ctr3b, $rk13 @ AES block 3 - round 13
4939 aese $ctr0b, $rk13 @ AES block 0 - round 13
4940 b.ge .L256_dec_tail @ handle tail
4942 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
4944 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
4946 rev $ctr32w, $rctr32w @ CTR block 4
4948 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
4950 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
4951 rev64 $res1b, $res1b @ GHASH block 1
4952 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
4954 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
4956 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
4957 rev64 $res0b, $res0b @ GHASH block 0
4958 add $rctr32w, $rctr32w, #1 @ CTR block 4
4960 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4961 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4963 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4964 rev $ctr32w, $rctr32w @ CTR block 5
4965 add $rctr32w, $rctr32w, #1 @ CTR block 5
4967 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
4969 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4970 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
4971 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
4973 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
4974 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
4975 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4977 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
4978 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4980 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4981 rev $ctr32w, $rctr32w @ CTR block 6
4982 add $rctr32w, $rctr32w, #1 @ CTR block 6
4984 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
4985 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4987 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
4988 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
4990 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
4991 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4992 b.ge .L256_dec_prepretail @ do prepretail
4994 .L256_dec_main_loop: @ main loop start
4995 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
4996 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4997 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
4999 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5000 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5002 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5003 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5005 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5006 eor $res0b, $res0b, $acc_lb @ PRE 1
5007 rev $ctr32w, $rctr32w @ CTR block 4k+7
5009 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5010 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5012 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5013 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5015 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5016 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5017 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5019 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5020 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5022 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5023 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5025 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5026 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5028 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5029 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5031 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5032 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5034 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5035 rev64 $res2b, $res2b @ GHASH block 4k+2
5037 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5038 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5040 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5041 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5043 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5045 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5047 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5048 rev64 $res3b, $res3b @ GHASH block 4k+3
5050 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5051 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5053 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5054 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5055 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5057 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5059 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5060 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5062 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5063 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5065 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5066 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5068 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5069 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5071 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5072 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5074 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5076 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5077 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5079 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5081 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5082 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5084 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5085 rev $ctr32w, $rctr32w @ CTR block 4k+8
5087 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5088 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5090 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5091 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
5093 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5095 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5096 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5098 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5100 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5101 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5103 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5105 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5107 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5108 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5110 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5112 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5113 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
5114 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5116 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5118 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5119 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5121 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5123 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5124 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5126 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5128 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5129 movi $mod_constant.8b, #0xc2
5131 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5132 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5134 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5136 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5137 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5139 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5140 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5142 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5144 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5145 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5147 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5148 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
5150 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5151 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5153 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5154 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5156 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5157 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
5159 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5160 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
5162 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5163 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5165 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5166 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5168 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5169 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
5171 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5172 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
5174 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5175 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5177 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5178 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5180 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5181 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
5182 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5184 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5185 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
5187 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5188 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
5190 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5191 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
5192 rev $ctr32w, $rctr32w @ CTR block 4k+9
5194 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5195 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
5196 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5198 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
5200 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5201 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5203 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
5204 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
5205 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5207 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5208 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
5210 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
5211 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5213 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
5214 rev $ctr32w, $rctr32w @ CTR block 4k+10
5215 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
5217 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5218 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
5220 rev64 $res1b, $res1b @ GHASH block 4k+5
5221 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
5222 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
5224 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
5225 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
5227 rev64 $res0b, $res0b @ GHASH block 4k+4
5228 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5229 b.lt .L256_dec_main_loop
5232 .L256_dec_prepretail: @ PREPRETAIL
5233 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5234 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5235 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5237 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5238 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5240 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5241 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5243 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5244 rev $ctr32w, $rctr32w @ CTR block 4k+7
5245 eor $res0b, $res0b, $acc_lb @ PRE 1
5247 rev64 $res2b, $res2b @ GHASH block 4k+2
5248 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5249 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5251 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5252 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5254 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5255 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5256 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5258 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5259 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5261 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5262 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5264 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5265 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5267 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5269 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5270 rev64 $res3b, $res3b @ GHASH block 4k+3
5272 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5274 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5275 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5277 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5279 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5280 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5282 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5284 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5285 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5287 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5289 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5290 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5292 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5293 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5295 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5297 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5299 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5300 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5302 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5304 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5305 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5307 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5309 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5310 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5312 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5314 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5315 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5317 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5319 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5320 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5322 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5324 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5325 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5327 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5329 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5331 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5332 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5334 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5336 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5337 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5339 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5341 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5342 movi $mod_constant.8b, #0xc2
5344 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5345 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5347 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5349 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5350 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5352 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5354 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5355 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5357 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5359 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5360 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5362 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5364 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5365 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5367 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5369 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5370 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5372 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5374 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5375 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5377 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5379 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5380 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5382 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5384 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5386 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5387 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5389 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5390 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5392 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5393 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5395 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5396 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5398 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5399 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5401 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5403 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5404 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5406 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5407 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5409 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5410 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5412 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5413 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5415 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5416 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5418 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5420 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5422 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5424 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5425 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5426 .L256_dec_tail: @ TAIL
5428 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5429 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5431 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
5433 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5435 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5436 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5438 cmp $main_end_input_ptr, #48
5440 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5442 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5443 b.gt .L256_dec_blocks_more_than_3
5445 sub $rctr32w, $rctr32w, #1
5450 cmp $main_end_input_ptr, #32
5454 b.gt .L256_dec_blocks_more_than_2
5456 sub $rctr32w, $rctr32w, #1
5459 cmp $main_end_input_ptr, #16
5460 b.gt .L256_dec_blocks_more_than_1
5462 sub $rctr32w, $rctr32w, #1
5463 b .L256_dec_blocks_less_than_1
5464 .L256_dec_blocks_more_than_3: @ blocks left > 3
5465 rev64 $res0b, $res1b @ GHASH final-3 block
5466 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
5468 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
5470 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
5472 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5474 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
5476 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
5478 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
5480 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
5482 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5484 movi $t0.8b, #0 @ suppress further partial tag feed in
5486 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
5488 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
5489 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
5491 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
5492 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
5493 .L256_dec_blocks_more_than_2: @ blocks left > 2
5495 rev64 $res0b, $res1b @ GHASH final-2 block
5496 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
5498 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5499 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
5501 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
5503 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
5505 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5507 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5509 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5510 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
5512 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
5513 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5514 movi $t0.8b, #0 @ suppress further partial tag feed in
5516 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5518 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5519 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
5521 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5522 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
5523 .L256_dec_blocks_more_than_1: @ blocks left > 1
5525 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
5526 rev64 $res0b, $res1b @ GHASH final-1 block
5528 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
5530 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5531 movi $t0.8b, #0 @ suppress further partial tag feed in
5533 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
5535 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
5537 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
5539 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
5541 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
5542 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
5544 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
5546 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
5548 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
5549 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
5551 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
5553 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
5555 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
5556 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
5557 .L256_dec_blocks_less_than_1: @ blocks left <= 1
5559 and $bit_length, $bit_length, #127 @ bit_length %= 128
5560 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
5562 sub $bit_length, $bit_length, #128 @ bit_length -= 128
5563 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
5565 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5566 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
5568 and $bit_length, $bit_length, #127 @ bit_length %= 128
5570 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
5571 cmp $bit_length, #64
5573 csel $ctr32x, $rk14_l, $rk14_h, lt
5574 csel $ctr96_b64x, $rk14_h, xzr, lt
5576 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
5577 and $output_l0, $output_l0, $ctr32x
5579 mov $ctr0.d[1], $ctr96_b64x
5580 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
5582 rev $ctr32w, $rctr32w
5584 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
5586 orr $output_l0, $output_l0, $end_input_ptr
5588 and $output_h0, $output_h0, $ctr96_b64x
5590 orr $output_h0, $output_h0, $main_end_input_ptr
5592 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
5594 rev64 $res0b, $res1b @ GHASH final block
5596 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5598 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
5600 mov $t0d, $res0.d[1] @ GHASH final block - mid
5602 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
5604 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
5606 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
5608 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
5610 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
5612 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
5613 movi $mod_constant.8b, #0xc2
5615 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5617 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5619 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5621 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5623 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5625 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5627 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5629 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5631 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5633 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5635 stp $output_l0, $output_h0, [$output_ptr]
5637 str $ctr32w, [$counter, #12] @ store the updated counter
5639 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5640 ext $acc_lb, $acc_lb, $acc_lb, #8
5641 rev64 $acc_lb, $acc_lb
5643 st1 { $acc_l.16b }, [$current_tag]
5645 ldp x21, x22, [sp, #16]
5646 ldp x23, x24, [sp, #32]
5647 ldp d8, d9, [sp, #48]
5648 ldp d10, d11, [sp, #64]
5649 ldp d12, d13, [sp, #80]
5650 ldp d14, d15, [sp, #96]
5651 ldp x19, x20, [sp], #112
5657 .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
5663 .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the generated assembly held in $code; which set of rewrites
# is used depends on whether $flavour matches /64/.
5668 if ($flavour =~ /64/) { ######## 64-bit code
# Rewrite a "qN#lo|hi, qM#lo|hi" operand pair as "ins vN.d[i],vM.d[j]":
# register numbers of 8 and above are shifted up by 8, and "lo"/"hi"
# select lane 0/1.  Presumably the body of a helper that receives the
# operand string in $arg -- TODO confirm against the enclosing sub.
5672 $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
5673 sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
5674 $3<8?$3:$3+8,($4 eq "lo")?0:1;
# Walk the generated code one line at a time.
5676 foreach(split("\n",$code)) {
5677 s/@\s/\/\//o; # old->new style commentary
5680 } else { ######## 32-bit code
# Rewrite a "qN, qM[k]" operand pair as "vdup.32 qN,dX[l]": 32-bit word k
# of qM is word (k & 1) of d register 2*M + (k >> 1).
5684 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
5685 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
# Arguments: a mnemonic and its operand string.  Presumably this is the
# body of unvpmullp64, which the pmull/pmull2 substitution further down
# calls with ($1,$2) -- TODO confirm.
5688 my ($mnemonic,$arg)=@_;
5690 if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
# Assemble the 32-bit instruction word by hand: start from the base value
# 0xf2a00e00 and place the low three bits and bit 3 of each of the three
# q-register numbers into their separate fields.
5691 my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
5692 |(($2&7)<<17)|(($2&8)<<4)
5693 |(($3&7)<<1) |(($3&8)<<2);
# A mnemonic containing "2" sets two further bits (mask 0x00010001).
5694 $word |= 0x00010001 if ($mnemonic =~ "2");
5695 # since ARMv7 instructions are always encoded little-endian.
5696 # correct solution is to use .inst directive, but older
5697 # assemblers don't implement it:-(
# Emit the word as four separate bytes, least significant byte first,
# followed by an "@" comment built from two string arguments.
5698 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
5699 $word&0xff,($word>>8)&0xff,
5700 ($word>>16)&0xff,($word>>24)&0xff,
# Translate each generated line back to the older register names and
# comment syntax.  Note that the vN.<arrangement> pattern below only
# matches single-digit register numbers.
5705 foreach(split("\n",$code)) {
5706 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
5707 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
5708 s/\/\/\s?/@ /o; # new->old style commentary
5710 # fix up remaining new-style suffixes
# The substitutions below are chained with "or", so the chain stops at the
# first one that succeeds: at most one of these rewrites is applied to a
# given line.
5713 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
5714 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
5715 s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
5716 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
5717 s/^(\s+)b\./$1b/o or
5718 s/^(\s+)ret/$1bx\tlr/o;
# Turn "mov.<suffix>" at the start of an instruction into "mov<suffix>".
5720 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
# Close STDOUT explicitly so that a failure while flushing the generated
# output is reported as an error instead of being silently lost.
5728 close STDOUT or die "error closing STDOUT: $!"; # enforce flush