2 # Copyright 2019 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 #========================================================================
11 # Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
12 # derived from https://github.com/ARM-software/AArch64cryptolib, original
13 # author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14 # licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15 # obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16 #========================================================================
18 # Approach - assume we don't want to reload constants, so reserve ~half of the vector register file for constants.
20 # The main loop acts on 4 16B blocks per iteration, and then does a modulo reduction of the accumulated intermediate hashes from the 4 blocks.
22 # ____________________________________________________
25 # |____________________________________________________|
27 # | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28 # |________________|________________|__________________|
30 # | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31 # |________________|________________|__________________|
33 # | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34 # |________________|________________|__________________|
36 # | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37 # |________________|____(mostly)____|__________________|
40 # |____________________________________________________|
43 # Ensure the previously generated intermediate hash is aligned and merged with the result for GHASH 4k+0
44 # EXT low_acc, low_acc, low_acc, #8
45 # EOR res_curr (4k+0), res_curr (4k+0), low_acc
48 # Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49 # REV ctr32, rev_ctr32
50 # ORR ctr64, constctr96_top32, ctr32, LSL #32
51 # INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
52 # INS ctr_next.d[1], ctr64X
56 # Do AES encryption/decryption on CTR block X and EOR it with input block X. A 256-bit key is used below as an example.
57 # A small trick is used here: load the input into scalar registers, EOR it with the last round key, and then transfer it to the SIMD registers.
58 # Given how constrained we are in ASIMD registers, this is quite important.
61 # LDR input_low, [ input_ptr ], #8
62 # LDR input_high, [ input_ptr ], #8
63 # EOR input_low, k14_low
64 # EOR input_high, k14_high
65 # INS res_curr.d[0], input_low
66 # INS res_curr.d[1], input_high
67 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
68 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
69 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
70 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
71 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
72 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
73 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
74 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
75 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
76 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
77 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
78 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
79 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
81 # EOR res_curr, res_curr, ctr_curr
82 # ST1 { res_curr.16b }, [ output_ptr ], #16
85 # AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
86 # AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
87 # AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
88 # AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
89 # AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
90 # AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
91 # AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
92 # AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
93 # AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
94 # AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
95 # AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
96 # AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
97 # AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
99 # LDR res_curr, [ input_ptr ], #16
100 # EOR res_curr, res_curr, ctr_curr
101 # MOV output_low, res_curr.d[0]
102 # MOV output_high, res_curr.d[1]
103 # EOR output_low, k14_low
104 # EOR output_high, k14_high
105 # STP output_low, output_high, [ output_ptr ], #16
108 # do 128b karatsuba polynomial multiplication on block
109 # We only have 64b->128b polynomial multipliers, so naively we need to do 4 64b multiplies to generate a 128b multiplication:
112 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
114 # The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115 # Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
117 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118 # multiplying with "twisted" powers of H
120 # Note: We can PMULL directly into the acc_x in first GHASH of the loop
121 # Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122 # path latency dominates the performance
124 # This has a knock-on effect on register pressure, so we have to be a bit more clever with our temporary registers
125 # than indicated here.
126 # REV64 res_curr, res_curr
127 # INS t_m.d[0], res_curr.d[1]
128 # EOR t_m.8B, t_m.8B, res_curr.8B
129 # PMULL2 t_h, res_curr, HX
130 # PMULL t_l, res_curr, HX
131 # PMULL t_m, t_m, HX_k
132 # EOR acc_h, acc_h, t_h
133 # EOR acc_l, acc_l, t_l
134 # EOR acc_m, acc_m, t_m
136 # MODULO: take the partial accumulators (~representing the sum of the 256b multiplication results) from GHASH and do a modulo reduction on them.
137 # There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing the modulo
138 # with a reversed constant.
139 # EOR acc_m, acc_m, acc_h
140 # EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
141 # PMULL t_mod, acc_h, mod_constant
142 # EXT acc_h, acc_h, acc_h, #8
143 # EOR acc_m, acc_m, acc_h
144 # EOR acc_m, acc_m, t_mod
145 # PMULL acc_h, acc_m, mod_constant
146 # EXT acc_m, acc_m, acc_m, #8
147 # EOR acc_l, acc_l, acc_h
148 # EOR acc_l, acc_l, acc_m
# Driver prologue: pick the output file and assembler flavour out of @ARGV,
# locate the arm-xlate.pl translator, and open a pipe to it.
#
# $output is the last argument if it looks like a file (ends in ".ext");
# $flavour is the first argument if it does not contain a dot.
150 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
151 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# $dir is the directory part of this script's own path ($0); the translator
# is searched for next to the script first, then under ../../perlasm/.
153 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
154 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
155 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
156 die "can't locate arm-xlate.pl";
# Pipe into the translator, run by the same perl interpreter ($^X).  The
# open is checked so that a failure to start the translator aborts the
# build step instead of going unnoticed.
158 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
# Register name aliases used when the assembly text below is interpolated.
# Each "my (...) = map(...)" binds a list of Perl variables to consecutive
# AArch64 register names.  Several names deliberately share a register, so
# the values they stand for cannot be live at the same time.
161 $input_ptr="x0"; #argument block
# Scalar registers x4-x7: input end pointers and the low/high 64-bit halves
# of input block 0.
169 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# Scalar registers x19-x24: low/high halves of blocks 1-3.  The input_* and
# output_* names map to the same registers.  x19-x24 are saved by the
# function prologue further down.
170 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
171 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# output_l0/output_h0 share x6/x7 with input_l0/input_h0.
172 my ($output_l0,$output_h0)=map("x$_",(6..7));
# x9-x15: counter pieces (ctr32x, ctr96_b64x, ctr96_t32x, rctr32x), the two
# 64-bit halves of round key 10 (rk10_l, rk10_h) and len.
175 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11/x12 (ctr96_t32x and rctr32x above).
176 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# Vector registers v0-v3 hold the four counter blocks and v4-v7 the four
# result blocks; the b/bare/d/q suffixed names are different views
# (.16b arrangement, plain vN, dN, qN) of the same registers.
178 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
179 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
180 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
181 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# v9-v11: GHASH accumulators (high, mid, low).
183 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
184 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
185 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# v12-v15: h1..h4; v16/v17: h12k/h34k (per the load comments below, the
# hash key values and their combined halves for the "mid" multiplies).
187 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
188 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
189 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# v28-v30: temporaries t1..t3.
194 my ($t1,$t2,$t3)=map("v$_",(28..30));
195 my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
# ctr_t0..ctr_t3 share v4-v7 with res0..res3 above.
211 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
212 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
213 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# v8/d8: the constant used by the MODULO reduction steps.
215 my $mod_constantd="d8";
216 my $mod_constant="v8";
# v18-v27: AES round keys rk0..rk9 (as .16b operands and as qN for loads).
219 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
220 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
227 #include "arm_arch.h"
229 #if __ARM_MAX_ARCH__>=8
231 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
232 $code.=<<___ if ($flavour !~ /64/);
237 # define INST(a,b,c,d) $_byte c,0xef,a,b
240 # define INST(a,b,c,d) $_byte a,b,c,0xf2
246 #########################################################################################
247 # size_t aes_gcm_enc_128_kernel(const unsigned char *in,
249 # unsigned char *out,
251 # unsigned char ivec[16],
255 .global aes_gcm_enc_128_kernel
256 .type aes_gcm_enc_128_kernel,%function
258 aes_gcm_enc_128_kernel:
259 cbz x1, .L128_enc_ret
260 stp x19, x20, [sp, #-112]!
263 stp x21, x22, [sp, #16]
264 stp x23, x24, [sp, #32]
265 stp d8, d9, [sp, #48]
266 stp d10, d11, [sp, #64]
267 stp d12, d13, [sp, #80]
268 stp d14, d15, [sp, #96]
270 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
271 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
273 ld1 {$acc_lb}, [$current_tag]
274 ext $acc_lb, $acc_lb, $acc_lb, #8
275 rev64 $acc_lb, $acc_lb
276 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
277 mov $len, $main_end_input_ptr
279 ldr $rk9q, [$cc, #144] @ load rk9
280 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
281 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
283 lsr $rctr32x, $ctr96_t32x, #32
284 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
285 ext $h4b, $h4b, $h4b, #8
287 fmov $ctr1d, $ctr96_b64x @ CTR block 1
288 rev $rctr32w, $rctr32w @ rev_ctr32
290 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
291 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
292 ldr $rk0q, [$cc, #0] @ load rk0
294 rev $ctr32w, $rctr32w @ CTR block 1
295 add $rctr32w, $rctr32w, #1 @ CTR block 1
296 fmov $ctr3d, $ctr96_b64x @ CTR block 3
298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
299 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
301 fmov $ctr1.d[1], $ctr32x @ CTR block 1
302 rev $ctr32w, $rctr32w @ CTR block 2
304 fmov $ctr2d, $ctr96_b64x @ CTR block 2
305 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
306 add $rctr32w, $rctr32w, #1 @ CTR block 2
308 fmov $ctr2.d[1], $ctr32x @ CTR block 2
309 rev $ctr32w, $rctr32w @ CTR block 3
311 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
312 ldr $rk1q, [$cc, #16] @ load rk1
314 add $rctr32w, $rctr32w, #1 @ CTR block 3
315 fmov $ctr3.d[1], $ctr32x @ CTR block 3
317 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
318 ext $h3b, $h3b, $h3b, #8
320 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
321 ldr $rk2q, [$cc, #32] @ load rk2
323 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
324 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
325 ext $h1b, $h1b, $h1b, #8
327 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
328 ldr $rk8q, [$cc, #128] @ load rk8
330 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
331 ldr $rk3q, [$cc, #48] @ load rk3
333 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
334 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
336 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
337 ldr $rk6q, [$cc, #96] @ load rk6
339 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
340 ldr $rk7q, [$cc, #112] @ load rk7
342 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
343 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
345 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
346 ldr $rk5q, [$cc, #80] @ load rk5
348 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
349 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
350 ext $h2b, $h2b, $h2b, #8
352 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
354 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
355 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
357 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
359 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
361 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
362 ldr $rk4q, [$cc, #64] @ load rk4
364 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
366 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
367 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
369 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
370 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
372 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
373 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
375 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
377 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
379 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
381 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
383 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
385 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
387 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
388 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
390 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
392 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
394 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
396 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
398 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
400 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
402 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
404 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
406 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
408 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
410 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
412 aese $ctr2b, $rk9 @ AES block 2 - round 9
414 aese $ctr0b, $rk9 @ AES block 0 - round 9
416 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
418 aese $ctr1b, $rk9 @ AES block 1 - round 9
420 aese $ctr3b, $rk9 @ AES block 3 - round 9
421 b.ge .L128_enc_tail @ handle tail
423 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
425 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
427 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
429 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
431 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low
432 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high
434 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low
435 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
437 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low
438 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high
439 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
441 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
442 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high
444 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low
445 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
447 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
448 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high
449 rev $ctr32w, $rctr32w @ CTR block 4
451 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
452 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
454 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
455 fmov $ctr0d, $ctr96_b64x @ CTR block 4
456 add $rctr32w, $rctr32w, #1 @ CTR block 4
458 fmov $ctr0.d[1], $ctr32x @ CTR block 4
459 rev $ctr32w, $rctr32w @ CTR block 5
461 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
462 fmov $ctr1d, $ctr96_b64x @ CTR block 5
463 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
465 add $rctr32w, $rctr32w, #1 @ CTR block 5
466 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
467 fmov $ctr1.d[1], $ctr32x @ CTR block 5
469 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
470 rev $ctr32w, $rctr32w @ CTR block 6
471 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
473 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
474 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
476 add $rctr32w, $rctr32w, #1 @ CTR block 6
477 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
478 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
480 fmov $ctr2d, $ctr96_b64x @ CTR block 6
481 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
483 fmov $ctr2.d[1], $ctr32x @ CTR block 6
484 rev $ctr32w, $rctr32w @ CTR block 7
485 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
487 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
489 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
490 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
491 b.ge .L128_enc_prepretail @ do prepretail
493 .L128_enc_main_loop: @ main loop start
494 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
495 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
496 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
498 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
499 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
501 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
502 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
504 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
505 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
506 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
508 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
509 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
511 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
512 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
514 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
515 eor $res0b, $res0b, $acc_lb @ PRE 1
517 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
518 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high
520 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
521 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
522 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
524 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
525 rev $ctr32w, $rctr32w @ CTR block 4k+8
527 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
528 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
529 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
531 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
532 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
533 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
535 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
537 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
538 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
540 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
542 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
543 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
545 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
547 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
548 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
550 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
552 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
553 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
555 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
556 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
558 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
559 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
561 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
562 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
564 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
565 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
567 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
568 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
570 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
572 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
573 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
575 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
577 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
578 movi $mod_constant.8b, #0xc2
580 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
581 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
583 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
585 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
586 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
588 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
589 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
591 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
592 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
594 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
595 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
597 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
598 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
600 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
601 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
603 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
604 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low
606 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
607 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
609 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
610 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low
612 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
613 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
615 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
616 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
617 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
619 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
620 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
621 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
623 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
624 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
626 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
627 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
629 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
630 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high
632 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
633 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
635 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
636 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
638 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
639 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
641 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
642 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
644 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
645 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low
646 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high
648 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
649 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
651 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
652 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
654 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
655 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
657 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
658 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
660 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
661 rev $ctr32w, $rctr32w @ CTR block 4k+9
662 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
664 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
665 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
667 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
668 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
669 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
671 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
672 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
673 rev $ctr32w, $rctr32w @ CTR block 4k+10
675 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
676 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
677 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
678 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
680 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
681 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
682 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
683 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
685 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
686 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
688 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
689 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
690 rev $ctr32w, $rctr32w @ CTR block 4k+11
692 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
693 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
695 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
696 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
697 b.lt .L128_enc_main_loop
699 .L128_enc_prepretail: @ PREPRETAIL
700 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
701 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
702 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
704 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
705 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
706 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
708 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
709 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
711 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
713 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
714 eor $res0b, $res0b, $acc_lb @ PRE 1
716 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
718 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
719 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
721 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
722 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
724 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
725 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
727 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
728 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
730 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
732 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
733 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
735 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
737 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
738 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
740 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
742 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
743 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
745 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
747 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
748 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
750 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
751 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
753 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
755 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
756 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
758 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
760 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
762 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
763 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
765 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
767 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
768 movi $mod_constant.8b, #0xc2
770 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
771 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
773 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
775 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
776 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
778 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
780 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
781 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
783 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
785 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
786 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
788 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
789 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
791 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
793 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
794 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
796 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
798 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
799 ext $acc_hb, $acc_hb, $acc_hb, #8
801 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
803 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
804 eor $acc_mb, $acc_mb, $acc_lb
806 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
808 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
810 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
812 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
813 eor $acc_mb, $acc_mb, $t1.16b
815 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
817 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
819 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
821 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
822 eor $acc_mb, $acc_mb, $acc_hb
824 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
826 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
828 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
830 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
832 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
833 ext $acc_mb, $acc_mb, $acc_mb, #8
835 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
837 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
838 eor $acc_lb, $acc_lb, $t1.16b
840 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
842 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
844 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
846 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
848 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
849 eor $acc_lb, $acc_lb, $acc_mb
851 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
852 .L128_enc_tail: @ TAIL
854 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
855 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
857 cmp $main_end_input_ptr, #48
859 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
860 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low
861 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high
863 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
865 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
867 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
869 b.gt .L128_enc_blocks_more_than_3
871 sub $rctr32w, $rctr32w, #1
875 cmp $main_end_input_ptr, #32
880 b.gt .L128_enc_blocks_more_than_2
883 cmp $main_end_input_ptr, #16
885 sub $rctr32w, $rctr32w, #1
886 b.gt .L128_enc_blocks_more_than_1
888 sub $rctr32w, $rctr32w, #1
889 b .L128_enc_blocks_less_than_1
890 .L128_enc_blocks_more_than_3: @ blocks left > 3
891 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
893 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
895 rev64 $res0b, $res1b @ GHASH final-3 block
897 eor $res0b, $res0b, $t0.16b @ feed in partial tag
898 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high
899 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low
901 fmov $res1d, $input_l0 @ AES final-2 block - mov low
903 movi $t0.8b, #0 @ suppress further partial tag feed in
904 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
906 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
907 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
909 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
911 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
913 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
914 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
916 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
917 .L128_enc_blocks_more_than_2: @ blocks left > 2
919 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
921 rev64 $res0b, $res1b @ GHASH final-2 block
922 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
924 eor $res0b, $res0b, $t0.16b @ feed in partial tag
926 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low
928 fmov $res1d, $input_l0 @ AES final-1 block - mov low
929 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high
931 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
932 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
934 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
936 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
938 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
940 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
942 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
944 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
946 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
948 movi $t0.8b, #0 @ suppress further partial tag feed in
950 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
951 .L128_enc_blocks_more_than_1: @ blocks left > 1
953 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
955 rev64 $res0b, $res1b @ GHASH final-1 block
956 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
958 eor $res0b, $res0b, $t0.16b @ feed in partial tag
960 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high
961 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low
963 fmov $res1d, $input_l0 @ AES final block - mov low
965 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
966 fmov $res1.d[1], $input_h0 @ AES final block - mov high
968 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
970 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
972 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
974 eor $res1b, $res1b, $ctr3b @ AES final block - result
976 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
978 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
980 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
982 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
984 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
985 movi $t0.8b, #0 @ suppress further partial tag feed in
986 .L128_enc_blocks_less_than_1: @ blocks left <= 1
988 and $bit_length, $bit_length, #127 @ bit_length %= 128
989 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
991 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
992 sub $bit_length, $bit_length, #128 @ bit_length -= 128
994 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
996 and $bit_length, $bit_length, #127 @ bit_length %= 128
998 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1001 csel $input_l0, $rk10_l, $rk10_h, lt
1002 csel $input_h0, $rk10_h, xzr, lt
1004 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
1006 fmov $ctr0.d[1], $input_h0
1008 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1010 rev64 $res0b, $res1b @ GHASH final block
1012 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1014 mov $t0d, $res0.d[1] @ GHASH final block - mid
1016 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1017 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
1019 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1021 rev $ctr32w, $rctr32w
1023 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1025 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1027 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1029 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1031 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1032 movi $mod_constant.8b, #0xc2
1034 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1036 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1038 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1040 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1042 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1044 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1046 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1048 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1050 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1052 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
1054 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
1055 st1 { $res1b}, [$output_ptr] @ store all 16B
1057 str $ctr32w, [$counter, #12] @ store the updated counter
1059 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1060 ext $acc_lb, $acc_lb, $acc_lb, #8
1061 rev64 $acc_lb, $acc_lb
1063 st1 { $acc_l.16b }, [$current_tag]
1064 ldp x21, x22, [sp, #16]
1065 ldp x23, x24, [sp, #32]
1066 ldp d8, d9, [sp, #48]
1067 ldp d10, d11, [sp, #64]
1068 ldp d12, d13, [sp, #80]
1069 ldp d14, d15, [sp, #96]
1070 ldp x19, x20, [sp], #112
1076 .size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1079 #########################################################################################
1080 # size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1082 # unsigned char *out,
1084 # unsigned char ivec[16],
1088 .global aes_gcm_dec_128_kernel
1089 .type aes_gcm_dec_128_kernel,%function
1091 aes_gcm_dec_128_kernel:
1092 cbz x1, .L128_dec_ret
1093 stp x19, x20, [sp, #-112]!
1096 stp x21, x22, [sp, #16]
1097 stp x23, x24, [sp, #32]
1098 stp d8, d9, [sp, #48]
1099 stp d10, d11, [sp, #64]
1100 stp d12, d13, [sp, #80]
1101 stp d14, d15, [sp, #96]
1103 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
1104 mov $len, $main_end_input_ptr
1105 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1107 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
1108 ldr $rk0q, [$cc, #0] @ load rk0
1110 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1111 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
1113 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
1114 ext $h2b, $h2b, $h2b, #8
1116 lsr $rctr32x, $ctr96_t32x, #32
1117 fmov $ctr2d, $ctr96_b64x @ CTR block 2
1119 ldr $rk1q, [$cc, #16] @ load rk1
1120 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1121 rev $rctr32w, $rctr32w @ rev_ctr32
1123 fmov $ctr1d, $ctr96_b64x @ CTR block 1
1124 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
1126 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
1127 rev $ctr32w, $rctr32w @ CTR block 1
1129 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
1130 ldr $rk2q, [$cc, #32] @ load rk2
1131 add $rctr32w, $rctr32w, #1 @ CTR block 1
1133 fmov $ctr1.d[1], $ctr32x @ CTR block 1
1134 rev $ctr32w, $rctr32w @ CTR block 2
1135 add $rctr32w, $rctr32w, #1 @ CTR block 2
1137 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
1138 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
1140 fmov $ctr2.d[1], $ctr32x @ CTR block 2
1141 rev $ctr32w, $rctr32w @ CTR block 3
1143 fmov $ctr3d, $ctr96_b64x @ CTR block 3
1144 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
1145 add $rctr32w, $rctr32w, #1 @ CTR block 3
1147 fmov $ctr3.d[1], $ctr32x @ CTR block 3
1148 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
1150 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
1151 ldr $rk3q, [$cc, #48] @ load rk3
1153 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
1154 ldr $rk6q, [$cc, #96] @ load rk6
1156 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
1157 ldr $rk7q, [$cc, #112] @ load rk7
1159 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
1160 ldr $rk4q, [$cc, #64] @ load rk4
1162 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
1164 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
1166 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
1167 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10
1169 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
1170 ld1 { $acc_lb}, [$current_tag]
1171 ext $acc_lb, $acc_lb, $acc_lb, #8
1172 rev64 $acc_lb, $acc_lb
1174 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
1175 ldr $rk5q, [$cc, #80] @ load rk5
1177 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
1179 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
1181 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
1182 ldr $rk9q, [$cc, #144] @ load rk9
1184 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
1186 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
1188 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
1189 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
1190 ext $h3b, $h3b, $h3b, #8
1192 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
1193 ldr $rk8q, [$cc, #128] @ load rk8
1195 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
1197 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
1199 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
1201 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
1203 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
1204 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
1205 ext $h1b, $h1b, $h1b, #8
1207 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
1209 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
1211 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
1213 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
1215 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
1216 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
1218 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
1219 ext $h4b, $h4b, $h4b, #8
1220 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
1221 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1223 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
1225 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
1227 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
1228 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
1230 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
1232 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
1233 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
1235 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
1237 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
1239 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
1240 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
1242 aese $ctr2b, $rk9 @ AES block 2 - round 9
1244 aese $ctr3b, $rk9 @ AES block 3 - round 9
1246 aese $ctr0b, $rk9 @ AES block 0 - round 9
1247 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
1249 aese $ctr1b, $rk9 @ AES block 1 - round 9
1250 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
1251 b.ge .L128_dec_tail @ handle tail
1253 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
1255 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
1257 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
1258 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
1260 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
1261 rev64 $res0b, $res0b @ GHASH block 0
1262 rev $ctr32w, $rctr32w @ CTR block 4
1264 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
1265 add $rctr32w, $rctr32w, #1 @ CTR block 4
1266 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
1268 rev64 $res1b, $res1b @ GHASH block 1
1269 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
1270 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
1272 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
1274 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
1275 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
1277 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
1279 fmov $ctr0d, $ctr96_b64x @ CTR block 4
1281 fmov $ctr0.d[1], $ctr32x @ CTR block 4
1282 rev $ctr32w, $rctr32w @ CTR block 5
1283 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low
1285 fmov $ctr1d, $ctr96_b64x @ CTR block 5
1286 add $rctr32w, $rctr32w, #1 @ CTR block 5
1287 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
1289 fmov $ctr1.d[1], $ctr32x @ CTR block 5
1290 rev $ctr32w, $rctr32w @ CTR block 6
1291 add $rctr32w, $rctr32w, #1 @ CTR block 6
1293 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
1295 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 10 high
1296 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low
1297 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
1299 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high
1300 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
1302 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
1303 b.ge .L128_dec_prepretail @ do prepretail
@ ---------------------------------------------------------------------
@ AES-128-GCM decrypt main loop: one pass handles four 16-byte blocks.
@ Work interleaved in each pass (k is the pass index):
@   * finish ciphertext blocks 4k+2 and 4k+3: XOR with the keystream,
@     apply the round 10 key in scalar registers and store the result
@   * GHASH the byte-reversed ciphertext blocks 4k..4k+3 against h4, h3,
@     h2 and h1, with the karatsuba middle terms taken from h34k / h12k
@   * reduce the accumulated high/mid/low products using the constant
@     built by movi 0xc2 followed by shl 56
@   * run AES rounds 0-9 on counter blocks 4k+4..4k+7
@   * load ciphertext blocks 4k+4..4k+7, store the results for 4k+4 and
@     4k+5, and prepare counter blocks 4k+8..4k+10 for the next pass
@ The pass repeats while input_ptr compares below main_end_input_ptr.
@ ---------------------------------------------------------------------
.L128_dec_main_loop: @ main loop start
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
rev64 $res2b, $res2b @ GHASH block 4k+2
fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
rev $ctr32w, $rctr32w @ CTR block 4k+7
mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
eor $res0b, $res0b, $acc_lb @ PRE 1
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
rev64 $res3b, $res3b @ GHASH block 4k+3
pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
mov $t0d, $res0.d[1] @ GHASH block 4k - mid
aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
movi $mod_constant.8b, #0xc2
aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
shl $mod_constantd, $mod_constantd, #56 @ mod_constant
aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
rev $ctr32w, $rctr32w @ CTR block 4k+8
pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
add $input_ptr, $input_ptr, #64 @ AES input_ptr update
rev64 $res1b, $res1b @ GHASH block 4k+5
eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
rev $ctr32w, $rctr32w @ CTR block 4k+9
aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
@ The flags set by this compare are consumed by the b.lt at the bottom;
@ none of the instructions in between writes the condition flags.
cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
rev64 $res0b, $res0b @ GHASH block 4k+4
eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
rev $ctr32w, $rctr32w @ CTR block 4k+10
add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high
stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
eor $output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low
stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
@ Back edge: the target carries the same .L prefix as the label definition
@ above and as every other branch in this file, so it stays an assembler
@ local label without relying on a later pass to rewrite the name.
b.lt .L128_dec_main_loop
1512 .L128_dec_prepretail: @ PREPRETAIL
1513 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
1514 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
1515 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
1517 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
1518 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
1520 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
1521 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
1523 eor $res0b, $res0b, $acc_lb @ PRE 1
1524 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
1525 rev64 $res2b, $res2b @ GHASH block 4k+2
1527 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
1528 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
1530 rev $ctr32w, $rctr32w @ CTR block 4k+7
1531 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
1532 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
1534 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
1535 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
1536 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
1538 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
1539 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
1541 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
1542 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
1544 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
1545 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
1546 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
1548 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
1549 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
1551 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
1552 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
1554 rev64 $res3b, $res3b @ GHASH block 4k+3
1556 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
1557 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
1559 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
1561 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
1562 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
1564 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
1566 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
1567 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
1569 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
1571 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
1572 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
1574 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
1576 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
1578 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
1579 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
1581 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
1582 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
1584 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
1586 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
1587 movi $mod_constant.8b, #0xc2
1589 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
1590 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
1592 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
1594 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
1595 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
1597 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
1598 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low
1600 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
1601 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low
1602 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
1604 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
1606 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
1607 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1609 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
1611 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
1612 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
1614 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
1616 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
1617 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1619 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
1621 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
1623 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
1625 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
1626 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1628 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1630 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
1631 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1633 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
1635 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
1636 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1638 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
1640 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
1642 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
1644 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
1645 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1647 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
1649 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
1651 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9
1653 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1654 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high
1656 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
1657 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1659 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
1661 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
1662 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1664 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
1666 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
1667 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high
1669 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9
1670 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
1672 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9
1673 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
1674 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
1676 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9
1677 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1678 .L128_dec_tail: @ TAIL
1680 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
1681 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
1683 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
1685 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
1687 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
1689 cmp $main_end_input_ptr, #48
1691 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high
1693 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
1694 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low
1695 b.gt .L128_dec_blocks_more_than_3
1698 sub $rctr32w, $rctr32w, #1
1705 cmp $main_end_input_ptr, #32
1706 b.gt .L128_dec_blocks_more_than_2
1708 cmp $main_end_input_ptr, #16
1711 sub $rctr32w, $rctr32w, #1
1712 b.gt .L128_dec_blocks_more_than_1
1714 sub $rctr32w, $rctr32w, #1
1715 b .L128_dec_blocks_less_than_1
1716 .L128_dec_blocks_more_than_3: @ blocks left > 3
1717 rev64 $res0b, $res1b @ GHASH final-3 block
1718 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
1720 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1722 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
1723 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
1724 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
1726 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
1727 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
1729 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
1730 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
1732 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
1734 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
1736 movi $t0.8b, #0 @ suppress further partial tag feed in
1737 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high
1739 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
1740 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low
1741 .L128_dec_blocks_more_than_2: @ blocks left > 2
1743 rev64 $res0b, $res1b @ GHASH final-2 block
1744 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
1746 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1748 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
1749 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
1751 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
1753 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
1755 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
1756 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
1758 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
1759 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
1761 movi $t0.8b, #0 @ suppress further partial tag feed in
1763 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
1765 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low
1766 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
1768 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
1770 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
1771 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high
1772 .L128_dec_blocks_more_than_1: @ blocks left > 1
1774 rev64 $res0b, $res1b @ GHASH final-1 block
1776 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
1777 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1779 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
1781 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
1783 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
1785 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
1786 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
1788 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
1789 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
1791 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
1793 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
1795 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
1796 movi $t0.8b, #0 @ suppress further partial tag feed in
1798 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
1800 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
1801 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high
1803 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low
1804 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
1805 .L128_dec_blocks_less_than_1: @ blocks left <= 1
1807 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff
1808 and $bit_length, $bit_length, #127 @ bit_length %= 128
1810 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff
1811 sub $bit_length, $bit_length, #128 @ bit_length -= 128
1813 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
1815 and $bit_length, $bit_length, #127 @ bit_length %= 128
1817 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block
1818 cmp $bit_length, #64
1820 csel $ctr96_b64x, $rk10_h, xzr, lt
1821 csel $ctr32x, $rk10_l, $rk10_h, lt
1823 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
1825 mov $ctr0.d[1], $ctr96_b64x
1827 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
1829 rev64 $res0b, $res1b @ GHASH final block
1831 eor $res0b, $res0b, $t0.16b @ feed in partial tag
1833 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1835 and $output_h0, $output_h0, $ctr96_b64x
1837 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
1838 mov $t0d, $res0.d[1] @ GHASH final block - mid
1840 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
1841 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
1843 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
1845 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
1846 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
1847 and $output_l0, $output_l0, $ctr32x
1849 rev $ctr32w, $rctr32w
1851 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
1852 movi $mod_constant.8b, #0xc2
1854 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
1856 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
1857 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
1859 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
1861 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
1863 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
1865 orr $output_l0, $output_l0, $end_input_ptr
1866 str $ctr32w, [$counter, #12] @ store the updated counter
1868 orr $output_h0, $output_h0, $main_end_input_ptr
1869 stp $output_l0, $output_h0, [$output_ptr]
1870 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
1872 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
1874 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
1876 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
1877 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
1879 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
1881 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
1882 ext $acc_lb, $acc_lb, $acc_lb, #8
1883 rev64 $acc_lb, $acc_lb
1885 st1 { $acc_l.16b }, [$current_tag]
1887 ldp x21, x22, [sp, #16]
1888 ldp x23, x24, [sp, #32]
1889 ldp d8, d9, [sp, #48]
1890 ldp d10, d11, [sp, #64]
1891 ldp d12, d13, [sp, #80]
1892 ldp d14, d15, [sp, #96]
1893 ldp x19, x20, [sp], #112
1899 .size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
# Symbolic names for the AArch64 general-purpose registers referenced by the
# assembly templates that follow.  Each map() simply prefixes a register
# number with "x" (64-bit name) or "w" (32-bit name), so the left-hand list
# is assigned consecutive registers in order.
#
# x4..x7: end_input_ptr, main_end_input_ptr, input_l0, input_h0.
1904 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24: low/high 64-bit halves of blocks 1..3.  The kernel prologue
# stores x19-x24 to the stack before these are used.
1905 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# Deliberately the same registers (x19..x24) as the $input_* names above;
# the two name sets are alternative labels for one set of registers.
1906 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# x6/x7 again: $output_l0/$output_h0 share registers with $input_l0/$input_h0.
1907 my ($output_l0,$output_h0)=map("x$_",(6..7));
# x9..x15 in order: ctr32x=x9, ctr96_b64x=x10, ctr96_t32x=x11, rctr32x=x12,
# rk12_l=x13, rk12_h=x14, len=x15.
1910 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit names of x11/x12, i.e. the low words of
# $ctr96_t32x and $rctr32x declared on the previous line.
1911 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# Symbolic names for SIMD registers.  The same register number is exposed
# under several spellings because different instructions need different
# operand forms: "vN.16b" (byte-vector arrangement), bare "vN" (an
# arrangement or lane suffix is appended at the point of use), "dN"
# (low 64 bits) and "qN" (full 128 bits).
#
# v0..v3 hold the four counter blocks, v4..v7 the four result blocks.
1913 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
1914 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
1915 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
# Only the result registers (4..7) get a "q" spelling here.
1916 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# v9..v11: GHASH accumulator, in the order high, mid, low.
1918 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
1919 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
1920 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# v12..v15: h1..h4; v16/v17: h12k and h34k.  The "q" and ".16b" spellings
# below cover h1..h4 only (12..15), not h12k/h34k.
1922 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
1923 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
1924 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# v30/v31: temporaries t1 and t2.
1931 my ($t1,$t2)=map("v$_",(30..31));
1932 my ($t1d,$t2d)=map("d$_",(30..31));
# v4..v7 again: $ctr_t0..$ctr_t3 are the same registers as $res0..$res3
# above, so writing one name overwrites the value seen through the other.
1948 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
1949 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
1950 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# v8 (and its 64-bit name d8) is the register the assembly templates use
# for the "mod_constant" operand of the MODULO steps.
1952 my $mod_constantd="d8";
1953 my $mod_constant="v8";
# v18..v29: AES round keys rk0..rk11, as ".16b" operands (for aese) and as
# "q" names (for 128-bit ldr).  The list stops at rk11; rk12 is declared
# separately as the general-purpose pair $rk12_l/$rk12_h.
1956 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
1957 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
1963 #########################################################################################
1964 # size_t aes_gcm_enc_192_kernel(const unsigned char *in,
1966 # unsigned char *out,
1968 # unsigned char ivec[16],
1972 .global aes_gcm_enc_192_kernel
1973 .type aes_gcm_enc_192_kernel,%function
1975 aes_gcm_enc_192_kernel:
1976 cbz x1, .L192_enc_ret
1977 stp x19, x20, [sp, #-112]!
1980 stp x21, x22, [sp, #16]
1981 stp x23, x24, [sp, #32]
1982 stp d8, d9, [sp, #48]
1983 stp d10, d11, [sp, #64]
1984 stp d12, d13, [sp, #80]
1985 stp d14, d15, [sp, #96]
1987 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
1989 ldr $rk5q, [$cc, #80] @ load rk5
1991 ldr $rk4q, [$cc, #64] @ load rk4
1993 ldr $rk8q, [$cc, #128] @ load rk8
1995 lsr $rctr32x, $ctr96_t32x, #32
1996 ldr $rk6q, [$cc, #96] @ load rk6
1997 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1999 ldr $rk7q, [$cc, #112] @ load rk7
2000 rev $rctr32w, $rctr32w @ rev_ctr32
2002 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2003 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2005 rev $ctr32w, $rctr32w @ CTR block 1
2006 add $rctr32w, $rctr32w, #1 @ CTR block 1
2007 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2009 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2010 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2012 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2013 rev $ctr32w, $rctr32w @ CTR block 2
2014 add $rctr32w, $rctr32w, #1 @ CTR block 2
2016 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2017 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2019 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2020 rev $ctr32w, $rctr32w @ CTR block 3
2022 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2023 ldr $rk0q, [$cc, #0] @ load rk0
2025 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2027 ldr $rk3q, [$cc, #48] @ load rk3
2029 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2031 ldr $rk1q, [$cc, #16] @ load rk1
2033 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2034 ld1 { $acc_lb}, [$current_tag]
2035 ext $acc_lb, $acc_lb, $acc_lb, #8
2036 rev64 $acc_lb, $acc_lb
2038 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2039 ldr $rk11q, [$cc, #176] @ load rk11
2041 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2042 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2043 ext $h4b, $h4b, $h4b, #8
2045 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2046 ldr $rk2q, [$cc, #32] @ load rk2
2048 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2049 ldr $rk10q, [$cc, #160] @ load rk10
2051 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2052 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2053 ext $h1b, $h1b, $h1b, #8
2055 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2056 ldr $rk9q, [$cc, #144] @ load rk9
2058 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2059 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2060 ext $h3b, $h3b, $h3b, #8
2062 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2064 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2066 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2068 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2069 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2071 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2073 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2074 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2076 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2078 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2080 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2082 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2084 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2086 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2088 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2090 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2092 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2094 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2096 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2098 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2099 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2100 ext $h2b, $h2b, $h2b, #8
2102 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2104 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2106 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2108 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
2109 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2111 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2113 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
2115 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2116 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
2118 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
2120 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
2122 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
2124 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
2126 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
2128 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
2130 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
2132 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
2134 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
2136 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
2137 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2138 mov $len, $main_end_input_ptr
2140 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
2141 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
2143 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
2144 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2146 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
2148 aese $ctr2b, $rk11 @ AES block 2 - round 11
2149 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2150 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2152 aese $ctr1b, $rk11 @ AES block 1 - round 11
2153 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
2155 aese $ctr0b, $rk11 @ AES block 0 - round 11
2156 add $rctr32w, $rctr32w, #1 @ CTR block 3
2158 aese $ctr3b, $rk11 @ AES block 3 - round 11
2159 b.ge .L192_enc_tail @ handle tail
2161 rev $ctr32w, $rctr32w @ CTR block 4
2162 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
2164 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
2165 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
2167 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
2169 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
2170 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2171 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
2173 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low
2175 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high
2176 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high
2177 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
2179 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high
2180 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
2182 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low
2183 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low
2185 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
2186 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high
2188 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
2190 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low
2191 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
2193 add $rctr32w, $rctr32w, #1 @ CTR block 4
2194 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
2195 fmov $ctr0d, $ctr96_b64x @ CTR block 4
2197 fmov $ctr0.d[1], $ctr32x @ CTR block 4
2198 rev $ctr32w, $rctr32w @ CTR block 5
2200 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
2201 add $rctr32w, $rctr32w, #1 @ CTR block 5
2203 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
2204 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
2206 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
2208 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
2209 fmov $ctr1d, $ctr96_b64x @ CTR block 5
2210 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
2212 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
2214 fmov $ctr1.d[1], $ctr32x @ CTR block 5
2215 rev $ctr32w, $rctr32w @ CTR block 6
2217 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
2219 add $rctr32w, $rctr32w, #1 @ CTR block 6
2220 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
2221 fmov $ctr2d, $ctr96_b64x @ CTR block 6
2223 fmov $ctr2.d[1], $ctr32x @ CTR block 6
2224 rev $ctr32w, $rctr32w @ CTR block 7
2226 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
2227 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
2229 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
2230 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
2231 b.ge .L192_enc_prepretail @ do prepretail
2233 .L192_enc_main_loop: @ main loop start
2234 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2235 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2237 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2238 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
2240 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2241 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2242 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2244 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2245 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2247 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2248 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2249 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
2251 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2252 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext
2254 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2255 eor $res0b, $res0b, $acc_lb @ PRE 1
2257 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2259 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2260 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2262 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2263 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high
2265 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2266 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2268 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2270 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2271 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low
2273 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2274 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2276 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2277 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low
2279 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2280 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2282 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2283 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2285 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2287 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2289 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2290 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2292 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2293 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2295 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2297 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2298 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2300 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2302 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2303 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high
2304 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2306 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2307 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2309 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2310 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2312 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2313 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high
2315 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2316 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low
2317 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2319 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2320 rev $ctr32w, $rctr32w @ CTR block 4k+8
2322 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2323 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
2325 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2326 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2328 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2329 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
2331 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2332 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2334 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2335 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
2337 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2338 movi $mod_constant.8b, #0xc2
2340 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2341 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2342 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2344 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2345 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2347 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2348 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2350 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2351 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2353 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2354 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
2356 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2357 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2359 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2360 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
2362 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2363 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2365 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2366 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
2367 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2369 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2370 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2372 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2373 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low
2375 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2376 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2377 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
2379 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2380 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high
2382 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2383 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2384 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
2386 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2388 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2389 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2391 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2393 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2395 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2397 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2398 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2400 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2402 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2404 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2406 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2407 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2409 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2411 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2412 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
2414 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2415 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
2416 rev $ctr32w, $rctr32w @ CTR block 4k+9
2418 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2419 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
2420 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
2422 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2423 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
2425 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
2426 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
2427 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
2429 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2430 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
2431 rev $ctr32w, $rctr32w @ CTR block 4k+10
2433 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
2434 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2435 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
2437 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
2438 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2440 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2441 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
2442 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
2444 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
2445 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
2446 rev $ctr32w, $rctr32w @ CTR block 4k+11
2448 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2449 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
2451 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result
2452 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result
2453 b.lt .L192_enc_main_loop
2455 .L192_enc_prepretail: @ PREPRETAIL
2456 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
2457 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
2459 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
2460 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
2461 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
2463 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
2464 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
2466 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
2468 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
2469 eor $res0b, $res0b, $acc_lb @ PRE 1
2470 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
2472 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
2473 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
2475 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
2477 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
2478 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
2480 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
2481 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2483 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
2485 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
2486 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
2488 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
2489 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
2491 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
2492 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
2494 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
2496 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
2497 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
2499 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
2501 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
2502 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
2504 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
2506 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
2507 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
2509 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
2510 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
2512 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
2514 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
2515 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
2517 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
2519 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
2521 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
2523 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
2524 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
2526 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
2528 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
2529 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
2531 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
2533 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
2534 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
2536 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
2538 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
2539 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
2541 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
2543 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
2544 movi $mod_constant.8b, #0xc2
2546 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
2548 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
2550 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
2551 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
2553 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
2555 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
2557 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
2558 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
2560 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
2562 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
2563 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
2565 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
2567 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
2568 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2570 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
2572 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
2573 eor $acc_mb, $acc_mb, $acc_lb
2575 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
2577 pmull $t1.1q, $acc_h.1d, $mod_constant.1d
2579 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
2580 ext $acc_hb, $acc_hb, $acc_hb, #8
2582 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
2584 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
2585 eor $acc_mb, $acc_mb, $t1.16b
2587 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
2589 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
2591 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
2593 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
2594 eor $acc_mb, $acc_mb, $acc_hb
2596 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
2598 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
2600 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
2602 pmull $t1.1q, $acc_m.1d, $mod_constant.1d
2604 ext $acc_mb, $acc_mb, $acc_mb, #8
2606 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
2608 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
2610 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
2612 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
2613 eor $acc_lb, $acc_lb, $t1.16b
2615 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
2617 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
2619 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
2621 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
2622 eor $acc_lb, $acc_lb, $acc_mb
2623 .L192_enc_tail: @ TAIL
2625 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
2626 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
2628 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low
2629 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high
2631 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
2633 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
2634 cmp $main_end_input_ptr, #48
2636 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
2638 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
2639 b.gt .L192_enc_blocks_more_than_3
2641 sub $rctr32w, $rctr32w, #1
2646 cmp $main_end_input_ptr, #32
2650 b.gt .L192_enc_blocks_more_than_2
2652 sub $rctr32w, $rctr32w, #1
2655 cmp $main_end_input_ptr, #16
2656 b.gt .L192_enc_blocks_more_than_1
2658 sub $rctr32w, $rctr32w, #1
2659 b .L192_enc_blocks_less_than_1
2660 .L192_enc_blocks_more_than_3: @ blocks left > 3
2661 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
2663 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
2665 rev64 $res0b, $res1b @ GHASH final-3 block
2667 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low
2668 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2670 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high
2671 fmov $res1d, $input_l0 @ AES final-2 block - mov low
2673 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
2675 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
2677 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
2679 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
2681 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
2683 movi $t0.8b, #0 @ suppress further partial tag feed in
2685 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
2687 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
2688 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
2689 .L192_enc_blocks_more_than_2: @ blocks left > 2
2691 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
2693 rev64 $res0b, $res1b @ GHASH final-2 block
2694 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
2696 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2698 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high
2700 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
2701 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
2703 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
2704 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low
2706 fmov $res1d, $input_l0 @ AES final-1 block - mov low
2708 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
2709 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
2710 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
2712 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
2714 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
2716 movi $t0.8b, #0 @ suppress further partial tag feed in
2718 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
2720 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
2721 .L192_enc_blocks_more_than_1: @ blocks left > 1
2723 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
2725 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
2727 rev64 $res0b, $res1b @ GHASH final-1 block
2729 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low
2730 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2731 movi $t0.8b, #0 @ suppress further partial tag feed in
2733 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
2735 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
2736 eor $input_h0, $input_h0, $rk12_h @ AES final block - round 12 high
2737 fmov $res1d, $input_l0 @ AES final block - mov low
2739 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
2740 fmov $res1.d[1], $input_h0 @ AES final block - mov high
2742 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
2744 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
2746 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
2748 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
2750 eor $res1b, $res1b, $ctr3b @ AES final block - result
2752 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
2754 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
2755 .L192_enc_blocks_less_than_1: @ blocks left <= 1
2757 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
2758 rev $ctr32w, $rctr32w
2759 and $bit_length, $bit_length, #127 @ bit_length %= 128
2761 sub $bit_length, $bit_length, #128 @ bit_length -= 128
2762 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
2764 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
2765 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
2767 and $bit_length, $bit_length, #127 @ bit_length %= 128
2769 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
2770 cmp $bit_length, #64
2772 csel $input_l0, $rk12_l, $rk12_h, lt
2773 csel $input_h0, $rk12_h, xzr, lt
2775 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
2777 fmov $ctr0.d[1], $input_h0
2779 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
2781 rev64 $res0b, $res1b @ GHASH final block
2783 eor $res0b, $res0b, $t0.16b @ feed in partial tag
2785 mov $t0d, $res0.d[1] @ GHASH final block - mid
2787 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
2789 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
2791 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
2793 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
2795 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
2797 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
2799 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
2800 movi $mod_constant.8b, #0xc2
2802 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
2804 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
2806 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
2808 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
2810 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
2812 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
2814 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
2816 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
2818 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
2820 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
2822 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
2823 str $ctr32w, [$counter, #12] @ store the updated counter
2825 st1 { $res1b}, [$output_ptr] @ store all 16B
2827 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
2828 ext $acc_lb, $acc_lb, $acc_lb, #8
2829 rev64 $acc_lb, $acc_lb
2831 st1 { $acc_l.16b }, [$current_tag]
2833 ldp x21, x22, [sp, #16]
2834 ldp x23, x24, [sp, #32]
2835 ldp d8, d9, [sp, #48]
2836 ldp d10, d11, [sp, #64]
2837 ldp d12, d13, [sp, #80]
2838 ldp d14, d15, [sp, #96]
2839 ldp x19, x20, [sp], #112
2845 .size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
2848 #########################################################################################
2849 # size_t aes_gcm_dec_192_kernel(const unsigned char *in,
2851 # unsigned char *out,
2853 # unsigned char ivec[16],
2857 .global aes_gcm_dec_192_kernel
2858 .type aes_gcm_dec_192_kernel,%function
2860 aes_gcm_dec_192_kernel:
2861 cbz x1, .L192_dec_ret
2862 stp x19, x20, [sp, #-112]!
2865 stp x21, x22, [sp, #16]
2866 stp x23, x24, [sp, #32]
2867 stp d8, d9, [sp, #48]
2868 stp d10, d11, [sp, #64]
2869 stp d12, d13, [sp, #80]
2870 stp d14, d15, [sp, #96]
2872 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
2873 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
2875 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
2877 ldr $rk0q, [$cc, #0] @ load rk0
2879 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
2880 mov $len, $main_end_input_ptr
2881 ldr $rk2q, [$cc, #32] @ load rk2
2883 lsr $rctr32x, $ctr96_t32x, #32
2884 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2885 fmov $ctr3d, $ctr96_b64x @ CTR block 3
2887 rev $rctr32w, $rctr32w @ rev_ctr32
2888 fmov $ctr1d, $ctr96_b64x @ CTR block 1
2890 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
2891 ldr $rk1q, [$cc, #16] @ load rk1
2893 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
2894 rev $ctr32w, $rctr32w @ CTR block 1
2896 add $rctr32w, $rctr32w, #1 @ CTR block 1
2897 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
2898 ldr $rk3q, [$cc, #48] @ load rk3
2900 fmov $ctr1.d[1], $ctr32x @ CTR block 1
2901 rev $ctr32w, $rctr32w @ CTR block 2
2902 add $rctr32w, $rctr32w, #1 @ CTR block 2
2904 fmov $ctr2d, $ctr96_b64x @ CTR block 2
2905 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
2907 fmov $ctr2.d[1], $ctr32x @ CTR block 2
2908 rev $ctr32w, $rctr32w @ CTR block 3
2910 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
2911 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
2913 fmov $ctr3.d[1], $ctr32x @ CTR block 3
2915 ldr $rk8q, [$cc, #128] @ load rk8
2917 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
2919 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
2920 ldr $rk11q, [$cc, #176] @ load rk11
2922 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
2923 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
2924 ext $h4b, $h4b, $h4b, #8
2926 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
2927 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
2928 ext $h2b, $h2b, $h2b, #8
2930 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
2931 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
2932 ext $h3b, $h3b, $h3b, #8
2934 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
2935 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12
2937 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
2938 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
2939 ext $h1b, $h1b, $h1b, #8
2941 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
2942 ldr $rk10q, [$cc, #160] @ load rk10
2944 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
2945 ldr $rk9q, [$cc, #144] @ load rk9
2947 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
2948 ldr $rk7q, [$cc, #112] @ load rk7
2950 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
2951 ldr $rk4q, [$cc, #64] @ load rk4
2953 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
2954 ld1 { $acc_lb}, [$current_tag]
2955 ext $acc_lb, $acc_lb, $acc_lb, #8
2956 rev64 $acc_lb, $acc_lb
2958 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
2959 add $rctr32w, $rctr32w, #1 @ CTR block 3
2961 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
2962 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
2964 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
2965 ldr $rk5q, [$cc, #80] @ load rk5
2967 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
2968 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
2970 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
2972 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
2973 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
2975 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
2976 ldr $rk6q, [$cc, #96] @ load rk6
2978 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
2980 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
2982 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
2984 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
2986 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
2988 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
2990 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
2992 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
2994 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
2996 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
2998 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3000 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3002 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3004 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3006 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3008 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3009 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3011 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3012 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3014 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3015 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3017 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3018 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3020 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3021 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3023 aese $ctr3b, $rk11 @ AES block 3 - round 11
3025 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3027 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3029 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3030 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3032 aese $ctr2b, $rk11 @ AES block 2 - round 11
3034 aese $ctr1b, $rk11 @ AES block 1 - round 11
3035 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3037 aese $ctr0b, $rk11 @ AES block 0 - round 11
3038 b.ge .L192_dec_tail @ handle tail
3040 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
3042 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
3044 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
3046 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
3047 rev $ctr32w, $rctr32w @ CTR block 4
3048 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
3050 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
3052 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
3054 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
3056 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
3057 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
3058 add $rctr32w, $rctr32w, #1 @ CTR block 4
3060 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
3061 rev64 $res0b, $res0b @ GHASH block 0
3062 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3064 fmov $ctr0d, $ctr96_b64x @ CTR block 4
3065 rev64 $res1b, $res1b @ GHASH block 1
3066 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
3068 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low
3069 fmov $ctr0.d[1], $ctr32x @ CTR block 4
3070 rev $ctr32w, $rctr32w @ CTR block 5
3072 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
3073 fmov $ctr1d, $ctr96_b64x @ CTR block 5
3074 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high
3076 add $rctr32w, $rctr32w, #1 @ CTR block 5
3077 fmov $ctr1.d[1], $ctr32x @ CTR block 5
3078 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low
3080 rev $ctr32w, $rctr32w @ CTR block 6
3081 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high
3083 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
3084 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
3086 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
3088 add $rctr32w, $rctr32w, #1 @ CTR block 6
3089 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
3090 b.ge .L192_dec_prepretail @ do prepretail
3092 .L192_dec_main_loop: @ main loop start
3093 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3094 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3096 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3097 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3099 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3100 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3101 rev64 $res3b, $res3b @ GHASH block 4k+3
3103 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3104 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3106 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3107 eor $res0b, $res0b, $acc_lb @ PRE 1
3109 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3110 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3112 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3113 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3115 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3116 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3118 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3119 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3120 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3122 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3123 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3124 rev $ctr32w, $rctr32w @ CTR block 4k+7
3126 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3127 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3129 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3130 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3131 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3133 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3135 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3136 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3138 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3139 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3141 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3143 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3144 rev64 $res2b, $res2b @ GHASH block 4k+2
3146 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3148 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3149 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3150 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3152 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3154 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3156 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3157 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3159 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3160 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3162 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3164 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3165 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3167 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3169 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3171 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3172 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3174 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3176 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3178 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3179 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3181 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3183 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3184 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3186 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3188 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3189 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3191 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3193 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3194 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3196 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3198 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3199 movi $mod_constant.8b, #0xc2
3201 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3203 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3204 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3206 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3208 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3209 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3211 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3213 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3214 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3216 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3218 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3219 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3221 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3223 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3224 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3226 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3228 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3229 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
3231 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3232 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3234 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3235 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
3236 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3238 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3239 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3241 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11
3242 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3244 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3245 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3247 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3248 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
3250 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11
3251 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
3252 rev $ctr32w, $rctr32w @ CTR block 4k+8
3254 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3255 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3257 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3258 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3260 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
3261 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
3263 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
3264 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3265 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
3267 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3268 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
3270 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3272 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3273 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
3275 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3276 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3277 rev64 $res1b, $res1b @ GHASH block 4k+5
3279 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11
3280 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3282 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3283 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
3285 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
3286 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
3287 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3289 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
3290 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
3291 rev $ctr32w, $rctr32w @ CTR block 4k+9
3293 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3294 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
3295 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3297 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
3298 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
3299 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low
3301 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
3302 rev $ctr32w, $rctr32w @ CTR block 4k+10
3303 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high
3305 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3306 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
3307 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3309 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
3310 rev64 $res0b, $res0b @ GHASH block 4k+4
3311 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
3313 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11
3314 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
3315 b.lt .L192_dec_main_loop
3317 .L192_dec_prepretail: @ PREPRETAIL
3318 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
3319 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
3320 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
3322 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
3323 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
3325 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
3326 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
3328 eor $res0b, $res0b, $acc_lb @ PRE 1
3329 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
3331 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
3332 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
3334 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
3335 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
3337 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
3338 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
3339 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
3341 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
3342 rev64 $res2b, $res2b @ GHASH block 4k+2
3344 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
3345 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
3346 rev $ctr32w, $rctr32w @ CTR block 4k+7
3348 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
3349 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
3350 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
3352 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
3353 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high
3354 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
3356 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
3357 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low
3359 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
3360 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high
3361 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
3363 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
3364 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low
3365 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
3367 rev64 $res3b, $res3b @ GHASH block 4k+3
3368 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
3370 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
3371 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
3373 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
3374 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
3376 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
3377 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
3379 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
3381 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
3382 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
3384 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
3386 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
3387 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
3389 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
3391 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
3393 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
3394 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
3396 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
3397 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
3399 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
3401 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
3402 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
3404 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
3406 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
3407 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
3409 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
3411 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
3412 movi $mod_constant.8b, #0xc2
3414 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
3416 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
3418 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3419 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
3421 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
3422 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
3424 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
3426 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3427 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
3429 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
3431 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
3432 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
3434 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
3436 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
3437 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3439 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
3441 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
3442 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3444 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
3446 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
3447 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3449 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
3451 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
3453 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
3455 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
3457 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
3458 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3460 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
3462 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
3464 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
3466 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
3467 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3469 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
3471 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
3473 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
3475 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
3477 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
3479 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3481 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
3483 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
3485 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
3486 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3488 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
3491 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3499 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3500 .L192_dec_tail: @ TAIL
3502 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
3503 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
3505 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
3507 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
3509 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
3511 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
3513 cmp $main_end_input_ptr, #48
3515 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high
3517 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low
3518 b.gt .L192_dec_blocks_more_than_3
3525 sub $rctr32w, $rctr32w, #1
3528 cmp $main_end_input_ptr, #32
3529 b.gt .L192_dec_blocks_more_than_2
3532 cmp $main_end_input_ptr, #16
3533 sub $rctr32w, $rctr32w, #1
3535 b.gt .L192_dec_blocks_more_than_1
3537 sub $rctr32w, $rctr32w, #1
3538 b .L192_dec_blocks_less_than_1
3539 .L192_dec_blocks_more_than_3: @ blocks left > 3
3540 rev64 $res0b, $res1b @ GHASH final-3 block
3541 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
3543 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
3545 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3547 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
3549 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
3550 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
3551 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
3553 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
3555 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
3556 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
3558 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
3560 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low
3561 movi $t0.8b, #0 @ suppress further partial tag feed in
3563 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
3564 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high
3565 .L192_dec_blocks_more_than_2: @ blocks left > 2
3567 rev64 $res0b, $res1b @ GHASH final-2 block
3568 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
3570 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3572 movi $t0.8b, #0 @ suppress further partial tag feed in
3574 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
3576 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
3578 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
3580 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
3582 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
3583 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
3585 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
3586 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
3588 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
3590 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
3592 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
3593 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high
3595 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low
3596 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
3597 .L192_dec_blocks_more_than_1: @ blocks left > 1
3599 rev64 $res0b, $res1b @ GHASH final-1 block
3601 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3602 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
3604 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
3606 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
3608 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
3609 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
3611 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
3613 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
3615 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
3616 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
3618 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
3619 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
3621 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
3623 movi $t0.8b, #0 @ suppress further partial tag feed in
3624 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
3625 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high
3627 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low
3629 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
3630 .L192_dec_blocks_less_than_1: @ blocks left <= 1
3632 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff
3633 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
3634 and $bit_length, $bit_length, #127 @ bit_length %= 128
3636 sub $bit_length, $bit_length, #128 @ bit_length -= 128
3638 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
3640 and $bit_length, $bit_length, #127 @ bit_length %= 128
3641 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff
3643 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block
3644 cmp $bit_length, #64
3646 csel $ctr32x, $rk12_l, $rk12_h, lt
3647 csel $ctr96_b64x, $rk12_h, xzr, lt
3649 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
3650 and $output_l0, $output_l0, $ctr32x
3651 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
3653 orr $output_l0, $output_l0, $end_input_ptr
3654 mov $ctr0.d[1], $ctr96_b64x
3656 rev $ctr32w, $rctr32w
3658 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
3659 str $ctr32w, [$counter, #12] @ store the updated counter
3661 rev64 $res0b, $res1b @ GHASH final block
3663 eor $res0b, $res0b, $t0.16b @ feed in partial tag
3664 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3666 and $output_h0, $output_h0, $ctr96_b64x
3668 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
3669 mov $t0d, $res0.d[1] @ GHASH final block - mid
3671 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
3673 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
3675 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
3677 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
3679 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
3681 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
3682 movi $mod_constant.8b, #0xc2
3684 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
3686 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
3688 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
3690 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
3691 orr $output_h0, $output_h0, $main_end_input_ptr
3692 stp $output_l0, $output_h0, [$output_ptr]
3694 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
3696 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
3698 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
3700 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
3702 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
3704 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
3706 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
3707 ext $acc_lb, $acc_lb, $acc_lb, #8
3708 rev64 $acc_lb, $acc_lb
3710 st1 { $acc_l.16b }, [$current_tag]
3712 ldp x21, x22, [sp, #16]
3713 ldp x23, x24, [sp, #32]
3714 ldp d8, d9, [sp, #48]
3715 ldp d10, d11, [sp, #64]
3716 ldp d12, d13, [sp, #80]
3717 ldp d14, d15, [sp, #96]
3718 ldp x19, x20, [sp], #112
3724 .size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
# Register allocation for the 256-bit AES-GCM kernels.
# Each Perl scalar below holds the textual name of an AArch64 register and is
# interpolated into the assembly heredocs that follow. Several names
# intentionally map to the SAME register (noted per group); changing any
# range here changes which registers the kernels clobber.
# NOTE(review): names such as $ctr32w, $t0..$t9, $mod_t, $rk2q1, $rk3q1,
# $rk4v and $rk4d are used by the kernels but are not declared in this
# group - presumably declared nearby; confirm before editing.

# General-purpose registers x4..x7: input end pointers and the low/high
# 64-bit halves of block 0.
3729 my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24: low/high halves of blocks 1..3. The kernel prologue stores
# x19-x24 to the stack before using them.
3730 my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# Same registers x19..x24 under "output" names (aliases of the input_* names above).
3731 my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# x6/x7 again: output_l0/output_h0 alias input_l0/input_h0.
3732 my ($output_l0,$output_h0)=map("x$_",(6..7));
# x9..x15: counter scratch (x9), the two 64-bit words loaded from the counter
# block (x10, x11), the counter word the code byte-reverses and increments
# (x12), the two halves of round key 14 (x13, x14; loaded with ldp from
# key offset #224 in the encrypt kernel) and the byte length (x15).
3735 my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11/x12 ($ctr96_t32x / $rctr32x).
3736 my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
# SIMD registers v0..v3 hold the four counter blocks and v4..v7 the four
# result blocks; the next four lines give the .16b, bare, d and q spellings
# of those same registers.
3738 my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3739 my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3740 my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
3741 my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
# v9..v11: GHASH accumulators (high, mid, low) in .16b, bare and d spellings.
3743 my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3744 my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3745 my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
# v12..v15: hash key values h1..h4; v16/v17: the h12k/h34k values the kernel
# builds with trn1/trn2/eor from h1..h4 and uses in the "mid" multiplies.
3747 my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3748 my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3749 my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
# v4..v7 once more: ctr_t0..ctr_t3 alias res0..res3 (same registers).
3772 my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
3773 my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
3774 my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
# v8 (d8 view): constant used by the MODULO reduction steps; the kernel
# rebuilds it with movi #0xc2 followed by shl #56. The prologue stores d8.
3776 my $mod_constantd="d8";
3777 my $mod_constant="v8";
# v18..v31: round keys rk0..rk13 in .16b and q spellings. Round key 14 is
# not kept in a vector register; it lives in $rk14_l/$rk14_h above.
3780 my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
3781 my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
3787 #########################################################################################
3788 # size_t aes_gcm_enc_256_kernel(const unsigned char *in,
3790 # unsigned char *out,
3792 # unsigned char ivec[16],
3796 .global aes_gcm_enc_256_kernel
3797 .type aes_gcm_enc_256_kernel,%function
3799 aes_gcm_enc_256_kernel:
3800 cbz x1, .L256_enc_ret
3801 stp x19, x20, [sp, #-112]!
3804 stp x21, x22, [sp, #16]
3805 stp x23, x24, [sp, #32]
3806 stp d8, d9, [sp, #48]
3807 stp d10, d11, [sp, #64]
3808 stp d12, d13, [sp, #80]
3809 stp d14, d15, [sp, #96]
3811 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
3812 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
3813 mov $len, $main_end_input_ptr
3814 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
3816 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
3817 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
3819 ldr $rk0q, [$cc, #0] @ load rk0
3820 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3822 ldr $rk7q, [$cc, #112] @ load rk7
3823 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3825 lsr $rctr32x, $ctr96_t32x, #32
3826 fmov $ctr2d, $ctr96_b64x @ CTR block 2
3827 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3829 rev $rctr32w, $rctr32w @ rev_ctr32
3830 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
3831 fmov $ctr1d, $ctr96_b64x @ CTR block 1
3833 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
3834 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
3836 rev $ctr32w, $rctr32w @ CTR block 1
3837 fmov $ctr3d, $ctr96_b64x @ CTR block 3
3839 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
3840 add $rctr32w, $rctr32w, #1 @ CTR block 1
3841 ldr $rk1q, [$cc, #16] @ load rk1
3843 fmov $ctr1.d[1], $ctr32x @ CTR block 1
3844 rev $ctr32w, $rctr32w @ CTR block 2
3845 add $rctr32w, $rctr32w, #1 @ CTR block 2
3847 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
3848 ldr $rk2q, [$cc, #32] @ load rk2
3850 fmov $ctr2.d[1], $ctr32x @ CTR block 2
3851 rev $ctr32w, $rctr32w @ CTR block 3
3853 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
3854 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
3856 fmov $ctr3.d[1], $ctr32x @ CTR block 3
3858 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
3859 ldr $rk3q, [$cc, #48] @ load rk3
3861 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
3862 ldr $rk6q, [$cc, #96] @ load rk6
3864 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
3865 ldr $rk5q, [$cc, #80] @ load rk5
3867 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
3868 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
3869 ext $h3b, $h3b, $h3b, #8
3871 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
3872 ldr $rk13q, [$cc, #208] @ load rk13
3874 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
3875 ldr $rk4q, [$cc, #64] @ load rk4
3877 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
3878 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
3879 ext $h2b, $h2b, $h2b, #8
3881 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
3882 ldr $rk12q, [$cc, #192] @ load rk12
3884 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
3885 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
3886 ext $h4b, $h4b, $h4b, #8
3888 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
3889 ldr $rk11q, [$cc, #176] @ load rk11
3891 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
3892 ldr $rk8q, [$cc, #128] @ load rk8
3894 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
3895 add $rctr32w, $rctr32w, #1 @ CTR block 3
3897 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
3898 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
3900 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
3901 ld1 { $acc_lb}, [$current_tag]
3902 ext $acc_lb, $acc_lb, $acc_lb, #8
3903 rev64 $acc_lb, $acc_lb
3905 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
3907 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
3909 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
3911 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
3913 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
3915 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
3917 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
3919 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
3921 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
3922 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
3924 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
3925 ldr $rk9q, [$cc, #144] @ load rk9
3927 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
3928 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
3929 ext $h1b, $h1b, $h1b, #8
3931 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
3932 ldr $rk10q, [$cc, #160] @ load rk10
3934 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
3935 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
3937 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
3939 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
3941 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
3942 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
3944 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
3946 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
3948 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
3950 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
3952 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
3954 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
3956 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
3958 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
3960 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
3962 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
3964 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
3966 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
3968 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
3970 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
3972 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
3974 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
3976 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
3977 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
3979 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
3981 aese $ctr2b, $rk13 @ AES block 2 - round 13
3982 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
3984 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
3986 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
3988 aese $ctr1b, $rk13 @ AES block 1 - round 13
3990 aese $ctr0b, $rk13 @ AES block 0 - round 13
3992 aese $ctr3b, $rk13 @ AES block 3 - round 13
3993 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
3994 b.ge .L256_enc_tail @ handle tail
3996 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext
3998 rev $ctr32w, $rctr32w @ CTR block 4
3999 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext
4001 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext
4003 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext
4004 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4006 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low
4007 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high
4009 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low
4010 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low
4012 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high
4013 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high
4014 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low
4016 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4017 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high
4018 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low
4020 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low
4021 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high
4023 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low
4024 add $rctr32w, $rctr32w, #1 @ CTR block 4
4026 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4027 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low
4028 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high
4030 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high
4032 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result
4033 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4035 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4036 rev $ctr32w, $rctr32w @ CTR block 5
4037 add $rctr32w, $rctr32w, #1 @ CTR block 5
4039 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result
4040 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4041 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4043 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4044 rev $ctr32w, $rctr32w @ CTR block 6
4045 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result
4047 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high
4048 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4049 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result
4051 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result
4053 add $rctr32w, $rctr32w, #1 @ CTR block 6
4054 fmov $ctr2d, $ctr96_b64x @ CTR block 6
4056 fmov $ctr2.d[1], $ctr32x @ CTR block 6
4057 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result
4058 rev $ctr32w, $rctr32w @ CTR block 7
4060 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7
4062 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result
4063 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result
4064 b.ge L256_enc_prepretail @ do prepretail
4066 .L256_enc_main_loop: @ main loop start
4067 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4068 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4070 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4071 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4073 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4074 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4076 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4077 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4079 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4080 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+7 - load plaintext
4082 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4083 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext
4085 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4086 eor $res0b, $res0b, $acc_lb @ PRE 1
4088 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4090 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4091 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low
4093 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4094 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4096 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4097 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high
4098 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4100 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4101 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4103 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4105 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4106 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4108 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4110 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4111 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4113 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4115 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4116 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4118 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4120 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4121 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4123 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4125 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4126 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4128 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4130 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4131 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4133 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4134 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4136 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4138 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4139 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4141 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4143 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4145 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4147 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4148 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4150 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4152 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4154 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4156 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4157 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4159 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4161 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4163 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4165 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4166 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4168 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4169 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext
4171 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4172 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4174 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4175 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4177 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4179 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4180 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4182 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4183 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low
4185 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4186 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4188 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4189 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low
4191 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4192 movi $mod_constant.8b, #0xc2
4194 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4195 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4196 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low
4198 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4199 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext
4201 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4202 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4204 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4205 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4207 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4209 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4210 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4212 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4213 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4215 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4216 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4218 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4219 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4221 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4222 rev $ctr32w, $rctr32w @ CTR block 4k+8
4223 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4225 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4226 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4228 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4229 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4231 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4232 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4234 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4235 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
4236 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid
4238 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4239 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high
4241 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4242 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high
4244 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4245 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
4247 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4248 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4249 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4251 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4252 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low
@ Closing part of one main-loop pass: last AES round for blocks 4k+5..4k+7,
@ XOR with the pre-keyed plaintext and store of four ciphertext blocks,
@ counter set-up for blocks 4k+8..4k+11 and the final GHASH modulo folds.
@ The LOOP CONTROL cmp sets the flags for the closing b.lt; none of the
@ instructions scheduled between them (fmov/eor/rev/add/orr/aese/st1/ext)
@ writes the condition flags.
4254 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4255 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high
4257 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low
4258 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
4260 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high
4262 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4263 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4264 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
4266 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
4267 rev $ctr32w, $rctr32w @ CTR block 4k+9
4268 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
4270 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result
4271 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
4272 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
4274 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4275 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
4277 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4278 rev $ctr32w, $rctr32w @ CTR block 4k+10
4279 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result
4281 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
4282 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4283 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high
4285 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4286 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result
4287 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
4289 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4290 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result
4291 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10
4293 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result
4294 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10
4295 rev $ctr32w, $rctr32w @ CTR block 4k+11
4297 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4298 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11
4300 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result
4301 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result
4302 b.lt .L256_enc_main_loop @ loop while input_ptr < main_end_input_ptr (local .L label, like every other branch target here)
4304 .L256_enc_prepretail: @ PREPRETAIL
@ Register-only section (no loads or stores): GHASHes the four result blocks
@ res0..res3 against h4..h1 (byte-reversed with rev64 first), reduces the
@ accumulated hash modulo the GCM polynomial, and runs AES rounds 0-13 on the
@ counter blocks ctr0..ctr3. The last round-key XOR is not done here; the
@ tail code that follows applies rk14 to the input words instead.
@ AES and GHASH instructions are interleaved, so statement order matters.
4305 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4306 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free)
4308 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
4309 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3
4311 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4312 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free)
4314 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3
4315 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4317 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
4319 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
4321 eor $res0b, $res0b, $acc_lb @ PRE 1
4322 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free)
4324 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
4326 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
4327 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
4329 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
4331 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
4332 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
4334 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
4336 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
4338 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
4339 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
4341 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
4343 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
4345 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
4347 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
4349 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
4351 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
4353 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
4355 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
4356 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
4358 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
4359 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
4361 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
4363 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
4364 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
4366 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
4367 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4369 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
4371 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
4372 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
4373 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3
4375 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
4377 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
4379 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
4380 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
4382 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
4384 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
4385 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
4387 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
4389 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
4390 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
4392 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
4394 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
4396 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
4398 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
4400 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
4402 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
4403 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
4405 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
4407 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
4409 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
4411 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
4412 movi $mod_constant.8b, #0xc2 @ mod_constant - every byte 0xc2; the shl below keeps 0xc2 in the top byte only
4414 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
4416 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
4417 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
4419 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
4421 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
4422 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4424 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
4425 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
4427 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
4429 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
4431 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
4433 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
4434 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
4436 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
4438 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up
4440 pmull $t1.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4441 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4443 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
4445 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
4446 eor $acc_mb, $acc_mb, $acc_lb @ karatsuba tidy up (low half)
4448 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
4450 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
4452 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
4454 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
4455 eor $acc_mb, $acc_mb, $t1.16b @ MODULO - fold into mid
4457 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
4459 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
4461 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
4463 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
4464 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4466 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
4468 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
4470 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
4472 pmull $t1.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4474 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
4475 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4477 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
4479 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
4480 eor $acc_lb, $acc_lb, $t1.16b @ MODULO - fold into low
4482 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
4484 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
4486 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
4488 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
4489 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4490 .L256_enc_tail: @ TAIL
@ Encrypts the first remaining block into res1 and dispatches on how many
@ bytes are left (more than 48, 32 or 16, otherwise the last-block path).
@ t0 keeps the half-swapped running hash so the first GHASHed tail block
@ can fold it in. The cmp/b.gt pairs are separated only by instructions
@ that leave the flags untouched (fmov, eor, plain sub).
4492 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
4493 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
4494 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext
4496 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low
4497 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high
4499 cmp $main_end_input_ptr, #48 @ more than 3 blocks (48 bytes) left?
4500 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low
4502 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high
4504 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result
4505 b.gt .L256_enc_blocks_more_than_3
4507 cmp $main_end_input_ptr, #32 @ more than 2 blocks (32 bytes) left?
4512 sub $rctr32w, $rctr32w, #1 @ rev_ctr32 -= 1 (presumably one prepared counter block stays unused; the value is written back to the counter at the end)
4516 b.gt .L256_enc_blocks_more_than_2
4519 sub $rctr32w, $rctr32w, #1 @ rev_ctr32 -= 1 again on the two-blocks-or-fewer path
4520 cmp $main_end_input_ptr, #16 @ more than 1 block (16 bytes) left?
4522 b.gt .L256_enc_blocks_more_than_1
4524 sub $rctr32w, $rctr32w, #1 @ rev_ctr32 -= 1 again when only the final block remains
4525 b .L256_enc_blocks_less_than_1
4526 .L256_enc_blocks_more_than_3: @ blocks left > 3
@ Stores the already encrypted final-3 block, starts the GHASH accumulators
@ (acc_l/acc_m/acc_h are written, not accumulated into) from that block with
@ h4, and encrypts the final-2 block with ctr1 before falling through.
@ NOTE(review): the rk4 name is used here as a plain scratch register for the
@ karatsuba mid term - confirm round key 4 is not needed afterwards.
4527 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result
4529 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high
4531 rev64 $res0b, $res1b @ GHASH final-3 block
4533 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low
4534 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4536 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high
4538 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
4539 fmov $res1d, $input_l0 @ AES final-2 block - mov low
4541 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high
4543 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
4544 movi $t0.8b, #0 @ suppress further partial tag feed in
4546 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
4548 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
4550 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
4552 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
4553 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result
4554 .L256_enc_blocks_more_than_2: @ blocks left > 2
@ Stores the final-2 ciphertext block, accumulates its GHASH contribution
@ (multiplied by h3, mid term via the low half of h34k) into
@ acc_l/acc_m/acc_h, and encrypts the final-1 block with ctr2.
@ t0 is XORed in and then cleared, so the running tag is fed in only once
@ regardless of which tail label was entered first.
4556 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result
4558 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high
4560 rev64 $res0b, $res1b @ GHASH final-2 block
4562 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low
4563 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4565 fmov $res1d, $input_l0 @ AES final-1 block - mov low
4566 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high
4568 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high
4570 movi $t0.8b, #0 @ suppress further partial tag feed in
4572 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
4573 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
4575 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
4577 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
4579 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result
4581 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
4583 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
4585 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
4587 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
4588 .L256_enc_blocks_more_than_1: @ blocks left > 1
@ Stores the final-1 ciphertext block, accumulates its GHASH contribution
@ (multiplied by h2, mid term via the high half of h12k, hence the ins that
@ duplicates the folded value into lane 1 before pmull2), and encrypts the
@ final block with ctr3.
4590 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result
4592 rev64 $res0b, $res1b @ GHASH final-1 block
4594 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high
4596 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4598 movi $t0.8b, #0 @ suppress further partial tag feed in
4600 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low
4601 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
4603 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
4604 eor $input_h0, $input_h0, $rk14_h @ AES final block - round 14 high
4606 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
4608 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
4610 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
4611 fmov $res1d, $input_l0 @ AES final block - mov low
4613 fmov $res1.d[1], $input_h0 @ AES final block - mov high
4615 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
4617 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
4619 eor $res1b, $res1b, $ctr3b @ AES final block - result
4620 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
4622 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
4623 .L256_enc_blocks_less_than_1: @ blocks left <= 1
@ Final, possibly partial, block: builds a 128-bit mask from the bit length,
@ zeroes the unused bits of the result before hashing it, merges the bytes
@ already present at the output location into the masked-off part before
@ the 16-byte store, finishes GHASH (h1) plus the modulo reduction, writes
@ back the 32-bit counter word and the updated tag, and restores the saved
@ registers. The rk14_l/rk14_h and rk0 registers are overwritten here as
@ scratch, after their last use as key material in this kernel.
4625 and $bit_length, $bit_length, #127 @ bit_length %= 128
4627 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
4628 sub $bit_length, $bit_length, #128 @ bit_length -= 128
4630 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
4631 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored
4633 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
4634 and $bit_length, $bit_length, #127 @ bit_length %= 128
4636 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
4637 cmp $bit_length, #64
4639 csel $input_l0, $rk14_l, $rk14_h, lt @ low 64b mask: all ones if fewer than 64 bits are dropped, else the shifted mask (register lsr uses the shift amount modulo 64)
4640 csel $input_h0, $rk14_h, xzr, lt @ high 64b mask: the shifted mask if fewer than 64 bits are dropped, else zero
4642 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block
4644 fmov $ctr0.d[1], $input_h0
4646 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
4648 rev64 $res0b, $res1b @ GHASH final block
4650 eor $res0b, $res0b, $t0.16b @ feed in partial tag
4652 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing
4654 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
4655 mov $t0d, $res0.d[1] @ GHASH final block - mid
4656 rev $ctr32w, $rctr32w @ byte-reverse the running counter for the store to [counter, #12] below
4658 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
4660 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
4661 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
4663 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
4665 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
4667 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
4668 movi $mod_constant.8b, #0xc2 @ mod_constant - every byte 0xc2; the shl below keeps 0xc2 in the top byte only
4670 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
4672 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
4674 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
4676 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
4678 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
4680 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
4682 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
4684 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
4686 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
4688 str $ctr32w, [$counter, #12] @ store the updated counter
4690 st1 { $res1b}, [$output_ptr] @ store all 16B
4691 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low
4693 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
4694 ext $acc_lb, $acc_lb, $acc_lb, #8 @ swap the 64-bit halves of the reduced hash
4695 rev64 $acc_lb, $acc_lb @ byte-reverse each 64-bit half
4697 st1 { $acc_l.16b }, [$current_tag] @ write the updated tag back
4699 ldp x21, x22, [sp, #16] @ reload x21/x22 from the stack frame
4700 ldp x23, x24, [sp, #32] @ reload x23/x24
4701 ldp d8, d9, [sp, #48] @ reload d8/d9
4702 ldp d10, d11, [sp, #64] @ reload d10/d11
4703 ldp d12, d13, [sp, #80] @ reload d12/d13
4704 ldp d14, d15, [sp, #96] @ reload d14/d15
4705 ldp x19, x20, [sp], #112 @ reload x19/x20 and release the 112-byte frame
4711 .size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
4719 #########################################################################################
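4720 # size_t aes_gcm_dec_256_kernel(const unsigned char *in,
# size_t len,            /* length in bits; the kernel shifts it right by 3 to get bytes */
4722 # unsigned char *out,
# const void *key,
4724 # unsigned char ivec[16],
# u64 *Xi);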
4728 .global aes_gcm_dec_256_kernel
4729 .type aes_gcm_dec_256_kernel,%function
4731 aes_gcm_dec_256_kernel:
4732 cbz x1, .L256_dec_ret
4733 stp x19, x20, [sp, #-112]!
4736 stp x21, x22, [sp, #16]
4737 stp x23, x24, [sp, #32]
4738 stp d8, d9, [sp, #48]
4739 stp d10, d11, [sp, #64]
4740 stp d12, d13, [sp, #80]
4741 stp d14, d15, [sp, #96]
4743 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len
4744 mov $len, $main_end_input_ptr
4745 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32
4747 ldr $rk8q, [$cc, #128] @ load rk8
4748 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1
4750 ldr $rk7q, [$cc, #112] @ load rk7
4751 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4753 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr
4754 ldr $rk6q, [$cc, #96] @ load rk6
4756 lsr $rctr32x, $ctr96_t32x, #32
4757 ldr $rk5q, [$cc, #80] @ load rk5
4758 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4760 ldr $rk3q, [$cc, #48] @ load rk3
4761 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4762 rev $rctr32w, $rctr32w @ rev_ctr32
4764 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32
4765 fmov $ctr3d, $ctr96_b64x @ CTR block 3
4767 rev $ctr32w, $rctr32w @ CTR block 1
4768 add $rctr32w, $rctr32w, #1 @ CTR block 1
4769 fmov $ctr1d, $ctr96_b64x @ CTR block 1
4771 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1
4772 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible
4774 fmov $ctr1.d[1], $ctr32x @ CTR block 1
4775 rev $ctr32w, $rctr32w @ CTR block 2
4776 add $rctr32w, $rctr32w, #1 @ CTR block 2
4778 fmov $ctr2d, $ctr96_b64x @ CTR block 2
4779 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2
4781 fmov $ctr2.d[1], $ctr32x @ CTR block 2
4782 rev $ctr32w, $rctr32w @ CTR block 3
4784 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3
4785 ldr $rk0q, [$cc, #0] @ load rk0
4787 fmov $ctr3.d[1], $ctr32x @ CTR block 3
4788 add $rctr32w, $rctr32w, #1 @ CTR block 3
4790 ldr $rk4q, [$cc, #64] @ load rk4
4792 ldr $rk13q, [$cc, #208] @ load rk13
4794 ldr $rk1q, [$cc, #16] @ load rk1
4796 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0
4797 ldr $h3q, [$current_tag, #80] @ load h3l | h3h
4798 ext $h3b, $h3b, $h3b, #8
4800 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0
4801 ldr $h4q, [$current_tag, #112] @ load h4l | h4h
4802 ext $h4b, $h4b, $h4b, #8
4804 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0
4805 ldr $h2q, [$current_tag, #64] @ load h2l | h2h
4806 ext $h2b, $h2b, $h2b, #8
4808 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0
4809 ldr $rk2q, [$cc, #32] @ load rk2
4811 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1
4812 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14
4814 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1
4815 ld1 { $acc_lb}, [$current_tag]
4816 ext $acc_lb, $acc_lb, $acc_lb, #8
4817 rev64 $acc_lb, $acc_lb
4819 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1
4820 ldr $rk9q, [$cc, #144] @ load rk9
4822 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1
4823 ldr $rk12q, [$cc, #192] @ load rk12
4825 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2
4826 ldr $h1q, [$current_tag, #32] @ load h1l | h1h
4827 ext $h1b, $h1b, $h1b, #8
4829 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2
4830 ldr $rk10q, [$cc, #160] @ load rk10
4832 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2
4834 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3
4836 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2
4838 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3
4840 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4
4841 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks
4843 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3
4845 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3
4847 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4
4849 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4
4851 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4
4853 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5
4855 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5
4857 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5
4859 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5
4861 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6
4863 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6
4865 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6
4867 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6
4869 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7
4871 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7
4873 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7
4875 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8
4877 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7
4879 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8
4881 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8
4883 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9
4885 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8
4886 ldr $rk11q, [$cc, #176] @ load rk11
4888 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9
4890 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10
4892 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9
4894 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10
4896 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9
4898 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10
4900 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11
4902 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10
4904 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11
4906 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11
4908 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11
4910 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h
4912 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l
4914 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h
4915 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l
4917 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12
4919 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12
4921 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12
4923 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12
4924 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k
4926 aese $ctr1b, $rk13 @ AES block 1 - round 13
4928 aese $ctr2b, $rk13 @ AES block 2 - round 13
4929 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k
4931 aese $ctr3b, $rk13 @ AES block 3 - round 13
4933 aese $ctr0b, $rk13 @ AES block 0 - round 13
4934 b.ge .L256_dec_tail @ handle tail
4936 ldr $res0q, [$input_ptr, #0] @ AES block 0 - load ciphertext
4938 ldr $res1q, [$input_ptr, #16] @ AES block 1 - load ciphertext
4940 rev $ctr32w, $rctr32w @ CTR block 4
4942 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result
4944 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result
4945 rev64 $res1b, $res1b @ GHASH block 1
4946 ldr $res3q, [$input_ptr, #48] @ AES block 3 - load ciphertext
4948 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high
4950 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low
4951 rev64 $res0b, $res0b @ GHASH block 0
4952 add $rctr32w, $rctr32w, #1 @ CTR block 4
4954 fmov $ctr0d, $ctr96_b64x @ CTR block 4
4955 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4
4957 fmov $ctr0.d[1], $ctr32x @ CTR block 4
4958 rev $ctr32w, $rctr32w @ CTR block 5
4959 add $rctr32w, $rctr32w, #1 @ CTR block 5
4961 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low
4963 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5
4964 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high
4965 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high
4967 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low
4968 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result
4969 fmov $ctr1d, $ctr96_b64x @ CTR block 5
4971 ldr $res2q, [$input_ptr, #32] @ AES block 2 - load ciphertext
4972 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
4974 fmov $ctr1.d[1], $ctr32x @ CTR block 5
4975 rev $ctr32w, $rctr32w @ CTR block 6
4976 add $rctr32w, $rctr32w, #1 @ CTR block 6
4978 eor $output_l1, $output_l1, $rk14_l @ AES block 1 - round 14 low
4979 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6
4981 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high
4982 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result
4984 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result
4985 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks
4986 b.ge .L256_dec_prepretail @ do prepretail
4988 .L256_dec_main_loop: @ main loop start
4989 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
4990 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
4991 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
4993 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
4994 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
4996 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
4997 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
4999 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5000 eor $res0b, $res0b, $acc_lb @ PRE 1
5001 rev $ctr32w, $rctr32w @ CTR block 4k+7
5003 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5004 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5006 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5007 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5009 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5010 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5011 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5013 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5014 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5016 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5017 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5019 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5020 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5022 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5023 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5025 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5026 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5028 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5029 rev64 $res2b, $res2b @ GHASH block 4k+2
5031 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5032 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5034 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5035 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5037 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5039 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5041 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5042 rev64 $res3b, $res3b @ GHASH block 4k+3
5044 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5045 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5047 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5048 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5049 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5051 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5053 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5054 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5056 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5057 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5059 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5060 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5062 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5063 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5065 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5066 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5068 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5070 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5071 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5073 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5075 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5076 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5078 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5079 rev $ctr32w, $rctr32w @ CTR block 4k+8
5081 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5082 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5084 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5085 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8
5087 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5089 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5090 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5092 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5094 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5095 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5097 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5099 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5101 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5102 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5104 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5106 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5107 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8
5108 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5110 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5112 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5113 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5115 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5117 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5118 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5120 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5122 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5123 movi $mod_constant.8b, #0xc2
5125 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5126 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5128 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5130 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5131 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5133 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5134 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5136 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5138 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5139 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5141 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5142 ldr $res0q, [$input_ptr, #0] @ AES block 4k+4 - load ciphertext
5144 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5145 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5147 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5148 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5150 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5151 ldr $res1q, [$input_ptr, #16] @ AES block 4k+5 - load ciphertext
5153 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5154 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result
5156 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5157 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5159 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5160 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5162 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5163 ldr $res3q, [$input_ptr, #48] @ AES block 4k+7 - load ciphertext
5165 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5166 ldr $res2q, [$input_ptr, #32] @ AES block 4k+6 - load ciphertext
5168 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5169 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5171 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5172 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5174 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5175 add $input_ptr, $input_ptr, #64 @ AES input_ptr update
5176 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5178 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5179 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8
5181 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5182 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8
5184 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5185 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result
5186 rev $ctr32w, $rctr32w @ CTR block 4k+9
5188 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5189 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9
5190 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL
5192 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9
5194 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5195 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5197 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high
5198 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result
5199 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5201 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5202 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low
5204 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9
5205 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5207 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9
5208 rev $ctr32w, $rctr32w @ CTR block 4k+10
5209 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10
5211 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5212 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10
5214 rev64 $res1b, $res1b @ GHASH block 4k+5
5215 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high
5216 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result
5218 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low
5219 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result
5221 rev64 $res0b, $res0b @ GHASH block 4k+4
5222 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5223 b.lt .L256_dec_main_loop
5226 .L256_dec_prepretail: @ PREPRETAIL
5227 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0
5228 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low
5229 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result
5231 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0
5232 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high
5234 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0
5235 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6
5237 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6
5238 rev $ctr32w, $rctr32w @ CTR block 4k+7
5239 eor $res0b, $res0b, $acc_lb @ PRE 1
5241 rev64 $res2b, $res2b @ GHASH block 4k+2
5242 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7
5243 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low
5245 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1
5246 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high
5248 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low
5249 mov $t0d, $res0.d[1] @ GHASH block 4k - mid
5250 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7
5252 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high
5253 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7
5255 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0
5256 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid
5258 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1
5259 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid
5261 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high
5263 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1
5264 rev64 $res3b, $res3b @ GHASH block 4k+3
5266 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0
5268 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid
5269 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high
5271 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low
5273 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1
5274 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid
5276 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2
5278 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2
5279 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low
5281 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2
5283 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3
5284 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid
5286 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2
5287 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid
5289 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low
5291 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4
5293 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3
5294 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid
5296 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid
5298 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5
5299 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low
5301 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4
5303 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high
5304 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid
5306 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high
5308 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5
5309 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid
5311 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3
5313 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3
5314 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high
5316 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low
5318 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4
5319 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid
5321 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4
5323 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid
5325 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5
5326 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid
5328 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5
5330 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6
5331 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid
5333 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6
5335 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6
5336 movi $mod_constant.8b, #0xc2
5338 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6
5339 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low
5341 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid
5343 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7
5344 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high
5346 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7
5348 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7
5349 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid
5351 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8
5353 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7
5354 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5356 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8
5358 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8
5359 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5361 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8
5363 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9
5364 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5366 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5368 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9
5369 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5371 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9
5373 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9
5374 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5376 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10
5378 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10
5380 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10
5381 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high
5383 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10
5384 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low
5386 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11
5387 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5389 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11
5390 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7
5392 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11
5393 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low
5395 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12
5397 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5398 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high
5400 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11
5401 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result
5403 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12
5404 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5406 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12
5407 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result
5409 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12
5410 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5412 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13
5414 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13
5416 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13
5418 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13
5419 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5420 .L256_dec_tail: @ TAIL
5422 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process
5423 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext
5425 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result
5427 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low
5429 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high
5430 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag
5432 cmp $main_end_input_ptr, #48
5434 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low
5436 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high
5437 b.gt .L256_dec_blocks_more_than_3
5439 sub $rctr32w, $rctr32w, #1
5444 cmp $main_end_input_ptr, #32
5448 b.gt .L256_dec_blocks_more_than_2
5450 sub $rctr32w, $rctr32w, #1
5453 cmp $main_end_input_ptr, #16
5454 b.gt .L256_dec_blocks_more_than_1
5456 sub $rctr32w, $rctr32w, #1
5457 b .L256_dec_blocks_less_than_1
5458 .L256_dec_blocks_more_than_3: @ blocks left > 3
5459 rev64 $res0b, $res1b @ GHASH final-3 block
5460 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext
5462 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result
5464 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid
5466 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5468 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result
5470 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid
5472 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low
5474 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high
5476 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid
5478 movi $t0.8b, #0 @ suppress further partial tag feed in
5480 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high
5482 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid
5483 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low
5485 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low
5486 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high
5487 .L256_dec_blocks_more_than_2: @ blocks left > 2
5489 rev64 $res0b, $res1b @ GHASH final-2 block
5490 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext
5492 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5493 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result
5495 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result
5497 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid
5499 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low
5501 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high
5503 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid
5504 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low
5506 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high
5507 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low
5508 movi $t0.8b, #0 @ suppress further partial tag feed in
5510 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid
5512 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high
5513 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low
5515 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid
5516 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high
5517 .L256_dec_blocks_more_than_1: @ blocks left > 1
5519 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result
5520 rev64 $res0b, $res1b @ GHASH final-1 block
5522 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext
5524 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5525 movi $t0.8b, #0 @ suppress further partial tag feed in
5527 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid
5529 eor $ctr0b, $res1b, $ctr3b @ AES final block - result
5531 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high
5533 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid
5535 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low
5536 mov $output_l0, $ctr0.d[0] @ AES final block - mov low
5538 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid
5540 mov $output_h0, $ctr0.d[1] @ AES final block - mov high
5542 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid
5543 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low
5545 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low
5547 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high
5549 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid
5550 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high
5551 .L256_dec_blocks_less_than_1: @ blocks left <= 1
5553 and $bit_length, $bit_length, #127 @ bit_length %= 128
5554 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff
5556 sub $bit_length, $bit_length, #128 @ bit_length -= 128
5557 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff
5559 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5560 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128])
5562 and $bit_length, $bit_length, #127 @ bit_length %= 128
5564 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block
5565 cmp $bit_length, #64
5567 csel $ctr32x, $rk14_l, $rk14_h, lt
5568 csel $ctr96_b64x, $rk14_h, xzr, lt
5570 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block
5571 and $output_l0, $output_l0, $ctr32x
5573 mov $ctr0.d[1], $ctr96_b64x
5574 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes
5576 rev $ctr32w, $rctr32w
5578 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
5580 orr $output_l0, $output_l0, $end_input_ptr
5582 and $output_h0, $output_h0, $ctr96_b64x
5584 orr $output_h0, $output_h0, $main_end_input_ptr
5586 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits
5588 rev64 $res0b, $res1b @ GHASH final block
5590 eor $res0b, $res0b, $t0.16b @ feed in partial tag
5592 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low
5594 mov $t0d, $res0.d[1] @ GHASH final block - mid
5596 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid
5598 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high
5600 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid
5602 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high
5604 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low
5606 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid
5607 movi $mod_constant.8b, #0xc2
5609 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up
5611 shl $mod_constantd, $mod_constantd, #56 @ mod_constant
5613 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up
5615 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid
5617 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment
5619 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid
5621 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid
5623 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low
5625 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment
5627 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low
5629 stp $output_l0, $output_h0, [$output_ptr]
5631 str $ctr32w, [$counter, #12] @ store the updated counter
5633 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low
5634 ext $acc_lb, $acc_lb, $acc_lb, #8
5635 rev64 $acc_lb, $acc_lb
5637 st1 { $acc_l.16b }, [$current_tag]
5639 ldp x21, x22, [sp, #16]
5640 ldp x23, x24, [sp, #32]
5641 ldp d8, d9, [sp, #48]
5642 ldp d10, d11, [sp, #64]
5643 ldp d12, d13, [sp, #80]
5644 ldp d14, d15, [sp, #96]
5645 ldp x19, x20, [sp], #112
5651 .size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
5657 .asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
# Post-process the assembly text accumulated in $code, one line at a time.
# The branch is chosen by whether $flavour contains 64.
5662 if ($flavour =~ /64/) { ######## 64-bit code
# Rewrite an operand pair of the form qN#lo|hi, qM#lo|hi held in $arg as an
# ins between 64-bit lanes of v registers: register numbers below 8 are kept
# and the others are raised by 8, while lo selects lane 0 and hi lane 1.
5666 $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
5667 sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
5668 $3<8?$3:$3+8,($4 eq "lo")?0:1;
# Turn the first @ that is followed by whitespace on each line into //.
5670 foreach(split("\n",$code)) {
5671 s/@\s/\/\//o; # old->new style commentary
5674 } else { ######## 32-bit code
# Rewrite an operand of the form qD, qS[i] held in $arg as a vdup.32 that
# reads from a d register: lane i of qS becomes lane i&1 of d(2*S + (i>>1)).
5678 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
5679 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
# Takes a mnemonic and an operand string; the code below handles operand
# strings that name three q registers.
5682 my ($mnemonic,$arg)=@_;
5684 if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
# Build the 32-bit instruction word on top of the base 0xf2a00e00: the low
# three bits and bit 3 of each register number go into separate bit fields.
5685 my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
5686 |(($2&7)<<17)|(($2&8)<<4)
5687 |(($3&7)<<1) |(($3&8)<<2);
# A mnemonic containing a 2 additionally sets bits 0 and 16 of the word.
5688 $word |= 0x00010001 if ($mnemonic =~ "2");
5689 # since ARMv7 instructions are always encoded little-endian.
5690 # correct solution is to use .inst directive, but older
5691 # assemblers don't implement it:-(
# Emit the word as four bytes, least significant first, with a trailing
# @ comment.
5692 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
5693 $word&0xff,($word>>8)&0xff,
5694 ($word>>16)&0xff,($word>>24)&0xff,
# Translate each line of $code back to the older 32-bit syntax.
5699 foreach(split("\n",$code)) {
5700 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
5701 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
5702 s/\/\/\s?/@ /o; # new->old style commentary
5704 # fix up remaining new-style suffixes
# The substitutions below are joined with or, so evaluation stops at the
# first one that matches; those carrying the e flag evaluate their
# replacement as Perl code.
5707 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
5708 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
5709 s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo or
5710 s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo or
5711 s/^(\s+)b\./$1b/o or
5712 s/^(\s+)ret/$1bx\tlr/o;
# A mov.SUFFIX at the start of a line (after whitespace) loses its dot.
5714 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
5722 close STDOUT or die "error closing STDOUT"; # enforce flush