2 # Copyright 2014-2023 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It also supports both 32- and 64-bit modes
20 # of operation. The latter is achieved by limiting the amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Derive the directory this script lives in ($dir keeps the trailing path
# separator) so the perlasm translator can be located relative to it.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for arm-xlate.pl next to this script first, then in the canonical
# ../../perlasm location of an OpenSSL source tree; die if neither exists.
65 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67 die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through arm-xlate.pl, forwarding $flavour
# and $output as its arguments ($^X is the running perl interpreter).
69 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Integer register aliases for the key-schedule routines: input key pointer,
# key size in bits, output schedule pointer, rcon-table pointer, round counter.
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON temporaries: q0-q6 in 64-bit mode; in 32-bit mode q4-q6 are replaced by
# q8-q10 — presumably to stay clear of the callee-saved d8-d15 range that the
# 32-bit entry points otherwise have to spill ("ABI specification says so").
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 AARCH64_VALID_CALL_TARGET
124 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
125 stp x29,x30,[sp,#-16]!
145 veor $zero,$zero,$zero
146 vld1.8 {$in0},[$inp],#16
147 mov $bits,#8 // reuse $bits
148 vld1.32 {$rcon,$mask},[$ptr],#32
156 vtbl.8 $key,{$in0},$mask
157 vext.8 $tmp,$zero,$in0,#12
158 vst1.32 {$in0},[$out],#16
163 vext.8 $tmp,$zero,$tmp,#12
165 vext.8 $tmp,$zero,$tmp,#12
168 vshl.u8 $rcon,$rcon,#1
172 vld1.32 {$rcon},[$ptr]
174 vtbl.8 $key,{$in0},$mask
175 vext.8 $tmp,$zero,$in0,#12
176 vst1.32 {$in0},[$out],#16
180 vext.8 $tmp,$zero,$tmp,#12
182 vext.8 $tmp,$zero,$tmp,#12
185 vshl.u8 $rcon,$rcon,#1
188 vtbl.8 $key,{$in0},$mask
189 vext.8 $tmp,$zero,$in0,#12
190 vst1.32 {$in0},[$out],#16
194 vext.8 $tmp,$zero,$tmp,#12
196 vext.8 $tmp,$zero,$tmp,#12
200 vst1.32 {$in0},[$out]
208 vld1.8 {$in1},[$inp],#8
209 vmov.i8 $key,#8 // borrow $key
210 vst1.32 {$in0},[$out],#16
211 vsub.i8 $mask,$mask,$key // adjust the mask
214 vtbl.8 $key,{$in1},$mask
215 vext.8 $tmp,$zero,$in0,#12
217 vst1.32 {$in1},[$out],#16
220 vst1.32 {$in1},[$out],#8
226 vext.8 $tmp,$zero,$tmp,#12
228 vext.8 $tmp,$zero,$tmp,#12
231 vdup.32 $tmp,${in0}[3]
234 vext.8 $in1,$zero,$in1,#12
235 vshl.u8 $rcon,$rcon,#1
239 vst1.32 {$in0},[$out],#16
251 vst1.32 {$in0},[$out],#16
254 vtbl.8 $key,{$in1},$mask
255 vext.8 $tmp,$zero,$in0,#12
256 vst1.32 {$in1},[$out],#16
261 vext.8 $tmp,$zero,$tmp,#12
263 vext.8 $tmp,$zero,$tmp,#12
266 vshl.u8 $rcon,$rcon,#1
268 vst1.32 {$in0},[$out],#16
271 vdup.32 $key,${in0}[3] // just splat
272 vext.8 $tmp,$zero,$in1,#12
276 vext.8 $tmp,$zero,$tmp,#12
278 vext.8 $tmp,$zero,$tmp,#12
289 mov x0,$ptr // return value
290 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
292 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
294 .globl ${prefix}_set_decrypt_key
295 .type ${prefix}_set_decrypt_key,%function
297 ${prefix}_set_decrypt_key:
299 $code.=<<___ if ($flavour =~ /64/);
300 AARCH64_SIGN_LINK_REGISTER
301 stp x29,x30,[sp,#-16]!
304 $code.=<<___ if ($flavour !~ /64/);
313 sub $out,$out,#240 // restore original $out
315 add $inp,$out,x12,lsl#4 // end of key schedule
317 vld1.32 {v0.16b},[$out]
318 vld1.32 {v1.16b},[$inp]
319 vst1.32 {v0.16b},[$inp],x4
320 vst1.32 {v1.16b},[$out],#16
323 vld1.32 {v0.16b},[$out]
324 vld1.32 {v1.16b},[$inp]
327 vst1.32 {v0.16b},[$inp],x4
328 vst1.32 {v1.16b},[$out],#16
332 vld1.32 {v0.16b},[$out]
334 vst1.32 {v0.16b},[$inp]
336 eor x0,x0,x0 // return value
339 $code.=<<___ if ($flavour !~ /64/);
342 $code.=<<___ if ($flavour =~ /64/);
344 AARCH64_VALIDATE_LINK_REGISTER
348 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
354 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
355 my ($inp,$out,$key)=map("x$_",(0..2));
357 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
360 .globl ${prefix}_${dir}crypt
361 .type ${prefix}_${dir}crypt,%function
363 ${prefix}_${dir}crypt:
365 $code.=<<___ if ($flavour =~ /64/);
366 AARCH64_VALID_CALL_TARGET
369 ldr $rounds,[$key,#240]
370 vld1.32 {$rndkey0},[$key],#16
371 vld1.8 {$inout},[$inp]
372 sub $rounds,$rounds,#2
373 vld1.32 {$rndkey1},[$key],#16
376 aes$e $inout,$rndkey0
378 vld1.32 {$rndkey0},[$key],#16
379 subs $rounds,$rounds,#2
380 aes$e $inout,$rndkey1
382 vld1.32 {$rndkey1},[$key],#16
385 aes$e $inout,$rndkey0
387 vld1.32 {$rndkey0},[$key]
388 aes$e $inout,$rndkey1
389 veor $inout,$inout,$rndkey0
391 vst1.8 {$inout},[$out]
393 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
400 # Performance in cycles per byte.
401 # Processed with AES-ECB different key size.
402 # It shows the value before and after optimization as below:
405 # AES-128-ECB AES-192-ECB AES-256-ECB
406 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
407 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
409 # Optimization is implemented by loop unrolling and interleaving.
410 # Commonly, we choose the unrolling factor as 5, if the input
411 # data size smaller than 5 blocks, but not smaller than 3 blocks,
412 # choose 3 as the unrolling factor.
413 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
414 # as one iteration, every loop the left size lsize -= 5*16.
415 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
416 # every loop lsize -=3*16.
417 # If lsize < 3*16 bytes, treat them as the tail, interleave the
418 # two blocks AES instructions.
419 # There is one special case, if the original input data size dsize
420 # = 16 bytes, we will treat it separately to improve the
421 # performance: one independent code block without LR, FP load and
422 # store, just looks like what the original ECB implementation does.
425 my ($inp,$out,$len,$key)=map("x$_",(0..3));
426 my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
427 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
429 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
431 ### q7 last round key
432 ### q10-q15 q7 Last 7 round keys
433 ### q8-q9 preloaded round keys except last 7 keys for big size
434 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
437 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
439 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
440 my ($dat4,$in4,$tmp4);
441 if ($flavour =~ /64/) {
442 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
446 .globl ${prefix}_ecb_encrypt
447 .type ${prefix}_ecb_encrypt,%function
449 ${prefix}_ecb_encrypt:
451 $code.=<<___ if ($flavour =~ /64/);
452 AARCH64_VALID_CALL_TARGET
454 // Original input data size bigger than 16, jump to big size processing.
456 vld1.8 {$dat0},[$inp]
457 cmp $enc,#0 // en- or decrypting?
458 ldr $rounds,[$key,#240]
459 vld1.32 {q5-q6},[$key],#32 // load key schedule...
464 vld1.32 {q8-q9},[$key],#32 // load key schedule...
467 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
472 vld1.32 {q8},[$key],#16 // load key schedule...
475 vld1.32 {q9},[$key],#16 // load key schedule...
476 subs $rounds,$rounds,#2 // bias
477 b.gt .Lecb_round_loop
479 vld1.32 {q10-q11},[$key],#32 // load key schedule...
484 vld1.32 {q12-q13},[$key],#32 // load key schedule...
489 vld1.32 {q14-q15},[$key],#32 // load key schedule...
494 vld1.32 {$rndlast},[$key]
498 veor $dat0,$dat0,$rndlast
499 vst1.8 {$dat0},[$out]
504 vld1.32 {q8-q9},[$key],#32 // load key schedule...
507 subs $rounds,$rounds,#10 // bias
509 .Lecb_dec_round_loop:
512 vld1.32 {q8},[$key],#16 // load key schedule...
515 vld1.32 {q9},[$key],#16 // load key schedule...
516 subs $rounds,$rounds,#2 // bias
517 b.gt .Lecb_dec_round_loop
519 vld1.32 {q10-q11},[$key],#32 // load key schedule...
524 vld1.32 {q12-q13},[$key],#32 // load key schedule...
529 vld1.32 {q14-q15},[$key],#32 // load key schedule...
534 vld1.32 {$rndlast},[$key]
538 veor $dat0,$dat0,$rndlast
539 vst1.8 {$dat0},[$out]
543 $code.=<<___ if ($flavour =~ /64/);
544 stp x29,x30,[sp,#-16]!
547 $code.=<<___ if ($flavour !~ /64/);
550 vstmdb sp!,{d8-d15} @ ABI specification says so
551 ldmia ip,{r4-r5} @ load remaining args
559 cmp $enc,#0 // en- or decrypting?
560 ldr $rounds,[$key,#240]
562 vld1.8 {$dat},[$inp],$step
564 vld1.32 {q8-q9},[$key] // load key schedule...
565 sub $rounds,$rounds,#6
566 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
567 sub $rounds,$rounds,#2
568 vld1.32 {q10-q11},[$key_],#32
569 vld1.32 {q12-q13},[$key_],#32
570 vld1.32 {q14-q15},[$key_],#32
571 vld1.32 {$rndlast},[$key_]
577 vld1.8 {$dat1},[$inp],#16
578 subs $len,$len,#32 // bias
580 vorr $in1,$dat1,$dat1
581 vorr $dat2,$dat1,$dat1
586 vld1.8 {$dat2},[$inp],#16
588 $code.=<<___ if ($flavour =~ /64/);
592 vld1.8 {$dat3},[$inp],#16
593 vld1.8 {$dat4},[$inp],#16
594 sub $len,$len,#32 // bias
608 vld1.32 {q8},[$key_],#16
620 vld1.32 {q9},[$key_],#16
633 cmp $len,#0x40 // because .Lecb_enc_tail4x
646 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
659 add $inp,$inp,x6 // $inp is adjusted in such way that
660 // at exit from the loop $dat1-$dat4
661 // are loaded with last "words"
662 add x6,$len,#0x60 // because .Lecb_enc_tail4x
709 vld1.8 {$in0},[$inp],#16
711 vld1.8 {$in1},[$inp],#16
713 vld1.8 {$in2},[$inp],#16
715 vld1.8 {$in3},[$inp],#16
717 vld1.8 {$in4},[$inp],#16
718 cbz x6,.Lecb_enc_tail4x
719 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
720 veor $tmp0,$rndlast,$dat0
722 veor $tmp1,$rndlast,$dat1
724 veor $tmp2,$rndlast,$dat2
726 veor $tmp3,$rndlast,$dat3
728 veor $tmp4,$rndlast,$dat4
729 vst1.8 {$tmp0},[$out],#16
731 vst1.8 {$tmp1},[$out],#16
733 vst1.8 {$tmp2},[$out],#16
734 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
735 vst1.8 {$tmp3},[$out],#16
736 vst1.8 {$tmp4},[$out],#16
753 veor $tmp1,$rndlast,$dat1
754 veor $tmp2,$rndlast,$dat2
755 veor $tmp3,$rndlast,$dat3
756 veor $tmp4,$rndlast,$dat4
757 vst1.8 {$tmp1},[$out],#16
758 vst1.8 {$tmp2},[$out],#16
759 vst1.8 {$tmp3},[$out],#16
760 vst1.8 {$tmp4},[$out],#16
773 vld1.32 {q8},[$key_],#16
781 vld1.32 {q9},[$key_],#16
791 mov.lo x6,$len // x6, $cnt, is zero at this point
798 add $inp,$inp,x6 // $inp is adjusted in such way that
799 // at exit from the loop $dat1-$dat2
800 // are loaded with last "words"
808 vld1.8 {$in0},[$inp],#16
815 vld1.8 {$in1},[$inp],#16
822 vld1.8 {$in2},[$inp],#16
826 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
828 veor $tmp0,$rndlast,$dat0
829 veor $tmp1,$rndlast,$dat1
830 veor $dat2,$dat2,$rndlast
831 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
832 vst1.8 {$tmp0},[$out],#16
834 vst1.8 {$tmp1},[$out],#16
836 vst1.8 {$dat2},[$out],#16
849 vld1.32 {q8},[$key_],#16
855 vld1.32 {q9},[$key_],#16
882 veor $tmp1,$rndlast,$dat1
883 veor $tmp2,$rndlast,$dat2
884 vst1.8 {$tmp1},[$out],#16
885 vst1.8 {$tmp2},[$out],#16
889 veor $tmp1,$rndlast,$dat2
890 vst1.8 {$tmp1},[$out],#16
897 vld1.8 {$dat1},[$inp],#16
898 subs $len,$len,#32 // bias
900 vorr $in1,$dat1,$dat1
901 vorr $dat2,$dat1,$dat1
906 vld1.8 {$dat2},[$inp],#16
908 $code.=<<___ if ($flavour =~ /64/);
912 vld1.8 {$dat3},[$inp],#16
913 vld1.8 {$dat4},[$inp],#16
914 sub $len,$len,#32 // bias
928 vld1.32 {q8},[$key_],#16
940 vld1.32 {q9},[$key_],#16
953 cmp $len,#0x40 // because .Lecb_tail4x
966 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
979 add $inp,$inp,x6 // $inp is adjusted in such way that
980 // at exit from the loop $dat1-$dat4
981 // are loaded with last "words"
982 add x6,$len,#0x60 // because .Lecb_tail4x
1029 vld1.8 {$in0},[$inp],#16
1031 vld1.8 {$in1},[$inp],#16
1033 vld1.8 {$in2},[$inp],#16
1035 vld1.8 {$in3},[$inp],#16
1037 vld1.8 {$in4},[$inp],#16
1039 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1040 veor $tmp0,$rndlast,$dat0
1041 vorr $dat0,$in0,$in0
1042 veor $tmp1,$rndlast,$dat1
1043 vorr $dat1,$in1,$in1
1044 veor $tmp2,$rndlast,$dat2
1045 vorr $dat2,$in2,$in2
1046 veor $tmp3,$rndlast,$dat3
1047 vorr $dat3,$in3,$in3
1048 veor $tmp4,$rndlast,$dat4
1049 vst1.8 {$tmp0},[$out],#16
1050 vorr $dat4,$in4,$in4
1051 vst1.8 {$tmp1},[$out],#16
1053 vst1.8 {$tmp2},[$out],#16
1054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1055 vst1.8 {$tmp3},[$out],#16
1056 vst1.8 {$tmp4},[$out],#16
1057 b.hs .Loop5x_ecb_dec
1063 subs $len,$len,#0x30
1064 vorr $dat0,$in2,$in2
1065 vorr $dat1,$in3,$in3
1066 vorr $dat2,$in4,$in4
1073 veor $tmp1,$rndlast,$dat1
1074 veor $tmp2,$rndlast,$dat2
1075 veor $tmp3,$rndlast,$dat3
1076 veor $tmp4,$rndlast,$dat4
1077 vst1.8 {$tmp1},[$out],#16
1078 vst1.8 {$tmp2},[$out],#16
1079 vst1.8 {$tmp3},[$out],#16
1080 vst1.8 {$tmp4},[$out],#16
1093 vld1.32 {q8},[$key_],#16
1101 vld1.32 {q9},[$key_],#16
1102 b.gt .Loop3x_ecb_dec
1110 subs $len,$len,#0x30
1111 mov.lo x6,$len // x6, $cnt, is zero at this point
1118 add $inp,$inp,x6 // $inp is adjusted in such way that
1119 // at exit from the loop $dat1-$dat2
1120 // are loaded with last "words"
1128 vld1.8 {$in0},[$inp],#16
1135 vld1.8 {$in1},[$inp],#16
1142 vld1.8 {$in2},[$inp],#16
1146 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1148 veor $tmp0,$rndlast,$dat0
1149 veor $tmp1,$rndlast,$dat1
1150 veor $dat2,$dat2,$rndlast
1151 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1152 vst1.8 {$tmp0},[$out],#16
1153 vorr $dat0,$in0,$in0
1154 vst1.8 {$tmp1},[$out],#16
1155 vorr $dat1,$in1,$in1
1156 vst1.8 {$dat2},[$out],#16
1157 vorr $dat2,$in2,$in2
1158 b.hs .Loop3x_ecb_dec
1169 vld1.32 {q8},[$key_],#16
1175 vld1.32 {q9},[$key_],#16
1202 veor $tmp1,$rndlast,$dat1
1203 veor $tmp2,$rndlast,$dat2
1204 vst1.8 {$tmp1},[$out],#16
1205 vst1.8 {$tmp2},[$out],#16
1209 veor $tmp1,$rndlast,$dat2
1210 vst1.8 {$tmp1},[$out],#16
1215 $code.=<<___ if ($flavour !~ /64/);
1217 ldmia sp!,{r4-r8,pc}
1219 $code.=<<___ if ($flavour =~ /64/);
1222 $code.=<<___ if ($flavour =~ /64/);
1227 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
1231 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1232 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1233 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1235 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
1236 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1238 ### q8-q15 preloaded key schedule
1241 .globl ${prefix}_cbc_encrypt
1242 .type ${prefix}_cbc_encrypt,%function
1244 ${prefix}_cbc_encrypt:
1246 $code.=<<___ if ($flavour =~ /64/);
1247 AARCH64_VALID_CALL_TARGET
1248 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1249 stp x29,x30,[sp,#-16]!
1252 $code.=<<___ if ($flavour !~ /64/);
1254 stmdb sp!,{r4-r8,lr}
1255 vstmdb sp!,{d8-d15} @ ABI specification says so
1256 ldmia ip,{r4-r5} @ load remaining args
1264 cmp $enc,#0 // en- or decrypting?
1265 ldr $rounds,[$key,#240]
1267 vld1.8 {$ivec},[$ivp]
1268 vld1.8 {$dat},[$inp],$step
1270 vld1.32 {q8-q9},[$key] // load key schedule...
1271 sub $rounds,$rounds,#6
1272 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1273 sub $rounds,$rounds,#2
1274 vld1.32 {q10-q11},[$key_],#32
1275 vld1.32 {q12-q13},[$key_],#32
1276 vld1.32 {q14-q15},[$key_],#32
1277 vld1.32 {$rndlast},[$key_]
1284 veor $dat,$dat,$ivec
1285 veor $rndzero_n_last,q8,$rndlast
1288 vld1.32 {$in0-$in1},[$key_]
1290 add $key4,$key,#16*4
1291 add $key5,$key,#16*5
1294 add $key6,$key,#16*6
1295 add $key7,$key,#16*7
1302 vst1.8 {$ivec},[$out],#16
1308 vld1.32 {q8},[$key4]
1312 vld1.32 {q9},[$key5]
1317 vld1.32 {q8},[$key6]
1320 vld1.32 {q9},[$key7]
1334 vld1.8 {q8},[$inp],$step
1337 veor q8,q8,$rndzero_n_last
1340 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1344 veor $ivec,$dat,$rndlast
1347 vst1.8 {$ivec},[$out],#16
1352 vld1.32 {$in0-$in1},[$key_]
1355 b .Lenter_cbc_enc128
1359 vst1.8 {$ivec},[$out],#16
1373 vld1.8 {q8},[$inp],$step
1380 veor q8,q8,$rndzero_n_last
1382 veor $ivec,$dat,$rndlast
1383 b.hs .Loop_cbc_enc128
1385 vst1.8 {$ivec},[$out],#16
1389 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1391 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1392 my ($dat4,$in4,$tmp4);
1393 if ($flavour =~ /64/) {
1394 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1400 vld1.8 {$dat2},[$inp],#16
1401 subs $len,$len,#32 // bias
1404 vorr $dat1,$dat,$dat
1405 vorr $in2,$dat2,$dat2
1408 vorr $dat1,$dat2,$dat2
1409 vld1.8 {$dat2},[$inp],#16
1411 vorr $in1,$dat1,$dat1
1412 vorr $in2,$dat2,$dat2
1414 $code.=<<___ if ($flavour =~ /64/);
1416 b.lo .Loop3x_cbc_dec
1418 vld1.8 {$dat3},[$inp],#16
1419 vld1.8 {$dat4},[$inp],#16
1420 sub $len,$len,#32 // bias
1422 vorr $in3,$dat3,$dat3
1423 vorr $in4,$dat4,$dat4
1436 vld1.32 {q8},[$key_],#16
1448 vld1.32 {q9},[$key_],#16
1449 b.gt .Loop5x_cbc_dec
1461 cmp $len,#0x40 // because .Lcbc_tail4x
1474 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1487 add $inp,$inp,x6 // $inp is adjusted in such way that
1488 // at exit from the loop $dat1-$dat4
1489 // are loaded with last "words"
1490 add x6,$len,#0x60 // because .Lcbc_tail4x
1536 veor $tmp0,$ivec,$rndlast
1538 veor $tmp1,$in0,$rndlast
1539 vld1.8 {$in0},[$inp],#16
1541 veor $tmp2,$in1,$rndlast
1542 vld1.8 {$in1},[$inp],#16
1544 veor $tmp3,$in2,$rndlast
1545 vld1.8 {$in2},[$inp],#16
1547 veor $tmp4,$in3,$rndlast
1548 vld1.8 {$in3},[$inp],#16
1550 vorr $ivec,$in4,$in4
1551 vld1.8 {$in4},[$inp],#16
1553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1554 veor $tmp0,$tmp0,$dat0
1555 vorr $dat0,$in0,$in0
1556 veor $tmp1,$tmp1,$dat1
1557 vorr $dat1,$in1,$in1
1558 veor $tmp2,$tmp2,$dat2
1559 vorr $dat2,$in2,$in2
1560 veor $tmp3,$tmp3,$dat3
1561 vorr $dat3,$in3,$in3
1562 veor $tmp4,$tmp4,$dat4
1563 vst1.8 {$tmp0},[$out],#16
1564 vorr $dat4,$in4,$in4
1565 vst1.8 {$tmp1},[$out],#16
1567 vst1.8 {$tmp2},[$out],#16
1568 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1569 vst1.8 {$tmp3},[$out],#16
1570 vst1.8 {$tmp4},[$out],#16
1571 b.hs .Loop5x_cbc_dec
1577 subs $len,$len,#0x30
1578 vorr $dat0,$in2,$in2
1580 vorr $dat1,$in3,$in3
1582 vorr $dat2,$in4,$in4
1590 veor $tmp1,$tmp0,$dat1
1591 veor $tmp2,$tmp2,$dat2
1592 veor $tmp3,$tmp3,$dat3
1593 veor $tmp4,$tmp4,$dat4
1594 vst1.8 {$tmp1},[$out],#16
1595 vst1.8 {$tmp2},[$out],#16
1596 vst1.8 {$tmp3},[$out],#16
1597 vst1.8 {$tmp4},[$out],#16
1610 vld1.32 {q8},[$key_],#16
1618 vld1.32 {q9},[$key_],#16
1619 b.gt .Loop3x_cbc_dec
1627 veor $tmp0,$ivec,$rndlast
1628 subs $len,$len,#0x30
1629 veor $tmp1,$in0,$rndlast
1630 mov.lo x6,$len // x6, $cnt, is zero at this point
1637 veor $tmp2,$in1,$rndlast
1638 add $inp,$inp,x6 // $inp is adjusted in such way that
1639 // at exit from the loop $dat1-$dat2
1640 // are loaded with last "words"
1641 vorr $ivec,$in2,$in2
1649 vld1.8 {$in0},[$inp],#16
1656 vld1.8 {$in1},[$inp],#16
1663 vld1.8 {$in2},[$inp],#16
1667 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1669 veor $tmp0,$tmp0,$dat0
1670 veor $tmp1,$tmp1,$dat1
1671 veor $dat2,$dat2,$tmp2
1672 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1673 vst1.8 {$tmp0},[$out],#16
1674 vorr $dat0,$in0,$in0
1675 vst1.8 {$tmp1},[$out],#16
1676 vorr $dat1,$in1,$in1
1677 vst1.8 {$dat2},[$out],#16
1678 vorr $dat2,$in2,$in2
1679 b.hs .Loop3x_cbc_dec
1690 vld1.32 {q8},[$key_],#16
1696 vld1.32 {q9},[$key_],#16
1716 veor $tmp1,$ivec,$rndlast
1721 veor $tmp2,$in1,$rndlast
1725 veor $tmp1,$tmp1,$dat1
1726 veor $tmp2,$tmp2,$dat2
1727 vorr $ivec,$in2,$in2
1728 vst1.8 {$tmp1},[$out],#16
1729 vst1.8 {$tmp2},[$out],#16
1733 veor $tmp1,$tmp1,$dat2
1734 vorr $ivec,$in2,$in2
1735 vst1.8 {$tmp1},[$out],#16
1738 vst1.8 {$ivec},[$ivp]
1742 $code.=<<___ if ($flavour !~ /64/);
1744 ldmia sp!,{r4-r8,pc}
1746 $code.=<<___ if ($flavour =~ /64/);
1751 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Argument and scratch register aliases for the 12x-unrolled EOR3 CTR path.
1756 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1757 my ($rounds,$roundsx,$cnt,$key_)=("w5","x5","w6","x7");
# Per-block counter values; $tctr7..$tctr11 live in w19-w23, which are
# callee-saved — the 12x path pushes x19-x24 to the stack before using them.
1758 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
1759 my ($tctr3,$tctr4,$tctr5,$tctr6)=map("w$_",(11,13..15));
1760 my ($tctr7,$tctr8,$tctr9,$tctr10,$tctr11)=map("w$_",(19..23));
# NEON allocation (after arm-xlate remapping, per the table below):
1762 # q0-q7 => v0-v7; q8-q23 => v16-v31; q24-q31 => v8-v15
1763 my ($ivec,$rndlast,$rndping,$rndpang)=map("q$_",(0..3));
1764 my ($in0,$in1,$in2,$in3,$in4,$in5)=map("q$_",(4..9));
1765 my ($in6,$in7,$in8,$in9,$in10,$in11)=map("q$_",(10..15));
1766 my ($dat0,$dat1,$dat2,$dat3,$dat4,$dat5)=map("q$_",(16..21));
1767 my ($dat6,$dat7,$dat8,$dat9,$dat10,$dat11)=map("q$_",(22..27));
# $tmp0-$tmp2 alias $dat9-$dat11 (q25-q27) — only live in the 3x tail.
1768 my ($tmp0,$tmp1,$tmp2)=map("q$_",(25..27));
# "q_N" aliases name whole q registers for ldp/stp pair load/store.
1770 #q_X => qX, for ldp & stp
1771 my ($in0q,$in1q,$in2q,$in3q)=map("q_$_",(4..7));
1772 my ($in4q,$in5q,$in6q,$in7q,$in8q,$in9q,$in10q,$in11q)=map("q_$_",(16..23));
# d8-d11 views of $dat8-$dat11 (q24-q27 => v8-v11 after remapping), used to
# spill/restore those callee-saved low halves around the 12x loop.
1774 my ($dat8d,$dat9d,$dat10d,$dat11d)=map("d$_",(8..11));
1776 $code.=<<___ if ($flavour =~ /64/);
1777 .globl ${prefix}_ctr32_encrypt_blocks_unroll12_eor3
1778 .type ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,%function
1780 ${prefix}_ctr32_encrypt_blocks_unroll12_eor3:
1781 AARCH64_VALID_CALL_TARGET
1782 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
1783 stp x29,x30,[sp,#-80]!
1785 stp d10,d11,[sp, #32]
1786 stp d12,d13,[sp, #48]
1787 stp d14,d15,[sp, #64]
1790 ldr $rounds,[$key,#240]
1792 ldr $ctr, [$ivp, #12]
1793 #ifdef __AARCH64EB__
1794 vld1.8 {$dat0},[$ivp]
1796 vld1.32 {$dat0},[$ivp]
1798 vld1.32 {$rndping-$rndpang},[$key] // load key schedule...
1799 sub $rounds,$rounds,#4
1801 add $key_,$key,$roundsx,lsl#4 // pointer to last round key
1802 sub $rounds,$rounds,#2
1803 add $key_, $key_, #64
1804 vld1.32 {$rndlast},[$key_]
1807 #ifndef __AARCH64EB__
1811 vorr $dat1,$dat0,$dat0
1812 add $tctr1, $ctr, #1
1813 vorr $dat2,$dat0,$dat0
1815 vorr $ivec,$dat0,$dat0
1817 vmov.32 ${dat1}[3],$tctr1
1818 b.ls .Lctr32_tail_unroll
1821 sub $len,$len,#3 // bias
1822 vmov.32 ${dat2}[3],$tctr2
1823 b.lo .Loop3x_ctr32_unroll
1825 vorr $dat3,$dat0,$dat0
1826 add $tctr3, $ctr, #1
1827 vorr $dat4,$dat0,$dat0
1828 add $tctr4, $ctr, #2
1830 vorr $dat5,$dat0,$dat0
1833 vmov.32 ${dat3}[3],$tctr3
1835 vmov.32 ${dat4}[3],$tctr4
1836 vmov.32 ${dat5}[3],$tctr5
1838 b.lo .Loop6x_ctr32_unroll
1840 // push regs to stack when 12 data chunks are interleaved
1841 stp x19,x20,[sp,#-16]!
1842 stp x21,x22,[sp,#-16]!
1843 stp x23,x24,[sp,#-16]!
1844 stp $dat8d,$dat9d,[sp,#-32]!
1845 stp $dat10d,$dat11d,[sp,#-32]!
1853 vorr $dat6,$dat0,$dat0
1855 vorr $dat7,$dat0,$dat0
1857 vorr $dat8,$dat0,$dat0
1859 vorr $dat9,$dat0,$dat0
1861 vorr $dat10,$dat0,$dat0
1863 vorr $dat11,$dat0,$dat0
1866 sub $len,$len,#6 // bias
1867 vmov.32 ${dat6}[3],$tctr6
1868 vmov.32 ${dat7}[3],$tctr7
1869 vmov.32 ${dat8}[3],$tctr8
1870 vmov.32 ${dat9}[3],$tctr9
1871 vmov.32 ${dat10}[3],$tctr10
1872 vmov.32 ${dat11}[3],$tctr11
1873 b .Loop12x_ctr32_unroll
1876 .Loop12x_ctr32_unroll:
1897 aese $dat10,$rndping
1899 aese $dat11,$rndping
1901 vld1.32 {$rndping},[$key_],#16
1923 aese $dat10,$rndpang
1925 aese $dat11,$rndpang
1927 vld1.32 {$rndpang},[$key_],#16
1928 b.gt .Loop12x_ctr32_unroll
1950 aese $dat10,$rndping
1952 aese $dat11,$rndping
1954 vld1.32 {$rndping},[$key_],#16
1976 aese $dat10,$rndpang
1978 aese $dat11,$rndpang
1980 vld1.32 {$rndpang},[$key_],#16
2009 add $tctr10,$ctr,#11
2010 add $tctr11,$ctr,#12
2026 aese $dat10,$rndping
2028 aese $dat11,$rndping
2030 vld1.32 {$rndping},[$key_],#16
2040 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64
2049 vld1.8 {$in4,$in5,$in6,$in7},[$inp],#64
2054 aese $dat10,$rndpang
2056 aese $dat11,$rndpang
2058 vld1.8 {$in8,$in9,$in10,$in11},[$inp],#64
2059 vld1.32 {$rndpang},[$key_],#16
2082 aese $dat10,$rndping
2084 aese $dat11,$rndping
2086 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2089 eor3 $in0,$in0,$rndlast,$dat0
2090 vorr $dat0,$ivec,$ivec
2092 eor3 $in1,$in1,$rndlast,$dat1
2093 vorr $dat1,$ivec,$ivec
2095 eor3 $in2,$in2,$rndlast,$dat2
2096 vorr $dat2,$ivec,$ivec
2098 eor3 $in3,$in3,$rndlast,$dat3
2099 vorr $dat3,$ivec,$ivec
2101 eor3 $in4,$in4,$rndlast,$dat4
2102 vorr $dat4,$ivec,$ivec
2104 eor3 $in5,$in5,$rndlast,$dat5
2105 vorr $dat5,$ivec,$ivec
2107 eor3 $in6,$in6,$rndlast,$dat6
2108 vorr $dat6,$ivec,$ivec
2110 eor3 $in7,$in7,$rndlast,$dat7
2111 vorr $dat7,$ivec,$ivec
2113 eor3 $in8,$in8,$rndlast,$dat8
2114 vorr $dat8,$ivec,$ivec
2116 eor3 $in9,$in9,$rndlast,$dat9
2117 vorr $dat9,$ivec,$ivec
2118 aese $dat10,$rndpang
2119 eor3 $in10,$in10,$rndlast,$dat10
2120 vorr $dat10,$ivec,$ivec
2121 aese $dat11,$rndpang
2122 eor3 $in11,$in11,$rndlast,$dat11
2123 vorr $dat11,$ivec,$ivec
2124 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2126 vmov.32 ${dat0}[3],$tctr0
2127 vmov.32 ${dat1}[3],$tctr1
2128 vmov.32 ${dat2}[3],$tctr2
2129 vmov.32 ${dat3}[3],$tctr3
2130 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64
2131 vmov.32 ${dat4}[3],$tctr4
2132 vmov.32 ${dat5}[3],$tctr5
2133 vmov.32 ${dat6}[3],$tctr6
2134 vmov.32 ${dat7}[3],$tctr7
2135 vst1.8 {$in4,$in5,$in6,$in7},[$out],#64
2136 vmov.32 ${dat8}[3],$tctr8
2137 vmov.32 ${dat9}[3],$tctr9
2138 vmov.32 ${dat10}[3],$tctr10
2139 vmov.32 ${dat11}[3],$tctr11
2140 vst1.8 {$in8,$in9,$in10,$in11},[$out],#64
2146 b.hs .Loop12x_ctr32_unroll
2148 // pop regs from stack when 12 data chunks are interleaved
2149 ldp $dat10d,$dat11d,[sp],#32
2150 ldp $dat8d,$dat9d,[sp],#32
2151 ldp x23,x24,[sp],#16
2152 ldp x21,x22,[sp],#16
2153 ldp x19,x20,[sp],#16
2156 cbz $len,.Lctr32_done_unroll
2160 b.ls .Lctr32_tail_unroll
2163 sub $len,$len,#3 // bias
2165 b.lo .Loop3x_ctr32_unroll
2169 b.lo .Loop6x_ctr32_unroll
2172 .Loop6x_ctr32_unroll:
2185 vld1.32 {$rndping},[$key_],#16
2199 vld1.32 {$rndpang},[$key_],#16
2200 b.gt .Loop6x_ctr32_unroll
2214 vld1.32 {$rndping},[$key_],#16
2228 vld1.32 {$rndpang},[$key_],#16
2254 vld1.32 {$rndping},[$key_],#16
2260 vld1.8 {$in0,$in1,$in2,$in3},[$inp],#64
2265 vld1.8 {$in4,$in5},[$inp],#32
2270 vld1.32 {$rndpang},[$key_],#16
2285 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2288 eor3 $in0,$in0,$rndlast,$dat0
2290 eor3 $in1,$in1,$rndlast,$dat1
2292 eor3 $in2,$in2,$rndlast,$dat2
2294 eor3 $in3,$in3,$rndlast,$dat3
2296 eor3 $in4,$in4,$rndlast,$dat4
2298 eor3 $in5,$in5,$rndlast,$dat5
2299 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2301 vorr $dat0,$ivec,$ivec
2302 vorr $dat1,$ivec,$ivec
2303 vorr $dat2,$ivec,$ivec
2304 vorr $dat3,$ivec,$ivec
2305 vorr $dat4,$ivec,$ivec
2306 vorr $dat5,$ivec,$ivec
2308 vmov.32 ${dat0}[3],$tctr0
2309 vmov.32 ${dat1}[3],$tctr1
2310 vst1.8 {$in0,$in1,$in2,$in3},[$out],#64
2311 vmov.32 ${dat2}[3],$tctr2
2312 vmov.32 ${dat3}[3],$tctr3
2313 vst1.8 {$in4,$in5},[$out],#32
2314 vmov.32 ${dat4}[3],$tctr4
2315 vmov.32 ${dat5}[3],$tctr5
2317 cbz $len,.Lctr32_done_unroll
2321 b.ls .Lctr32_tail_unroll
2323 sub $len,$len,#3 // bias
2325 b .Loop3x_ctr32_unroll
2328 .Loop3x_ctr32_unroll:
2335 vld1.32 {$rndping},[$key_],#16
2343 vld1.32 {$rndpang},[$key_],#16
2344 b.gt .Loop3x_ctr32_unroll
2350 vld1.8 {$in0,$in1,$in2},[$inp],#48
2351 vorr $dat0,$ivec,$ivec
2354 vld1.32 {$rndping},[$key_],#16
2355 vorr $dat1,$ivec,$ivec
2362 vld1.32 {$rndpang},[$key_],#16
2363 vorr $dat2,$ivec,$ivec
2372 vld1.32 {$rndping},[$key_],#16
2382 vld1.32 {$rndpang},[$key_],#16
2383 vmov.32 ${dat0}[3], $tctr0
2391 vmov.32 ${dat1}[3], $tctr1
2395 vmov.32 ${dat2}[3], $tctr2
2401 eor3 $in0,$in0,$rndlast,$tmp0
2402 vld1.32 {$rndping},[$key_],#16 // re-pre-load rndkey[0]
2403 eor3 $in1,$in1,$rndlast,$tmp1
2405 eor3 $in2,$in2,$rndlast,$tmp2
2406 vld1.32 {$rndpang},[$key_],#16 // re-pre-load rndkey[1]
2407 vst1.8 {$in0,$in1,$in2},[$out],#48
2409 cbz $len,.Lctr32_done_unroll
2411 .Lctr32_tail_unroll:
2413 b.eq .Lctr32_tail_1_unroll
2415 .Lctr32_tail_2_unroll:
2420 vld1.32 {$rndping},[$key_],#16
2426 vld1.32 {$rndpang},[$key_],#16
2427 b.gt .Lctr32_tail_2_unroll
2433 vld1.32 {$rndping},[$key_],#16
2438 vld1.32 {$rndpang},[$key_],#16
2439 vld1.8 {$in0,$in1},[$inp],#32
2444 vld1.32 {$rndping},[$key_],#16
2449 vld1.32 {$rndpang},[$key_],#16
2457 eor3 $in0,$in0,$rndlast,$dat0
2458 eor3 $in1,$in1,$rndlast,$dat1
2459 vst1.8 {$in0,$in1},[$out],#32
2460 b .Lctr32_done_unroll
2462 .Lctr32_tail_1_unroll:
2465 vld1.32 {$rndping},[$key_],#16
2469 vld1.32 {$rndpang},[$key_],#16
2470 b.gt .Lctr32_tail_1_unroll
2474 vld1.32 {$rndping},[$key_],#16
2477 vld1.32 {$rndpang},[$key_],#16
2478 vld1.8 {$in0},[$inp]
2481 vld1.32 {$rndping},[$key_],#16
2484 vld1.32 {$rndpang},[$key_],#16
2489 eor3 $in0,$in0,$rndlast,$dat0
2490 vst1.8 {$in0},[$out],#16
2492 .Lctr32_done_unroll:
2494 ldp d10,d11,[sp, #32]
2495 ldp d12,d13,[sp, #48]
2496 ldp d14,d15,[sp, #64] // restore d14,d15 — must mirror the prologue's "stp d14,d15,[sp,#64]"; the previous "ldp d15,d16" left callee-saved d14 clobbered and overwrote d16
2499 .size ${prefix}_ctr32_encrypt_blocks_unroll12_eor3,.-${prefix}_ctr32_encrypt_blocks_unroll12_eor3
2504 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
2505 my ($rounds,$cnt,$key_)=("w5","w6","x7");
2506 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
2507 my $step="x12"; # aliases with $tctr2
2509 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
2510 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2512 # used only in 64-bit mode...
2513 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
2515 my ($dat,$tmp)=($dat0,$tmp0);
2517 ### q8-q15 preloaded key schedule
2520 .globl ${prefix}_ctr32_encrypt_blocks
2521 .type ${prefix}_ctr32_encrypt_blocks,%function
2523 ${prefix}_ctr32_encrypt_blocks:
2525 $code.=<<___ if ($flavour =~ /64/);
2526 AARCH64_VALID_CALL_TARGET
2527 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
2528 stp x29,x30,[sp,#-16]!
2531 $code.=<<___ if ($flavour !~ /64/);
2533 stmdb sp!,{r4-r10,lr}
2534 vstmdb sp!,{d8-d15} @ ABI specification says so
2535 ldr r4, [ip] @ load remaining arg
2538 ldr $rounds,[$key,#240]
2540 ldr $ctr, [$ivp, #12]
2542 vld1.8 {$dat0},[$ivp]
2544 vld1.32 {$dat0},[$ivp]
2546 vld1.32 {q8-q9},[$key] // load key schedule...
2547 sub $rounds,$rounds,#4
2550 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
2551 sub $rounds,$rounds,#2
2552 vld1.32 {q12-q13},[$key_],#32
2553 vld1.32 {q14-q15},[$key_],#32
2554 vld1.32 {$rndlast},[$key_]
2562 $code.=<<___ if ($flavour =~ /64/);
2563 vorr $dat1,$dat0,$dat0
2564 add $tctr1, $ctr, #1
2565 vorr $dat2,$dat0,$dat0
2567 vorr $ivec,$dat0,$dat0
2569 vmov.32 ${dat1}[3],$tctr1
2572 sub $len,$len,#3 // bias
2573 vmov.32 ${dat2}[3],$tctr2
2575 $code.=<<___ if ($flavour !~ /64/);
2576 add $tctr1, $ctr, #1
2577 vorr $ivec,$dat0,$dat0
2579 vmov.32 ${ivec}[3],$tctr1
2581 vorr $dat1,$ivec,$ivec
2584 vmov.32 ${ivec}[3],$tctr2
2585 sub $len,$len,#3 // bias
2586 vorr $dat2,$ivec,$ivec
2588 $code.=<<___ if ($flavour =~ /64/);
2594 vorr $dat3,$dat0,$dat0
2596 vorr $dat4,$dat0,$dat0
2598 vmov.32 ${dat3}[3],w13
2599 sub $len,$len,#2 // bias
2600 vmov.32 ${dat4}[3],w14
2616 vld1.32 {q8},[$key_],#16
2628 vld1.32 {q9},[$key_],#16
2642 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2654 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2690 vld1.8 {$in0},[$inp],#16
2693 vld1.8 {$in1},[$inp],#16
2696 vld1.8 {$in2},[$inp],#16
2699 vld1.8 {$in3},[$inp],#16
2702 vld1.8 {$in4},[$inp],#16
2705 veor $in0,$in0,$rndlast
2707 veor $in1,$in1,$rndlast
2709 veor $in2,$in2,$rndlast
2711 veor $in3,$in3,$rndlast
2713 veor $in4,$in4,$rndlast
2715 veor $in0,$in0,$dat0
2716 vorr $dat0,$ivec,$ivec
2717 veor $in1,$in1,$dat1
2718 vorr $dat1,$ivec,$ivec
2719 veor $in2,$in2,$dat2
2720 vorr $dat2,$ivec,$ivec
2721 veor $in3,$in3,$dat3
2722 vorr $dat3,$ivec,$ivec
2723 veor $in4,$in4,$dat4
2724 vorr $dat4,$ivec,$ivec
2726 vst1.8 {$in0},[$out],#16
2727 vmov.32 ${dat0}[3],$tctr0
2728 vst1.8 {$in1},[$out],#16
2729 vmov.32 ${dat1}[3],$tctr1
2730 vst1.8 {$in2},[$out],#16
2731 vmov.32 ${dat2}[3],$tctr2
2732 vst1.8 {$in3},[$out],#16
2733 vmov.32 ${dat3}[3],w13
2734 vst1.8 {$in4},[$out],#16
2735 vmov.32 ${dat4}[3],w14
2738 cbz $len,.Lctr32_done
2752 sub $len,$len,#3 // bias
2766 vld1.32 {q8},[$key_],#16
2774 vld1.32 {q9},[$key_],#16
2781 vld1.8 {$in0},[$inp],#16
2783 $code.=<<___ if ($flavour =~ /64/);
2784 vorr $dat0,$ivec,$ivec
2786 $code.=<<___ if ($flavour !~ /64/);
2792 vld1.8 {$in1},[$inp],#16
2794 $code.=<<___ if ($flavour =~ /64/);
2795 vorr $dat1,$ivec,$ivec
2797 $code.=<<___ if ($flavour !~ /64/);
2805 vld1.8 {$in2},[$inp],#16
2810 $code.=<<___ if ($flavour =~ /64/);
2811 vorr $dat2,$ivec,$ivec
2819 veor $in0,$in0,$rndlast
2823 veor $in1,$in1,$rndlast
2829 veor $in2,$in2,$rndlast
2831 $code.=<<___ if ($flavour =~ /64/);
2835 vmov.32 ${dat0}[3], $tctr0
2837 $code.=<<___ if ($flavour !~ /64/);
2838 vmov.32 ${ivec}[3], $tctr0
2841 vorr $dat0,$ivec,$ivec
2848 $code.=<<___ if ($flavour !~ /64/);
2849 vmov.32 ${ivec}[3], $tctr1
2856 $code.=<<___ if ($flavour =~ /64/);
2857 vmov.32 ${dat1}[3], $tctr1
2861 vmov.32 ${dat2}[3], $tctr2
2863 $code.=<<___ if ($flavour !~ /64/);
2864 vorr $dat1,$ivec,$ivec
2865 vmov.32 ${ivec}[3], $tctr2
2868 vorr $dat2,$ivec,$ivec
2876 veor $in0,$in0,$tmp0
2877 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2878 vst1.8 {$in0},[$out],#16
2879 veor $in1,$in1,$tmp1
2881 vst1.8 {$in1},[$out],#16
2882 veor $in2,$in2,$tmp2
2883 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2884 vst1.8 {$in2},[$out],#16
2898 vld1.32 {q8},[$key_],#16
2904 vld1.32 {q9},[$key_],#16
2915 vld1.8 {$in0},[$inp],$step
2920 vld1.8 {$in1},[$inp]
2925 veor $in0,$in0,$rndlast
2930 veor $in1,$in1,$rndlast
2935 veor $in0,$in0,$dat0
2936 veor $in1,$in1,$dat1
2937 vst1.8 {$in0},[$out],#16
2939 vst1.8 {$in1},[$out]
2943 $code.=<<___ if ($flavour !~ /64/);
2945 ldmia sp!,{r4-r10,pc}
2947 $code.=<<___ if ($flavour =~ /64/);
2952 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2955 # Performance in cycles per byte.
2956 # Processed with AES-XTS different key size.
2957 # It shows the value before and after optimization as below:
2960 # AES-128-XTS AES-256-XTS
2961 # Cortex-A57 3.36/1.09 4.02/1.37
2962 # Cortex-A72 3.03/1.02 3.28/1.33
2964 # Optimization is implemented by loop unrolling and interleaving.
2965 # Commonly we choose 5 as the unrolling factor; if the input
2966 # data size is smaller than 5 blocks, but not smaller than 3 blocks,
2967 # we choose 3 as the unrolling factor.
2968 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2969 # as one iteration, every loop the left size lsize -= 5*16.
2970 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2971 # will be processed specially, which be integrated into the 5*16 bytes
2972 # loop to improve the efficiency.
2973 # There is one special case, if the original input data size dsize
2974 # = 16 bytes, we will treat it separately to improve the
2975 # performance: one independent code block without LR, FP load and
2977 # Encryption will process the (length -tailcnt) bytes as mentioned
2978 # previously, then encrypt the composite block as last second
2980 # Decryption will process the (length -tailcnt -1) bytes as mentioned
2981 # previously, then decrypt the last second cipher block to get the
2982 # last plain block(tail), decrypt the composite block as last second
# Register allocation for ${prefix}_xts_encrypt.
2986 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# $ivl/$ivh: low/high 64 bits of the current XTS tweak kept in general
# registers for the shift-and-reduce tweak update.
2987 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Pointers and byte registers used by the cipher-stealing tail code.
2988 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt = input length % 16; $constnum holds the GF(2^128) reduction
# constant (0x87, per the commented "mov $constnum,#0x87" in the body).
2989 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2990 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2991 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak vectors, one per block of the 5x interleaved loop.
2992 my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
# d-register views of the tweak vectors; used for stp/ldp spills of the
# callee-saved d8-d11 halves.
2993 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2994 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2996 my ($tmpin)=("v26.16b");
2997 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Round-key register layout (see also the notes below):
3000 # q10-q15, q7 Last 7 round keys
3001 # q8-q9 preloaded round keys except last 7 keys for big size
3002 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
3005 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3007 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
3008 my ($dat4,$in4,$tmp4);
# In 64-bit mode the extra blocks get dedicated high registers.
3009 if ($flavour =~ /64/) {
3010 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3013 $code.=<<___ if ($flavour =~ /64/);
3014 .globl ${prefix}_xts_encrypt
3015 .type ${prefix}_xts_encrypt,%function
3017 ${prefix}_xts_encrypt:
3019 $code.=<<___ if ($flavour =~ /64/);
3020 AARCH64_VALID_CALL_TARGET
3022 // Original input data size bigger than 16, jump to big size processing.
3023 b.ne .Lxts_enc_big_size
3024 // Encrypt the iv with key2, as the first XEX iv.
3025 ldr $rounds,[$key2,#240]
3026 vld1.32 {$dat},[$key2],#16
3027 vld1.8 {$iv0},[$ivp]
3028 sub $rounds,$rounds,#2
3029 vld1.32 {$dat1},[$key2],#16
3034 vld1.32 {$dat},[$key2],#16
3035 subs $rounds,$rounds,#2
3038 vld1.32 {$dat1},[$key2],#16
3039 b.gt .Loop_enc_iv_enc
3043 vld1.32 {$dat},[$key2]
3047 vld1.8 {$dat0},[$inp]
3048 veor $dat0,$iv0,$dat0
3050 ldr $rounds,[$key1,#240]
3051 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
3055 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
3058 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
3060 .Lxts_enc_round_loop:
3063 vld1.32 {q8},[$key1],#16 // load key schedule...
3066 vld1.32 {q9},[$key1],#16 // load key schedule...
3067 subs $rounds,$rounds,#2 // bias
3068 b.gt .Lxts_enc_round_loop
3070 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
3075 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
3080 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
3085 vld1.32 {$rndlast},[$key1]
3089 veor $dat0,$dat0,$rndlast
3090 veor $dat0,$dat0,$iv0
3091 vst1.8 {$dat0},[$out]
3092 b .Lxts_enc_final_abort
3097 $code.=<<___ if ($flavour =~ /64/);
3098 stp $constnumx,$tmpinp,[sp,#-64]!
3099 stp $tailcnt,$midnumx,[sp,#48]
3100 stp $ivd10,$ivd20,[sp,#32]
3101 stp $ivd30,$ivd40,[sp,#16]
3103 // tailcnt store the tail value of length%16.
3104 and $tailcnt,$len,#0xf
3109 csel $step,xzr,$step,eq
3111 // Firstly, encrypt the iv with key2, as the first iv of XEX.
3112 ldr $rounds,[$key2,#240]
3113 vld1.32 {$dat},[$key2],#16
3114 vld1.8 {$iv0},[$ivp]
3115 sub $rounds,$rounds,#2
3116 vld1.32 {$dat1},[$key2],#16
3121 vld1.32 {$dat},[$key2],#16
3122 subs $rounds,$rounds,#2
3125 vld1.32 {$dat1},[$key2],#16
3130 vld1.32 {$dat},[$key2]
3134 // The iv for second block
3135 // $ivl- iv(low), $ivh - iv(high)
3136 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3140 extr $midnumx,$ivh,$ivh,#32
3141 extr $ivh,$ivh,$ivl,#63
3142 and $tmpmw,$constnum,$midnum,asr#31
3143 eor $ivl,$tmpmx,$ivl,lsl#1
3147 ldr $rounds0,[$key1,#240] // next starting point
3148 vld1.8 {$dat},[$inp],$step
3150 vld1.32 {q8-q9},[$key1] // load key schedule...
3151 sub $rounds0,$rounds0,#6
3152 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3153 sub $rounds0,$rounds0,#2
3154 vld1.32 {q10-q11},[$key_],#32
3155 vld1.32 {q12-q13},[$key_],#32
3156 vld1.32 {q14-q15},[$key_],#32
3157 vld1.32 {$rndlast},[$key_]
3160 mov $rounds,$rounds0
3164 vld1.8 {$dat2},[$inp],#16
3165 subs $len,$len,#32 // bias
3166 add $rounds,$rounds0,#2
3168 vorr $dat1,$dat,$dat
3170 vorr $in2,$dat2,$dat2
3171 vorr $in4,$dat2,$dat2
3172 b.lo .Lxts_inner_enc_tail
3173 veor $dat,$dat,$iv0 // before encryption, xor with iv
3174 veor $dat2,$dat2,$iv1
3176 // The iv for third block
3177 extr $midnumx,$ivh,$ivh,#32
3178 extr $ivh,$ivh,$ivl,#63
3179 and $tmpmw,$constnum,$midnum,asr#31
3180 eor $ivl,$tmpmx,$ivl,lsl#1
3185 vorr $dat1,$dat2,$dat2
3186 vld1.8 {$dat2},[$inp],#16
3188 vorr $in1,$dat1,$dat1
3189 veor $in2,$dat2,$iv2 // the third block
3190 veor $dat2,$dat2,$iv2
3192 b.lo .Lxts_outer_enc_tail
3194 // The iv for fourth block
3195 extr $midnumx,$ivh,$ivh,#32
3196 extr $ivh,$ivh,$ivl,#63
3197 and $tmpmw,$constnum,$midnum,asr#31
3198 eor $ivl,$tmpmx,$ivl,lsl#1
3202 vld1.8 {$dat3},[$inp],#16
3203 // The iv for fifth block
3204 extr $midnumx,$ivh,$ivh,#32
3205 extr $ivh,$ivh,$ivl,#63
3206 and $tmpmw,$constnum,$midnum,asr#31
3207 eor $ivl,$tmpmx,$ivl,lsl#1
3211 vld1.8 {$dat4},[$inp],#16
3212 veor $dat3,$dat3,$iv3 // the fourth block
3213 veor $dat4,$dat4,$iv4
3214 sub $len,$len,#32 // bias
3215 mov $rounds,$rounds0
3230 vld1.32 {q8},[$key_],#16
3231 subs $rounds,$rounds,#2
3242 vld1.32 {q9},[$key_],#16
3243 b.gt .Loop5x_xts_enc
3255 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
3267 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3280 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3281 // at exit from the loop v1.16b-v26.16b
3282 // are loaded with last "words"
3283 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
3329 veor $tmp0,$rndlast,$iv0
3331 // The iv for first block of one iteration
3332 extr $midnumx,$ivh,$ivh,#32
3333 extr $ivh,$ivh,$ivl,#63
3334 and $tmpmw,$constnum,$midnum,asr#31
3335 eor $ivl,$tmpmx,$ivl,lsl#1
3338 veor $tmp1,$rndlast,$iv1
3339 vld1.8 {$in0},[$inp],#16
3341 // The iv for second block
3342 extr $midnumx,$ivh,$ivh,#32
3343 extr $ivh,$ivh,$ivl,#63
3344 and $tmpmw,$constnum,$midnum,asr#31
3345 eor $ivl,$tmpmx,$ivl,lsl#1
3348 veor $tmp2,$rndlast,$iv2
3349 vld1.8 {$in1},[$inp],#16
3351 // The iv for third block
3352 extr $midnumx,$ivh,$ivh,#32
3353 extr $ivh,$ivh,$ivl,#63
3354 and $tmpmw,$constnum,$midnum,asr#31
3355 eor $ivl,$tmpmx,$ivl,lsl#1
3358 veor $tmp3,$rndlast,$iv3
3359 vld1.8 {$in2},[$inp],#16
3361 // The iv for fourth block
3362 extr $midnumx,$ivh,$ivh,#32
3363 extr $ivh,$ivh,$ivl,#63
3364 and $tmpmw,$constnum,$midnum,asr#31
3365 eor $ivl,$tmpmx,$ivl,lsl#1
3368 veor $tmp4,$rndlast,$iv4
3369 vld1.8 {$in3},[$inp],#16
3372 // The iv for fifth block
3373 extr $midnumx,$ivh,$ivh,#32
3374 extr $ivh,$ivh,$ivl,#63
3375 and $tmpmw,$constnum,$midnum,asr #31
3376 eor $ivl,$tmpmx,$ivl,lsl #1
3380 vld1.8 {$in4},[$inp],#16
3381 cbz $xoffset,.Lxts_enc_tail4x
3382 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3383 veor $tmp0,$tmp0,$dat0
3384 veor $dat0,$in0,$iv0
3385 veor $tmp1,$tmp1,$dat1
3386 veor $dat1,$in1,$iv1
3387 veor $tmp2,$tmp2,$dat2
3388 veor $dat2,$in2,$iv2
3389 veor $tmp3,$tmp3,$dat3
3390 veor $dat3,$in3,$iv3
3391 veor $tmp4,$tmp4,$dat4
3392 vst1.8 {$tmp0},[$out],#16
3393 veor $dat4,$in4,$iv4
3394 vst1.8 {$tmp1},[$out],#16
3395 mov $rounds,$rounds0
3396 vst1.8 {$tmp2},[$out],#16
3397 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3398 vst1.8 {$tmp3},[$out],#16
3399 vst1.8 {$tmp4},[$out],#16
3400 b.hs .Loop5x_xts_enc
3403 // If left 4 blocks, borrow the five block's processing.
3405 b.ne .Loop5x_enc_after
3412 veor $dat0,$iv0,$in0
3413 veor $dat1,$iv1,$in1
3414 veor $dat2,$in2,$iv2
3415 veor $dat3,$in3,$iv3
3416 veor $dat4,$in4,$iv4
3417 b.eq .Loop5x_xts_enc
3421 cbz $len,.Lxts_enc_done
3423 add $rounds,$rounds0,#2
3424 subs $len,$len,#0x30
3425 b.lo .Lxts_inner_enc_tail
3427 veor $dat0,$iv0,$in2
3428 veor $dat1,$iv1,$in3
3429 veor $dat2,$in4,$iv2
3430 b .Lxts_outer_enc_tail
3435 veor $tmp1,$dat1,$tmp1
3436 vst1.8 {$tmp1},[$out],#16
3437 veor $tmp2,$dat2,$tmp2
3438 vst1.8 {$tmp2},[$out],#16
3439 veor $tmp3,$dat3,$tmp3
3440 veor $tmp4,$dat4,$tmp4
3441 vst1.8 {$tmp3-$tmp4},[$out],#32
3445 .Lxts_outer_enc_tail:
3452 vld1.32 {q8},[$key_],#16
3453 subs $rounds,$rounds,#2
3460 vld1.32 {q9},[$key_],#16
3461 b.gt .Lxts_outer_enc_tail
3469 veor $tmp0,$iv0,$rndlast
3470 subs $len,$len,#0x30
3471 // The iv for first block
3474 //mov $constnum,#0x87
3475 extr $midnumx,$ivh,$ivh,#32
3476 extr $ivh,$ivh,$ivl,#63
3477 and $tmpmw,$constnum,$midnum,asr#31
3478 eor $ivl,$tmpmx,$ivl,lsl#1
3481 veor $tmp1,$iv1,$rndlast
3482 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3489 veor $tmp2,$iv2,$rndlast
3491 add $xoffset,$xoffset,#0x20
3492 add $inp,$inp,$xoffset
3516 vld1.8 {$in2},[$inp],#16
3517 add $rounds,$rounds0,#2
3518 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3519 veor $tmp0,$tmp0,$dat0
3520 veor $tmp1,$tmp1,$dat1
3521 veor $dat2,$dat2,$tmp2
3522 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3523 vst1.8 {$tmp0},[$out],#16
3524 vst1.8 {$tmp1},[$out],#16
3525 vst1.8 {$dat2},[$out],#16
3533 .Lxts_inner_enc_tail:
3535 veor $dat1,$in3,$iv0
3536 veor $dat2,$in4,$iv1
3537 b.eq .Lxts_enc_tail_loop
3538 veor $dat2,$in4,$iv0
3539 .Lxts_enc_tail_loop:
3544 vld1.32 {q8},[$key_],#16
3545 subs $rounds,$rounds,#2
3550 vld1.32 {q9},[$key_],#16
3551 b.gt .Lxts_enc_tail_loop
3570 veor $tmp1,$iv0,$rndlast
3575 veor $tmp2,$iv1,$rndlast
3579 veor $tmp1,$tmp1,$dat1
3580 vst1.8 {$tmp1},[$out],#16
3581 veor $tmp2,$tmp2,$dat2
3583 vst1.8 {$tmp2},[$out],#16
3587 extr $midnumx,$ivh,$ivh,#32
3588 extr $ivh,$ivh,$ivl,#63
3589 and $tmpmw,$constnum,$midnum,asr #31
3590 eor $ivl,$tmpmx,$ivl,lsl #1
3596 veor $tmp1,$tmp1,$dat2
3598 vst1.8 {$tmp1},[$out],#16
3602 extr $midnumx,$ivh,$ivh,#32
3603 extr $ivh,$ivh,$ivl,#63
3604 and $tmpmw,$constnum,$midnum,asr #31
3605 eor $ivl,$tmpmx,$ivl,lsl #1
3611 // Process the tail block with cipher stealing.
3618 .composite_enc_loop:
3619 subs $tailcnt,$tailcnt,#1
3620 ldrb $l2outp,[$out,$tailcnt]
3621 ldrb $loutp,[$tmpinp,$tailcnt]
3622 strb $l2outp,[$tmpoutp,$tailcnt]
3623 strb $loutp,[$out,$tailcnt]
3624 b.gt .composite_enc_loop
3625 .Lxts_enc_load_done:
3626 vld1.8 {$tmpin},[$out]
3627 veor $tmpin,$tmpin,$iv0
3629 // Encrypt the composite block to get the last second encrypted text block
3630 ldr $rounds,[$key1,#240] // load key schedule...
3631 vld1.32 {$dat},[$key1],#16
3632 sub $rounds,$rounds,#2
3633 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3637 vld1.32 {$dat0},[$key1],#16
3638 subs $rounds,$rounds,#2
3641 vld1.32 {$dat1},[$key1],#16
3642 b.gt .Loop_final_enc
3646 vld1.32 {$dat0},[$key1]
3648 veor $tmpin,$tmpin,$dat0
3649 veor $tmpin,$tmpin,$iv0
3650 vst1.8 {$tmpin},[$out]
3653 ldp $tailcnt,$midnumx,[sp,#48]
3654 ldp $ivd10,$ivd20,[sp,#32]
3655 ldp $ivd30,$ivd40,[sp,#16]
3656 ldp $constnumx,$tmpinp,[sp],#64
3657 .Lxts_enc_final_abort:
3659 .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
# Register allocation for ${prefix}_xts_decrypt; deliberately mirrors
# the encrypt side above so the two bodies stay diffable.
3664 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
# $ivl/$ivh: low/high 64 bits of the current XTS tweak in general
# registers for the shift-and-reduce tweak update.
3665 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
# Pointers and byte registers used by the cipher-stealing tail code.
3666 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt = input length % 16; $constnum holds the GF(2^128) reduction
# constant used when doubling the tweak.
3667 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
3668 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
3669 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five tweak vectors (one per interleaved block) plus the scratch
# vector for the final composite block.
3670 my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
# d-register views of the tweaks for stack spills of d8-d11.
3671 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
3672 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
3674 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Round-key register layout:
3677 # q10-q15, q7 Last 7 round keys
3678 # q8-q9 preloaded round keys except last 7 keys for big size
3679 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
3682 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
3684 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
3685 my ($dat4,$in4,$tmp4);
# In 64-bit mode the extra blocks get dedicated high registers.
3686 if ($flavour =~ /64/) {
3687 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
3690 $code.=<<___ if ($flavour =~ /64/);
3691 .globl ${prefix}_xts_decrypt
3692 .type ${prefix}_xts_decrypt,%function
3694 ${prefix}_xts_decrypt:
3695 AARCH64_VALID_CALL_TARGET
3697 $code.=<<___ if ($flavour =~ /64/);
3699 // Original input data size bigger than 16, jump to big size processing.
3700 b.ne .Lxts_dec_big_size
3701 // Encrypt the iv with key2, as the first XEX iv.
3702 ldr $rounds,[$key2,#240]
3703 vld1.32 {$dat},[$key2],#16
3704 vld1.8 {$iv0},[$ivp]
3705 sub $rounds,$rounds,#2
3706 vld1.32 {$dat1},[$key2],#16
3708 .Loop_dec_small_iv_enc:
3711 vld1.32 {$dat},[$key2],#16
3712 subs $rounds,$rounds,#2
3715 vld1.32 {$dat1},[$key2],#16
3716 b.gt .Loop_dec_small_iv_enc
3720 vld1.32 {$dat},[$key2]
3724 vld1.8 {$dat0},[$inp]
3725 veor $dat0,$iv0,$dat0
3727 ldr $rounds,[$key1,#240]
3728 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
3732 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
3735 subs $rounds,$rounds,#10 // bias
3737 .Lxts_dec_round_loop:
3740 vld1.32 {q8},[$key1],#16 // load key schedule...
3743 vld1.32 {q9},[$key1],#16 // load key schedule...
3744 subs $rounds,$rounds,#2 // bias
3745 b.gt .Lxts_dec_round_loop
3747 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
3752 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
3757 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
3762 vld1.32 {$rndlast},[$key1]
3766 veor $dat0,$dat0,$rndlast
3767 veor $dat0,$iv0,$dat0
3768 vst1.8 {$dat0},[$out]
3769 b .Lxts_dec_final_abort
3772 $code.=<<___ if ($flavour =~ /64/);
3773 stp $constnumx,$tmpinp,[sp,#-64]!
3774 stp $tailcnt,$midnumx,[sp,#48]
3775 stp $ivd10,$ivd20,[sp,#32]
3776 stp $ivd30,$ivd40,[sp,#16]
3778 and $tailcnt,$len,#0xf
3782 b.lo .Lxts_dec_abort
3784 // Encrypt the iv with key2, as the first XEX iv
3785 ldr $rounds,[$key2,#240]
3786 vld1.32 {$dat},[$key2],#16
3787 vld1.8 {$iv0},[$ivp]
3788 sub $rounds,$rounds,#2
3789 vld1.32 {$dat1},[$key2],#16
3794 vld1.32 {$dat},[$key2],#16
3795 subs $rounds,$rounds,#2
3798 vld1.32 {$dat1},[$key2],#16
3799 b.gt .Loop_dec_iv_enc
3803 vld1.32 {$dat},[$key2]
3807 // The iv for second block
3808 // $ivl- iv(low), $ivh - iv(high)
3809 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
3813 extr $midnumx,$ivh,$ivh,#32
3814 extr $ivh,$ivh,$ivl,#63
3815 and $tmpmw,$constnum,$midnum,asr #31
3816 eor $ivl,$tmpmx,$ivl,lsl #1
3820 ldr $rounds0,[$key1,#240] // load rounds number
3822 // The iv for third block
3823 extr $midnumx,$ivh,$ivh,#32
3824 extr $ivh,$ivh,$ivl,#63
3825 and $tmpmw,$constnum,$midnum,asr #31
3826 eor $ivl,$tmpmx,$ivl,lsl #1
3830 vld1.32 {q8-q9},[$key1] // load key schedule...
3831 sub $rounds0,$rounds0,#6
3832 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3833 sub $rounds0,$rounds0,#2
3834 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3835 vld1.32 {q12-q13},[$key_],#32
3836 vld1.32 {q14-q15},[$key_],#32
3837 vld1.32 {$rndlast},[$key_]
3839 // The iv for fourth block
3840 extr $midnumx,$ivh,$ivh,#32
3841 extr $ivh,$ivh,$ivl,#63
3842 and $tmpmw,$constnum,$midnum,asr #31
3843 eor $ivl,$tmpmx,$ivl,lsl #1
3848 mov $rounds,$rounds0
3855 b.eq .Lxts_dec_begin
3857 csel $step,xzr,$step,eq
3858 vld1.8 {$dat},[$inp],#16
3862 vld1.8 {$dat},[$inp],$step
3863 subs $len,$len,#32 // bias
3864 add $rounds,$rounds0,#2
3866 vorr $dat1,$dat,$dat
3868 vld1.8 {$dat2},[$inp],#16
3869 vorr $in2,$dat2,$dat2
3870 vorr $in4,$dat2,$dat2
3871 b.lo .Lxts_inner_dec_tail
3872 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3873 veor $dat2,$dat2,$iv1
3875 vorr $dat1,$dat2,$dat2
3876 vld1.8 {$dat2},[$inp],#16
3878 vorr $in1,$dat1,$dat1
3879 veor $in2,$dat2,$iv2 // third block xor with third iv
3880 veor $dat2,$dat2,$iv2
3882 b.lo .Lxts_outer_dec_tail
3884 vld1.8 {$dat3},[$inp],#16
3886 // The iv for fifth block
3887 extr $midnumx,$ivh,$ivh,#32
3888 extr $ivh,$ivh,$ivl,#63
3889 and $tmpmw,$constnum,$midnum,asr #31
3890 eor $ivl,$tmpmx,$ivl,lsl #1
3894 vld1.8 {$dat4},[$inp],#16
3895 veor $dat3,$dat3,$iv3 // the fourth block
3896 veor $dat4,$dat4,$iv4
3897 sub $len,$len,#32 // bias
3898 mov $rounds,$rounds0
3913 vld1.32 {q8},[$key_],#16 // load key schedule...
3914 subs $rounds,$rounds,#2
3925 vld1.32 {q9},[$key_],#16 // load key schedule...
3926 b.gt .Loop5x_xts_dec
3938 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3950 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3963 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3964 // at exit from the loop v1.16b-v26.16b
3965 // are loaded with last "words"
3966 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
4012 veor $tmp0,$rndlast,$iv0
4014 // The iv for first block of next iteration.
4015 extr $midnumx,$ivh,$ivh,#32
4016 extr $ivh,$ivh,$ivl,#63
4017 and $tmpmw,$constnum,$midnum,asr #31
4018 eor $ivl,$tmpmx,$ivl,lsl #1
4021 veor $tmp1,$rndlast,$iv1
4022 vld1.8 {$in0},[$inp],#16
4024 // The iv for second block
4025 extr $midnumx,$ivh,$ivh,#32
4026 extr $ivh,$ivh,$ivl,#63
4027 and $tmpmw,$constnum,$midnum,asr #31
4028 eor $ivl,$tmpmx,$ivl,lsl #1
4031 veor $tmp2,$rndlast,$iv2
4032 vld1.8 {$in1},[$inp],#16
4034 // The iv for third block
4035 extr $midnumx,$ivh,$ivh,#32
4036 extr $ivh,$ivh,$ivl,#63
4037 and $tmpmw,$constnum,$midnum,asr #31
4038 eor $ivl,$tmpmx,$ivl,lsl #1
4041 veor $tmp3,$rndlast,$iv3
4042 vld1.8 {$in2},[$inp],#16
4044 // The iv for fourth block
4045 extr $midnumx,$ivh,$ivh,#32
4046 extr $ivh,$ivh,$ivl,#63
4047 and $tmpmw,$constnum,$midnum,asr #31
4048 eor $ivl,$tmpmx,$ivl,lsl #1
4051 veor $tmp4,$rndlast,$iv4
4052 vld1.8 {$in3},[$inp],#16
4055 // The iv for fifth block
4056 extr $midnumx,$ivh,$ivh,#32
4057 extr $ivh,$ivh,$ivl,#63
4058 and $tmpmw,$constnum,$midnum,asr #31
4059 eor $ivl,$tmpmx,$ivl,lsl #1
4063 vld1.8 {$in4},[$inp],#16
4064 cbz $xoffset,.Lxts_dec_tail4x
4065 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
4066 veor $tmp0,$tmp0,$dat0
4067 veor $dat0,$in0,$iv0
4068 veor $tmp1,$tmp1,$dat1
4069 veor $dat1,$in1,$iv1
4070 veor $tmp2,$tmp2,$dat2
4071 veor $dat2,$in2,$iv2
4072 veor $tmp3,$tmp3,$dat3
4073 veor $dat3,$in3,$iv3
4074 veor $tmp4,$tmp4,$dat4
4075 vst1.8 {$tmp0},[$out],#16
4076 veor $dat4,$in4,$iv4
4077 vst1.8 {$tmp1},[$out],#16
4078 mov $rounds,$rounds0
4079 vst1.8 {$tmp2},[$out],#16
4080 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
4081 vst1.8 {$tmp3},[$out],#16
4082 vst1.8 {$tmp4},[$out],#16
4083 b.hs .Loop5x_xts_dec
4086 b.ne .Loop5x_dec_after
4087 // If x2($len) equal to -0x10, the left blocks is 4.
4088 // After specially processing, utilize the five blocks processing again.
4089 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
4096 veor $dat0,$iv0,$in0
4097 veor $dat1,$iv1,$in1
4098 veor $dat2,$in2,$iv2
4099 veor $dat3,$in3,$iv3
4100 veor $dat4,$in4,$iv4
4101 b.eq .Loop5x_xts_dec
4107 add $rounds,$rounds0,#2
4108 subs $len,$len,#0x30
4109 b.lo .Lxts_inner_dec_tail
4111 veor $dat0,$iv0,$in2
4112 veor $dat1,$iv1,$in3
4113 veor $dat2,$in4,$iv2
4114 b .Lxts_outer_dec_tail
4120 veor $tmp1,$dat1,$tmp0
4121 vst1.8 {$tmp1},[$out],#16
4122 veor $tmp2,$dat2,$tmp2
4123 vst1.8 {$tmp2},[$out],#16
4124 veor $tmp3,$dat3,$tmp3
4125 veor $tmp4,$dat4,$tmp4
4126 vst1.8 {$tmp3-$tmp4},[$out],#32
4128 b.eq .Lxts_dec_abort
4129 vld1.8 {$dat0},[$inp],#16
4132 .Lxts_outer_dec_tail:
4139 vld1.32 {q8},[$key_],#16
4140 subs $rounds,$rounds,#2
4147 vld1.32 {q9},[$key_],#16
4148 b.gt .Lxts_outer_dec_tail
4156 veor $tmp0,$iv0,$rndlast
4157 subs $len,$len,#0x30
4158 // The iv for first block
4162 extr $midnumx,$ivh,$ivh,#32
4163 extr $ivh,$ivh,$ivl,#63
4164 and $tmpmw,$constnum,$midnum,asr #31
4165 eor $ivl,$tmpmx,$ivl,lsl #1
4168 veor $tmp1,$iv1,$rndlast
4169 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
4176 veor $tmp2,$iv2,$rndlast
4177 // The iv for second block
4178 extr $midnumx,$ivh,$ivh,#32
4179 extr $ivh,$ivh,$ivl,#63
4180 and $tmpmw,$constnum,$midnum,asr #31
4181 eor $ivl,$tmpmx,$ivl,lsl #1
4185 add $xoffset,$xoffset,#0x20
4186 add $inp,$inp,$xoffset // $inp is adjusted to the last data
4190 // The iv for third block
4191 extr $midnumx,$ivh,$ivh,#32
4192 extr $ivh,$ivh,$ivl,#63
4193 and $tmpmw,$constnum,$midnum,asr #31
4194 eor $ivl,$tmpmx,$ivl,lsl #1
4216 vld1.8 {$in2},[$inp],#16
4220 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
4221 add $rounds,$rounds0,#2
4222 veor $tmp0,$tmp0,$dat0
4223 veor $tmp1,$tmp1,$dat1
4224 veor $dat2,$dat2,$tmp2
4225 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
4226 vst1.8 {$tmp0},[$out],#16
4227 vst1.8 {$tmp1},[$out],#16
4228 vst1.8 {$dat2},[$out],#16
4238 .Lxts_inner_dec_tail:
4239 // $len == -0x10 means two blocks left.
4241 veor $dat1,$in3,$iv0
4242 veor $dat2,$in4,$iv1
4243 b.eq .Lxts_dec_tail_loop
4244 veor $dat2,$in4,$iv0
4245 .Lxts_dec_tail_loop:
4250 vld1.32 {q8},[$key_],#16
4251 subs $rounds,$rounds,#2
4256 vld1.32 {q9},[$key_],#16
4257 b.gt .Lxts_dec_tail_loop
4276 veor $tmp1,$iv0,$rndlast
4281 veor $tmp2,$iv1,$rndlast
4285 veor $tmp1,$tmp1,$dat1
4286 veor $tmp2,$tmp2,$dat2
4289 vst1.8 {$tmp1},[$out],#16
4290 vst1.8 {$tmp2},[$out],#16
4295 veor $tmp1,$tmp1,$dat2
4298 vst1.8 {$tmp1},[$out],#16
4303 b.eq .Lxts_dec_abort
4304 // Processing the last two blocks with cipher stealing.
4306 cbnz x2,.Lxts_dec_1st_done
4307 vld1.8 {$dat0},[$inp],#16
4309 // Decrypt the last second block to get the last plain text block
4311 eor $tmpin,$dat0,$iv1
4312 ldr $rounds,[$key1,#240]
4313 vld1.32 {$dat0},[$key1],#16
4314 sub $rounds,$rounds,#2
4315 vld1.32 {$dat1},[$key1],#16
4316 .Loop_final_2nd_dec:
4318 aesimc $tmpin,$tmpin
4319 vld1.32 {$dat0},[$key1],#16 // load key schedule...
4320 subs $rounds,$rounds,#2
4322 aesimc $tmpin,$tmpin
4323 vld1.32 {$dat1},[$key1],#16 // load key schedule...
4324 b.gt .Loop_final_2nd_dec
4327 aesimc $tmpin,$tmpin
4328 vld1.32 {$dat0},[$key1]
4330 veor $tmpin,$tmpin,$dat0
4331 veor $tmpin,$tmpin,$iv1
4332 vst1.8 {$tmpin},[$out]
4335 add $tmpoutp,$out,#16
4337 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
4338 // to get the last encrypted block.
4339 .composite_dec_loop:
4340 subs $tailcnt,$tailcnt,#1
4341 ldrb $l2outp,[$out,$tailcnt]
4342 ldrb $loutp,[$tmpinp,$tailcnt]
4343 strb $l2outp,[$tmpoutp,$tailcnt]
4344 strb $loutp,[$out,$tailcnt]
4345 b.gt .composite_dec_loop
4346 .Lxts_dec_load_done:
4347 vld1.8 {$tmpin},[$out]
4348 veor $tmpin,$tmpin,$iv0
4350 // Decrypt the composite block to get the last second plain text block
4351 ldr $rounds,[$key_,#240]
4352 vld1.32 {$dat},[$key_],#16
4353 sub $rounds,$rounds,#2
4354 vld1.32 {$dat1},[$key_],#16
4357 aesimc $tmpin,$tmpin
4358 vld1.32 {$dat0},[$key_],#16 // load key schedule...
4359 subs $rounds,$rounds,#2
4361 aesimc $tmpin,$tmpin
4362 vld1.32 {$dat1},[$key_],#16 // load key schedule...
4363 b.gt .Loop_final_dec
4366 aesimc $tmpin,$tmpin
4367 vld1.32 {$dat0},[$key_]
4369 veor $tmpin,$tmpin,$dat0
4370 veor $tmpin,$tmpin,$iv0
4371 vst1.8 {$tmpin},[$out]
4374 ldp $tailcnt,$midnumx,[sp,#48]
4375 ldp $ivd10,$ivd20,[sp,#32]
4376 ldp $ivd30,$ivd40,[sp,#16]
4377 ldp $constnumx,$tmpinp,[sp],#64
4379 .Lxts_dec_final_abort:
4381 .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
4388 ########################################
# ----------------------------------------------------------------------
# Final translation pass.  $code above is written in a unified style;
# here it is either touched up for a 64-bit assembler or mechanically
# translated to 32-bit ARM/NEON syntax, depending on $flavour.
4389 if ($flavour =~ /64/) { ######## 64-bit code
# Opcode templates for hand-encoding instructions older assemblers may
# not know: the crypto-extension AES ops and the SHA3 EOR3.
4391 "aesd" => 0x4e285800, "aese" => 0x4e284800,
4392 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800,
4393 "eor3" => 0xce000000, );
# Encode a two-operand AES instruction as a raw .inst word:
# template | Rd | (Rn << 5); original text is kept as a comment.
4395 local *unaes = sub {
4396 my ($mnemonic,$arg)=@_;
4398 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
4399 sprintf ".inst\t0x%08x\t//%s %s",
4400 $opcode{$mnemonic}|$1|($2<<5),
# Body of the three-source encoder (used for EOR3 below): packs
# template | Rd | Rn<<5 | Rm<<16 | imm<<10 into a raw .inst word.
4405 my ($mnemonic,$arg)=@_;
4407 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
4409 sprintf ".inst\t0x%08x\t//%s %s",
4410 $opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
# Per-line rewriting of the generated code for the 64-bit flavour.
4414 foreach(split("\n",$code)) {
# Expand backquoted Perl expressions embedded in the assembly text.
4415 s/\`([^\`]*)\`/eval($1)/geo;
# Map legacy q-register names onto v-registers; q8-q23 are shifted up
# to v16-v31 (presumably to keep clear of callee-saved v8-v15 -- TODO
# confirm against the register usage above).
4417 s/\bq([0-9]+)\b/"v".($1<8?$1:($1<24?$1+8:$1-16)).".16b"/geo; # old->new registers
4418 s/\bq_([0-9]+)\b/"q".$1/geo; # old->new registers
4419 s/@\s/\/\//o; # old->new style commentary
# Instruction-level fixups; the "or" chain stops at the first rewrite
# that matches on a given line.
4421 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
4422 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
4423 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
4424 s/vmov\.i8/movi/o or # fix up legacy mnemonics
4426 s/vrev32\.8/rev32/o or
4427 s/vtst\.8/cmtst/o or
4429 s/^(\s+)v/$1/o or # strip off v prefix
4430 s/\bbx\s+lr\b/ret/o;
# eor3 is always hand-encoded via the encoder above so the output
# assembles even without SHA3 support in the toolchain.
4431 s/\b(eor3)\s+(v.*)/unsha3($1,$2)/ge;
4433 # fix up remaining legacy suffixes
4435 m/\],#8/o and s/\.16b/\.8b/go;
4436 s/\.[ui]?32//o and s/\.16b/\.4s/go;
4437 s/\.[ui]?64//o and s/\.16b/\.2d/go;
4438 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
4440 # Switch preprocessor checks to aarch64 versions.
4441 s/__ARME([BL])__/__AARCH64E$1__/go;
4445 } else { ######## 32-bit code
# ARMv7 NEON encodings of the AES ops for assemblers without crypto
# extension support.
4447 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
4448 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
# Emit the AES op as raw bytes through the INST() macro, splitting the
# q-register numbers across the split ARMv7 register fields.
4450 local *unaes = sub {
4451 my ($mnemonic,$arg)=@_;
4453 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
4454 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
4455 |(($2&7)<<1) |(($2&8)<<2);
4456 # since ARMv7 instructions are always encoded little-endian.
4457 # correct solution is to use .inst directive, but older
4458 # assemblers don't implement it:-(
4459 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
4460 $word&0xff,($word>>8)&0xff,
4461 ($word>>16)&0xff,($word>>24)&0xff,
# 128-bit table lookups are not available on 32-bit NEON; split each
# vtbl on a q destination into two d-register lookups.
4469 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
4470 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
4471 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
# Rewrite AArch64 dup-by-element onto the d-register lane form.
4477 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
4478 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
# Rewrite 32-bit element moves into a q lane onto the d-register form.
4484 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
4485 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
# Per-line rewriting for the 32-bit flavour: translate registers and
# syntax back to old-style ARM/NEON.
4488 foreach(split("\n",$code)) {
4489 s/\`([^\`]*)\`/eval($1)/geo;
4491 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
4492 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
4493 s/\/\/\s?/@ /o; # new->old style commentary
4495 # fix up remaining new-style suffixes
4496 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
4499 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
4500 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
4501 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
4502 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
4503 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
4504 s/^(\s+)b\./$1b/o or
4505 s/^(\s+)ret/$1bx\tlr/o;
# A conditional "mov.cc" becomes a predicated ARM "movcc".
4507 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
# Flush and verify stdout so a truncated write fails the build.
4515 close STDOUT or die "error closing STDOUT: $!";