2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in the sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. The latter is achieved by limiting the number of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelizable modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. On the downside,
36 # we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Capture the directory portion of this script's own path ($0): everything up
# to and including the last '/' or '\'. The translator is looked up relative
# to that directory.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
# Look for arm-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it; give up if neither candidate is a plain file.
65 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67 die "can't locate arm-xlate.pl";
# Open a write pipe to the translator, run under the same perl interpreter
# ($^X), with the flavour and the output file name as its arguments.
# NOTE(review): the interpreter path and $output are wrapped in double quotes
# for the shell, but $xlate and $flavour are interpolated bare -- a script
# directory containing whitespace would presumably split the command line;
# confirm whether that case needs to be supported.
69 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
214 vst1.32 {$in1},[$out],#8
219 vext.8 $tmp,$zero,$tmp,#12
221 vext.8 $tmp,$zero,$tmp,#12
224 vdup.32 $tmp,${in0}[3]
227 vext.8 $in1,$zero,$in1,#12
228 vshl.u8 $rcon,$rcon,#1
232 vst1.32 {$in0},[$out],#16
244 vst1.32 {$in0},[$out],#16
247 vtbl.8 $key,{$in1},$mask
248 vext.8 $tmp,$zero,$in0,#12
249 vst1.32 {$in1},[$out],#16
254 vext.8 $tmp,$zero,$tmp,#12
256 vext.8 $tmp,$zero,$tmp,#12
259 vshl.u8 $rcon,$rcon,#1
261 vst1.32 {$in0},[$out],#16
264 vdup.32 $key,${in0}[3] // just splat
265 vext.8 $tmp,$zero,$in1,#12
269 vext.8 $tmp,$zero,$tmp,#12
271 vext.8 $tmp,$zero,$tmp,#12
282 mov x0,$ptr // return value
283 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
285 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
287 .globl ${prefix}_set_decrypt_key
288 .type ${prefix}_set_decrypt_key,%function
290 ${prefix}_set_decrypt_key:
292 $code.=<<___ if ($flavour =~ /64/);
293 .inst 0xd503233f // paciasp
294 stp x29,x30,[sp,#-16]!
297 $code.=<<___ if ($flavour !~ /64/);
306 sub $out,$out,#240 // restore original $out
308 add $inp,$out,x12,lsl#4 // end of key schedule
310 vld1.32 {v0.16b},[$out]
311 vld1.32 {v1.16b},[$inp]
312 vst1.32 {v0.16b},[$inp],x4
313 vst1.32 {v1.16b},[$out],#16
316 vld1.32 {v0.16b},[$out]
317 vld1.32 {v1.16b},[$inp]
320 vst1.32 {v0.16b},[$inp],x4
321 vst1.32 {v1.16b},[$out],#16
325 vld1.32 {v0.16b},[$out]
327 vst1.32 {v0.16b},[$inp]
329 eor x0,x0,x0 // return value
332 $code.=<<___ if ($flavour !~ /64/);
335 $code.=<<___ if ($flavour =~ /64/);
337 .inst 0xd50323bf // autiasp
341 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
347 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
348 my ($inp,$out,$key)=map("x$_",(0..2));
350 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
353 .globl ${prefix}_${dir}crypt
354 .type ${prefix}_${dir}crypt,%function
356 ${prefix}_${dir}crypt:
357 ldr $rounds,[$key,#240]
358 vld1.32 {$rndkey0},[$key],#16
359 vld1.8 {$inout},[$inp]
360 sub $rounds,$rounds,#2
361 vld1.32 {$rndkey1},[$key],#16
364 aes$e $inout,$rndkey0
366 vld1.32 {$rndkey0},[$key],#16
367 subs $rounds,$rounds,#2
368 aes$e $inout,$rndkey1
370 vld1.32 {$rndkey1},[$key],#16
373 aes$e $inout,$rndkey0
375 vld1.32 {$rndkey0},[$key]
376 aes$e $inout,$rndkey1
377 veor $inout,$inout,$rndkey0
379 vst1.8 {$inout},[$out]
381 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
388 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
389 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
390 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
392 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
393 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
395 ### q8-q15 preloaded key schedule
398 .globl ${prefix}_cbc_encrypt
399 .type ${prefix}_cbc_encrypt,%function
401 ${prefix}_cbc_encrypt:
403 $code.=<<___ if ($flavour =~ /64/);
404 stp x29,x30,[sp,#-16]!
407 $code.=<<___ if ($flavour !~ /64/);
410 vstmdb sp!,{d8-d15} @ ABI specification says so
411 ldmia ip,{r4-r5} @ load remaining args
419 cmp $enc,#0 // en- or decrypting?
420 ldr $rounds,[$key,#240]
422 vld1.8 {$ivec},[$ivp]
423 vld1.8 {$dat},[$inp],$step
425 vld1.32 {q8-q9},[$key] // load key schedule...
426 sub $rounds,$rounds,#6
427 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
428 sub $rounds,$rounds,#2
429 vld1.32 {q10-q11},[$key_],#32
430 vld1.32 {q12-q13},[$key_],#32
431 vld1.32 {q14-q15},[$key_],#32
432 vld1.32 {$rndlast},[$key_]
440 veor $rndzero_n_last,q8,$rndlast
443 vld1.32 {$in0-$in1},[$key_]
457 vst1.8 {$ivec},[$out],#16
489 vld1.8 {q8},[$inp],$step
492 veor q8,q8,$rndzero_n_last
495 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
499 veor $ivec,$dat,$rndlast
502 vst1.8 {$ivec},[$out],#16
507 vld1.32 {$in0-$in1},[$key_]
514 vst1.8 {$ivec},[$out],#16
528 vld1.8 {q8},[$inp],$step
535 veor q8,q8,$rndzero_n_last
537 veor $ivec,$dat,$rndlast
538 b.hs .Loop_cbc_enc128
540 vst1.8 {$ivec},[$out],#16
544 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
546 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
547 my ($dat4,$in4,$tmp4);
548 if ($flavour =~ /64/) {
549 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
555 vld1.8 {$dat2},[$inp],#16
556 subs $len,$len,#32 // bias
560 vorr $in2,$dat2,$dat2
563 vorr $dat1,$dat2,$dat2
564 vld1.8 {$dat2},[$inp],#16
566 vorr $in1,$dat1,$dat1
567 vorr $in2,$dat2,$dat2
569 $code.=<<___ if ($flavour =~ /64/);
573 vld1.8 {$dat3},[$inp],#16
574 vld1.8 {$dat4},[$inp],#16
575 sub $len,$len,#32 // bias
577 vorr $in3,$dat3,$dat3
578 vorr $in4,$dat4,$dat4
591 vld1.32 {q8},[$key_],#16
603 vld1.32 {q9},[$key_],#16
616 cmp $len,#0x40 // because .Lcbc_tail4x
629 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
642 add $inp,$inp,x6 // $inp is adjusted in such way that
643 // at exit from the loop $dat1-$dat4
644 // are loaded with last "words"
645 add x6,$len,#0x60 // because .Lcbc_tail4x
691 veor $tmp0,$ivec,$rndlast
693 veor $tmp1,$in0,$rndlast
694 vld1.8 {$in0},[$inp],#16
696 veor $tmp2,$in1,$rndlast
697 vld1.8 {$in1},[$inp],#16
699 veor $tmp3,$in2,$rndlast
700 vld1.8 {$in2},[$inp],#16
702 veor $tmp4,$in3,$rndlast
703 vld1.8 {$in3},[$inp],#16
706 vld1.8 {$in4},[$inp],#16
708 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
709 veor $tmp0,$tmp0,$dat0
711 veor $tmp1,$tmp1,$dat1
713 veor $tmp2,$tmp2,$dat2
715 veor $tmp3,$tmp3,$dat3
717 veor $tmp4,$tmp4,$dat4
718 vst1.8 {$tmp0},[$out],#16
720 vst1.8 {$tmp1},[$out],#16
722 vst1.8 {$tmp2},[$out],#16
723 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
724 vst1.8 {$tmp3},[$out],#16
725 vst1.8 {$tmp4},[$out],#16
745 veor $tmp1,$tmp0,$dat1
746 veor $tmp2,$tmp2,$dat2
747 veor $tmp3,$tmp3,$dat3
748 veor $tmp4,$tmp4,$dat4
749 vst1.8 {$tmp1},[$out],#16
750 vst1.8 {$tmp2},[$out],#16
751 vst1.8 {$tmp3},[$out],#16
752 vst1.8 {$tmp4},[$out],#16
765 vld1.32 {q8},[$key_],#16
773 vld1.32 {q9},[$key_],#16
782 veor $tmp0,$ivec,$rndlast
784 veor $tmp1,$in0,$rndlast
785 mov.lo x6,$len // x6, $cnt, is zero at this point
792 veor $tmp2,$in1,$rndlast
793 add $inp,$inp,x6 // $inp is adjusted in such way that
794 // at exit from the loop $dat1-$dat2
795 // are loaded with last "words"
804 vld1.8 {$in0},[$inp],#16
811 vld1.8 {$in1},[$inp],#16
818 vld1.8 {$in2},[$inp],#16
822 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
824 veor $tmp0,$tmp0,$dat0
825 veor $tmp1,$tmp1,$dat1
826 veor $dat2,$dat2,$tmp2
827 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
828 vst1.8 {$tmp0},[$out],#16
830 vst1.8 {$tmp1},[$out],#16
832 vst1.8 {$dat2},[$out],#16
845 vld1.32 {q8},[$key_],#16
851 vld1.32 {q9},[$key_],#16
871 veor $tmp1,$ivec,$rndlast
876 veor $tmp2,$in1,$rndlast
880 veor $tmp1,$tmp1,$dat1
881 veor $tmp2,$tmp2,$dat2
883 vst1.8 {$tmp1},[$out],#16
884 vst1.8 {$tmp2},[$out],#16
888 veor $tmp1,$tmp1,$dat2
890 vst1.8 {$tmp1},[$out],#16
893 vst1.8 {$ivec},[$ivp]
897 $code.=<<___ if ($flavour !~ /64/);
901 $code.=<<___ if ($flavour =~ /64/);
906 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
910 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
911 my ($rounds,$cnt,$key_)=("w5","w6","x7");
912 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
913 my $step="x12"; # aliases with $tctr2
915 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
916 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
918 # used only in 64-bit mode...
919 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
921 my ($dat,$tmp)=($dat0,$tmp0);
923 ### q8-q15 preloaded key schedule
926 .globl ${prefix}_ctr32_encrypt_blocks
927 .type ${prefix}_ctr32_encrypt_blocks,%function
929 ${prefix}_ctr32_encrypt_blocks:
931 $code.=<<___ if ($flavour =~ /64/);
932 stp x29,x30,[sp,#-16]!
935 $code.=<<___ if ($flavour !~ /64/);
937 stmdb sp!,{r4-r10,lr}
938 vstmdb sp!,{d8-d15} @ ABI specification says so
939 ldr r4, [ip] @ load remaining arg
942 ldr $rounds,[$key,#240]
944 ldr $ctr, [$ivp, #12]
945 vld1.32 {$dat0},[$ivp]
947 vld1.32 {q8-q9},[$key] // load key schedule...
948 sub $rounds,$rounds,#4
951 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
952 sub $rounds,$rounds,#2
953 vld1.32 {q12-q13},[$key_],#32
954 vld1.32 {q14-q15},[$key_],#32
955 vld1.32 {$rndlast},[$key_]
962 vorr $dat1,$dat0,$dat0
964 vorr $dat2,$dat0,$dat0
966 vorr $ivec,$dat0,$dat0
968 vmov.32 ${dat1}[3],$tctr1
971 sub $len,$len,#3 // bias
972 vmov.32 ${dat2}[3],$tctr2
974 $code.=<<___ if ($flavour =~ /64/);
980 vorr $dat3,$dat0,$dat0
982 vorr $dat4,$dat0,$dat0
984 vmov.32 ${dat3}[3],w13
985 sub $len,$len,#2 // bias
986 vmov.32 ${dat4}[3],w14
1002 vld1.32 {q8},[$key_],#16
1014 vld1.32 {q9},[$key_],#16
1028 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1040 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1076 vld1.8 {$in0},[$inp],#16
1079 vld1.8 {$in1},[$inp],#16
1082 vld1.8 {$in2},[$inp],#16
1085 vld1.8 {$in3},[$inp],#16
1088 vld1.8 {$in4},[$inp],#16
1091 veor $in0,$in0,$rndlast
1093 veor $in1,$in1,$rndlast
1095 veor $in2,$in2,$rndlast
1097 veor $in3,$in3,$rndlast
1099 veor $in4,$in4,$rndlast
1101 veor $in0,$in0,$dat0
1102 vorr $dat0,$ivec,$ivec
1103 veor $in1,$in1,$dat1
1104 vorr $dat1,$ivec,$ivec
1105 veor $in2,$in2,$dat2
1106 vorr $dat2,$ivec,$ivec
1107 veor $in3,$in3,$dat3
1108 vorr $dat3,$ivec,$ivec
1109 veor $in4,$in4,$dat4
1110 vorr $dat4,$ivec,$ivec
1112 vst1.8 {$in0},[$out],#16
1113 vmov.32 ${dat0}[3],$tctr0
1114 vst1.8 {$in1},[$out],#16
1115 vmov.32 ${dat1}[3],$tctr1
1116 vst1.8 {$in2},[$out],#16
1117 vmov.32 ${dat2}[3],$tctr2
1118 vst1.8 {$in3},[$out],#16
1119 vmov.32 ${dat3}[3],w13
1120 vst1.8 {$in4},[$out],#16
1121 vmov.32 ${dat4}[3],w14
1124 cbz $len,.Lctr32_done
1138 sub $len,$len,#3 // bias
1152 vld1.32 {q8},[$key_],#16
1160 vld1.32 {q9},[$key_],#16
1167 vld1.8 {$in0},[$inp],#16
1168 vorr $dat0,$ivec,$ivec
1171 vld1.8 {$in1},[$inp],#16
1172 vorr $dat1,$ivec,$ivec
1177 vld1.8 {$in2},[$inp],#16
1181 vorr $dat2,$ivec,$ivec
1187 veor $in0,$in0,$rndlast
1191 veor $in1,$in1,$rndlast
1197 veor $in2,$in2,$rndlast
1201 vmov.32 ${dat0}[3], $tctr0
1207 vmov.32 ${dat1}[3], $tctr1
1211 vmov.32 ${dat2}[3], $tctr2
1217 veor $in0,$in0,$tmp0
1218 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1219 vst1.8 {$in0},[$out],#16
1220 veor $in1,$in1,$tmp1
1222 vst1.8 {$in1},[$out],#16
1223 veor $in2,$in2,$tmp2
1224 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1225 vst1.8 {$in2},[$out],#16
1239 vld1.32 {q8},[$key_],#16
1245 vld1.32 {q9},[$key_],#16
1256 vld1.8 {$in0},[$inp],$step
1261 vld1.8 {$in1},[$inp]
1266 veor $in0,$in0,$rndlast
1271 veor $in1,$in1,$rndlast
1276 veor $in0,$in0,$dat0
1277 veor $in1,$in1,$dat1
1278 vst1.8 {$in0},[$out],#16
1280 vst1.8 {$in1},[$out]
1284 $code.=<<___ if ($flavour !~ /64/);
1286 ldmia sp!,{r4-r10,pc}
1288 $code.=<<___ if ($flavour =~ /64/);
1293 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1299 ########################################
1300 if ($flavour =~ /64/) { ######## 64-bit code
1302 "aesd" => 0x4e285800, "aese" => 0x4e284800,
1303 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
1305 local *unaes = sub {
1306 my ($mnemonic,$arg)=@_;
1308 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
1309 sprintf ".inst\t0x%08x\t//%s %s",
1310 $opcode{$mnemonic}|$1|($2<<5),
1314 foreach(split("\n",$code)) {
1315 s/\`([^\`]*)\`/eval($1)/geo;
1317 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
1318 s/@\s/\/\//o; # old->new style commentary
1320 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
1321 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
1322 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
1323 s/vmov\.i8/movi/o or # fix up legacy mnemonics
1325 s/vrev32\.8/rev32/o or
1326 s/vtst\.8/cmtst/o or
1328 s/^(\s+)v/$1/o or # strip off v prefix
1329 s/\bbx\s+lr\b/ret/o;
1331 # fix up remaining legacy suffixes
1333 m/\],#8/o and s/\.16b/\.8b/go;
1334 s/\.[ui]?32//o and s/\.16b/\.4s/go;
1335 s/\.[ui]?64//o and s/\.16b/\.2d/go;
1336 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
1340 } else { ######## 32-bit code
1342 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
1343 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
1345 local *unaes = sub {
1346 my ($mnemonic,$arg)=@_;
1348 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
1349 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
1350 |(($2&7)<<1) |(($2&8)<<2);
1351 # since ARMv7 instructions are always encoded little-endian.
1352 # correct solution is to use .inst directive, but older
1353 # assemblers don't implement it:-(
1354 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
1355 $word&0xff,($word>>8)&0xff,
1356 ($word>>16)&0xff,($word>>24)&0xff,
1364 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
1365 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
1366 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
1372 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
1373 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
1379 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
1380 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
1383 foreach(split("\n",$code)) {
1384 s/\`([^\`]*)\`/eval($1)/geo;
1386 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
1387 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
1388 s/\/\/\s?/@ /o; # new->old style commentary
1390 # fix up remaining new-style suffixes
1391 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
1394 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
1395 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
1396 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
1397 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
1398 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
1399 s/^(\s+)b\./$1b/o or
1400 s/^(\s+)ret/$1bx\tlr/o;
1402 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {