# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
-# registers to 16, which implies additional instructions. This has
-# no effect on mighty Apple A7, as results are literally equal to
-# the theoretical estimates based on instruction latencies and issue
-# rate. It remains to be seen how does it affect other platforms...
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on the mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated for by a dedicated code path for the 128-bit CBC
+# encrypt case. On Cortex-A57, parallelizable-mode performance
+# seems to be limited by the sheer number of NEON instructions...
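+# (CBC decrypt and CTR are the "parallelizable" modes: output blocks
+# don't depend on each other, so several aesd/aese chains can be
+# interleaved to hide instruction latency. CBC encrypt is inherently
+# serial, hence its higher cycles-per-byte figures below.)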
#
# Performance in cycles per byte processed with 128-bit key:
#
-# CBC enc CBC dec
-# Apple A7 2.39 1.20
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A53 2.45 1.87 1.94
+# Cortex-A57 3.64 1.34 1.32
$flavour = shift;
-$prefix="AES";
+open STDOUT,">".shift;
-$code=".text\n";
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
-
-.align 5
-.Lcbc_dec128:
- vld1.32 {$tmp0-$tmp1},[$key_]
- veor $ivec,$ivec,$rndlast
- veor $in0,$dat0,$rndlast
- mov $step1,$step
-
-.Loop2x_cbc_dec128:
- aesd $dat0,q8
- aesd $dat1,q8
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- subs $len,$len,#32
- aesd $dat0,q9
- aesd $dat1,q9
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step,lo
- aesd $dat0,$tmp0
- aesd $dat1,$tmp0
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step1,ls
- aesd $dat0,$tmp1
- aesd $dat1,$tmp1
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q12
- aesd $dat1,q12
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q13
- aesd $dat1,q13
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q14
- aesd $dat1,q14
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q15
- aesd $dat1,q15
-
- veor $ivec,$ivec,$dat0
- veor $in0,$in0,$dat1
- vld1.8 {$dat0},[$inp],$step
- vld1.8 {$dat1},[$inp],$step1
- vst1.8 {$ivec},[$out],#16
- veor $ivec,$in1,$rndlast
- vst1.8 {$in0},[$out],#16
- veor $in0,$dat0,$rndlast
- vorr $in1,$dat1,$dat1
- b.hs .Loop2x_cbc_dec128
-
- adds $len,$len,#32
- veor $ivec,$ivec,$rndlast
- b.eq .Lcbc_done
- veor $in0,$in0,$rndlast
- b .Lcbc_dec_tail
-
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
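+# q10 and q11 serve as round-key registers on the other code paths; in
+# this 3x path they carry data ($dat2/$in2), and q9 doubles as scratch
+# ($tmp2) once its round key has been consumed. The two displaced rounds
+# are covered by biasing the round counter: add $cnt,$rounds,#2 below.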
+$code.=<<___;
.align 5
.Lcbc_dec:
- subs $len,$len,#16
- vorr $in0,$dat,$dat
+ vld1.8 {$dat2},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in2,$dat2,$dat2
b.lo .Lcbc_dec_tail
- cclr $step,eq
- cmp $rounds,#2
- vld1.8 {$dat1},[$inp],$step
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
- b.eq .Lcbc_dec128
+ vorr $in2,$dat2,$dat2
-.Loop2x_cbc_dec:
+.Loop3x_cbc_dec:
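+	// decrypt three blocks at a time; the interleaved aesd/aesimc
+	// chains are independent, which hides instruction latency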
aesd $dat0,q8
aesd $dat1,q8
+ aesd $dat2,q8
vld1.32 {q8},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
subs $cnt,$cnt,#2
aesd $dat0,q9
aesd $dat1,q9
+ aesd $dat2,q9
vld1.32 {q9},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- b.gt .Loop2x_cbc_dec
+ aesimc $dat2,$dat2
+ b.gt .Loop3x_cbc_dec
aesd $dat0,q8
aesd $dat1,q8
+ aesd $dat2,q8
+ veor $tmp0,$ivec,$rndlast
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- veor $tmp0,$ivec,$rndlast
+ aesimc $dat2,$dat2
veor $tmp1,$in0,$rndlast
aesd $dat0,q9
aesd $dat1,q9
+ aesd $dat2,q9
+ veor $tmp2,$in1,$rndlast
+ subs $len,$len,#0x30
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vorr $ivec,$in1,$in1
- subs $len,$len,#32
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- cclr $step,lo
- aesimc $dat1,$dat1
- mov $key_,$key
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- vld1.8 {$in0},[$inp],$step
- aesimc $dat1,$dat1
- cclr $step,ls
+ aesimc $dat2,$dat2
+ vorr $ivec,$in2,$in2
+ mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q12
aesd $dat1,q12
+ aesd $dat2,q12
+	add	$inp,$inp,x6		// $inp is adjusted in such a way that
+					// at exit from the loop $dat1-$dat2
+					// are loaded with the last "words"
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.8 {$in1},[$inp],$step
+ aesimc $dat2,$dat2
+ mov $key_,$key
aesd $dat0,q13
aesd $dat1,q13
+ aesd $dat2,q13
+ vld1.8 {$in0},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesimc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
aesd $dat1,q14
+ aesd $dat2,q14
+ vld1.8 {$in2},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
aesd $dat0,q15
aesd $dat1,q15
+ aesd $dat2,q15
- mov $cnt,$rounds
+ add $cnt,$rounds,#2
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vorr $dat0,$in0,$in0
- vst1.8 {$tmp0},[$out],#32
+ vst1.8 {$tmp0},[$out],#16
vorr $dat1,$in1,$in1
- vst1.8 {$tmp1},[$out],#32
- b.hs .Loop2x_cbc_dec
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_cbc_dec
- adds $len,$len,#32
+ cmn $len,#0x30
b.eq .Lcbc_done
+ nop
.Lcbc_dec_tail:
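+	// last one or two blocks; both lanes are decrypted, and when
+	// only a single block remains just the $dat2 result is stored
+	// (.Lcbc_dec_one)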
- aesd $dat,q8
+ aesd $dat1,q8
+ aesd $dat2,q8
vld1.32 {q8},[$key_],#16
- aesimc $dat,$dat
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
subs $cnt,$cnt,#2
- aesd $dat,q9
+ aesd $dat1,q9
+ aesd $dat2,q9
vld1.32 {q9},[$key_],#16
- aesimc $dat,$dat
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
b.gt .Lcbc_dec_tail
- aesd $dat,q8
- aesimc $dat,$dat
- aesd $dat,q9
- aesimc $dat,$dat
- veor $tmp,$ivec,$rndlast
- aesd $dat,q10
- aesimc $dat,$dat
- vorr $ivec,$in0,$in0
- aesd $dat,q11
- aesimc $dat,$dat
- aesd $dat,q12
- aesimc $dat,$dat
- aesd $dat,q13
- aesimc $dat,$dat
- aesd $dat,q14
- aesimc $dat,$dat
- aesd $dat,q15
-
- veor $tmp,$tmp,$dat
- vst1.8 {$tmp},[$out],#16
+ aesd $dat1,q8
+ aesd $dat2,q8
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesd $dat2,q9
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesd $dat2,q12
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesd $dat2,q13
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ veor $tmp1,$ivec,$rndlast
+ aesd $dat1,q14
+ aesd $dat2,q14
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lcbc_dec_one
+ veor $tmp1,$tmp1,$dat1
+ veor $tmp2,$tmp2,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
.Lcbc_done:
vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
+}
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12"; # aliases with $tctr2
+
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#4
+ mov $step,#16
+ cmp $len,#2
+ add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+ add $key_,$key,#32
+ mov $cnt,$rounds
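+	// q12-q15 and $rndlast hold the final five round keys for the
+	// whole call; the remaining keys stream through q8/q9 in the loop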
+ cclr $step,lo
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
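+	// only the low 32 bits of the counter block (lane 3) are ever
+	// incremented, per the ctr32 contract; they live big-endian in
+	// the block, so rev converts them to native order for the adds
+	// and back again before each vmov.32 insertion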
+ vorr $dat1,$dat0,$dat0
+ add $tctr1, $ctr, #1
+ vorr $dat2,$dat0,$dat0
+ add $ctr, $ctr, #2
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $tctr1
+ vmov.32 ${dat1}[3],$tctr1
+ b.ls .Lctr32_tail
+ rev $tctr2, $ctr
+ sub $len,$len,#3 // bias
+ vmov.32 ${dat2}[3],$tctr2
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
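+	// encrypt three counter blocks at a time; as in CBC decrypt,
+	// the three aese/aesmc chains are independent and hide latency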
+ aese $dat0,q8
+ aese $dat1,q8
+ aese $dat2,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aesmc $dat2,$dat2
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ aese $dat2,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aesmc $dat2,$dat2
+ b.gt .Loop3x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aese $dat2,q8
+ mov $key_,$key
+ aesmc $tmp0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp1,$dat1
+ aesmc $dat2,$dat2
+ vorr $dat0,$ivec,$ivec
+ aese $tmp0,q9
+ vld1.8 {$in1},[$inp],#16
+ aese $tmp1,q9
+ aese $dat2,q9
+ vorr $dat1,$ivec,$ivec
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in2},[$inp],#16
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$dat2
+ vorr $dat2,$ivec,$ivec
+ add $tctr0,$ctr,#1
+ aese $tmp0,q12
+ aese $tmp1,q12
+ aese $tmp2,q12
+ veor $in0,$in0,$rndlast
+ add $tctr1,$ctr,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ veor $in1,$in1,$rndlast
+ add $ctr,$ctr,#3
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aese $tmp2,q13
+ veor $in2,$in2,$rndlast
+ rev $tctr0,$tctr0
+ aesmc $tmp0,$tmp0
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat0}[3], $tctr0
+ rev $tctr1,$tctr1
+ aese $tmp0,q14
+ aese $tmp1,q14
+ aese $tmp2,q14
+ vmov.32 ${dat1}[3], $tctr1
+ rev $tctr2,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat2}[3], $tctr2
+ subs $len,$len,#3
+ aese $tmp0,q15
+ aese $tmp1,q15
+ aese $tmp2,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ veor $in2,$in2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ vst1.8 {$in2},[$out],#16
+ b.hs .Loop3x_ctr32
+
+ adds $len,$len,#3
+ b.eq .Lctr32_done
+ cmp $len,#1
+ mov $step,#16
+ cclr $step,eq
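+	// one or two blocks left; with one, $step is zero, so both
+	// lanes load the same block and only the first result is stored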
+
+.Lctr32_tail:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Lctr32_tail
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q9
+ aese $dat1,q9
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ vld1.8 {$in0},[$inp],$step
+ aese $dat0,q12
+ aese $dat1,q12
+ vld1.8 {$in1},[$inp]
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ veor $in0,$in0,$rndlast
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in1,$in1,$rndlast
+ aese $dat0,q15
+ aese $dat1,q15
+
+ cmp $len,#1
+ veor $in0,$in0,$dat0
+ veor $in1,$in1,$dat1
+ vst1.8 {$in0},[$out],#16
+ b.eq .Lctr32_done
+ vst1.8 {$in1},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
- sub unaes {
+ local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
- sprintf ".long\t0x%08x\t//%s %s",
+ sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
- }
+ };
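+    # .inst, unlike .long, marks the emitted word as an instruction,
+    # so it is byte-ordered correctly on big-endian targets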
foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
+ s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
- s/@\s/\/\//o; # old->new style commentary
+ s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
- s/vmov\.i8/movi/o or # fix up legacy mnemonics
- s/vext\.8/ext/o or
- s/vrev32\.8/rev32/o or
- s/vtst\.8/cmtst/o or
- s/vshr/ushr/o or
- s/^(\s+)v/$1/o or # strip off v prefix
+ s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
    # fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
- s/\.[ui]?32//o and s/\.16b/\.4s/go;
- s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
- print $_,"\n";
+ print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
- sub unaes {
+ local *unaes = sub {
my ($mnemonic,$arg)=@_;
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
- sprintf ".long\t0x%08x\t@ %s %s",
- $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<1) |(($2&8)<<2),
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+	    # ARMv7 instructions are always encoded little-endian, so
+	    # emit the raw bytes here. The correct solution would be the
+	    # .inst directive, but older assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
- }
+ }
+ };
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
- sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
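+    # e.g. "vtbl.8 q0,{q8},q1" becomes
+    #      "vtbl.8 d0,{q8},d2" + "vtbl.8 d1,{q8},d3"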
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
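+    # e.g. "vdup.32 q0,q1[3]" becomes "vdup.32 q0,d3[1]"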
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
+ s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
- s/\/\/\s?/@ /o; # new->old style commentary
+ s/\/\/\s?/@ /o; # new->old style commentary
    # fix up remaining new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
+ s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
- print $_,"\n";
+ print $_,"\n";
}
}