X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Faes%2Fasm%2Faesv8-armx.pl;h=87fe446b30b16df8ed573d6bd842491510b1d5ee;hp=c6d489dd2af5421f05f684b8036ca32b30c9698d;hb=015364baf3328b93dbed2613e59170715a2a11a6;hpb=a0a17fcb75d8de7f650c8b4ae30d85a59563ca22 diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl index c6d489dd2a..87fe446b30 100755 --- a/crypto/aes/asm/aesv8-armx.pl +++ b/crypto/aes/asm/aesv8-armx.pl @@ -11,20 +11,33 @@ # module is endian-agnostic in sense that it supports both big- and # little-endian cases. As does it support both 32- and 64-bit modes # of operation. Latter is achieved by limiting amount of utilized -# registers to 16, which implies additional instructions. This has -# no effect on mighty Apple A7, as results are literally equal to -# the theoretical estimates based on instruction latencies and issue -# rate. It remains to be seen how does it affect other platforms... +# registers to 16, which implies additional NEON load and integer +# instructions. This has no effect on mighty Apple A7, where results +# are literally equal to the theoretical estimates based on AES +# instruction latencies and issue rates. On Cortex-A53, an in-order +# execution core, this costs up to 10-15%, which is partially +# compensated by implementing dedicated code path for 128-bit +# CBC encrypt case. On Cortex-A57 parallelizable mode performance +# seems to be limited by sheer amount of NEON instructions... # # Performance in cycles per byte processed with 128-bit key: # -# CBC enc CBC dec -# Apple A7 2.39 1.20 +# CBC enc CBC dec CTR +# Apple A7 2.39 1.20 1.20 +# Cortex-A53 2.45 1.87 1.94 +# Cortex-A57 3.64 1.34 1.32 $flavour = shift; -$prefix="AES"; +open STDOUT,">".shift; -$code=".text\n"; +$prefix="aes_v8"; + +$code=<<___; +#include "arm_arch.h" + +#if __ARM_ARCH__>=7 +.text +___ $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/); $code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/); @@ -427,189 +440,166 @@ $code.=<<___; vst1.8 {$ivec},[$out],#16 b .Lcbc_done - -.align 5 -.Lcbc_dec128: - vld1.32 {$tmp0-$tmp1},[$key_] - veor $ivec,$ivec,$rndlast - veor $in0,$dat0,$rndlast - mov $step1,$step - -.Loop2x_cbc_dec128: - aesd $dat0,q8 - aesd $dat1,q8 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - subs $len,$len,#32 - aesd $dat0,q9 - aesd $dat1,q9 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - cclr $step,lo - aesd $dat0,$tmp0 - aesd $dat1,$tmp0 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - cclr $step1,ls - aesd $dat0,$tmp1 - aesd $dat1,$tmp1 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q10 - aesd $dat1,q10 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q11 - aesd $dat1,q11 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q12 - aesd $dat1,q12 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q13 - aesd $dat1,q13 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q14 - aesd $dat1,q14 - aesimc $dat0,$dat0 - aesimc $dat1,$dat1 - aesd $dat0,q15 - aesd $dat1,q15 - - veor $ivec,$ivec,$dat0 - veor $in0,$in0,$dat1 - vld1.8 {$dat0},[$inp],$step - vld1.8 {$dat1},[$inp],$step1 - vst1.8 {$ivec},[$out],#16 - veor $ivec,$in1,$rndlast - vst1.8 {$in0},[$out],#16 - veor $in0,$dat0,$rndlast - vorr $in1,$dat1,$dat1 - b.hs .Loop2x_cbc_dec128 - - adds $len,$len,#32 - veor $ivec,$ivec,$rndlast - b.eq .Lcbc_done - veor $in0,$in0,$rndlast - b .Lcbc_dec_tail - +___ +{ +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); +$code.=<<___; .align 5 .Lcbc_dec: - subs $len,$len,#16 - vorr $in0,$dat,$dat + vld1.8 {$dat2},[$inp],#16 + subs 
$len,$len,#32 // bias + add $cnt,$rounds,#2 + vorr $in1,$dat,$dat + vorr $dat1,$dat,$dat + vorr $in2,$dat2,$dat2 b.lo .Lcbc_dec_tail - cclr $step,eq - cmp $rounds,#2 - vld1.8 {$dat1},[$inp],$step + vorr $dat1,$dat2,$dat2 + vld1.8 {$dat2},[$inp],#16 + vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 - b.eq .Lcbc_dec128 + vorr $in2,$dat2,$dat2 -.Loop2x_cbc_dec: +.Loop3x_cbc_dec: aesd $dat0,q8 aesd $dat1,q8 + aesd $dat2,q8 vld1.32 {q8},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 + aesimc $dat2,$dat2 subs $cnt,$cnt,#2 aesd $dat0,q9 aesd $dat1,q9 + aesd $dat2,q9 vld1.32 {q9},[$key_],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 - b.gt .Loop2x_cbc_dec + aesimc $dat2,$dat2 + b.gt .Loop3x_cbc_dec aesd $dat0,q8 aesd $dat1,q8 + aesd $dat2,q8 + veor $tmp0,$ivec,$rndlast aesimc $dat0,$dat0 aesimc $dat1,$dat1 - veor $tmp0,$ivec,$rndlast + aesimc $dat2,$dat2 veor $tmp1,$in0,$rndlast aesd $dat0,q9 aesd $dat1,q9 + aesd $dat2,q9 + veor $tmp2,$in1,$rndlast + subs $len,$len,#0x30 aesimc $dat0,$dat0 aesimc $dat1,$dat1 - vorr $ivec,$in1,$in1 - subs $len,$len,#32 - aesd $dat0,q10 - aesd $dat1,q10 - aesimc $dat0,$dat0 - cclr $step,lo - aesimc $dat1,$dat1 - mov $key_,$key - aesd $dat0,q11 - aesd $dat1,q11 - aesimc $dat0,$dat0 - vld1.8 {$in0},[$inp],$step - aesimc $dat1,$dat1 - cclr $step,ls + aesimc $dat2,$dat2 + vorr $ivec,$in2,$in2 + mov.lo x6,$len // x6, $cnt, is zero at this point aesd $dat0,q12 aesd $dat1,q12 + aesd $dat2,q12 + add $inp,$inp,x6 // $inp is adjusted in such way that + // at exit from the loop $dat1-$dat2 + // are loaded with last "words" aesimc $dat0,$dat0 aesimc $dat1,$dat1 - vld1.8 {$in1},[$inp],$step + aesimc $dat2,$dat2 + mov $key_,$key aesd $dat0,q13 aesd $dat1,q13 + aesd $dat2,q13 + vld1.8 {$in0},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 - vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesimc $dat2,$dat2 + vld1.8 {$in1},[$inp],#16 aesd $dat0,q14 aesd $dat1,q14 + aesd $dat2,q14 + vld1.8 {$in2},[$inp],#16 aesimc $dat0,$dat0 aesimc $dat1,$dat1 - vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + aesimc $dat2,$dat2 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] aesd $dat0,q15 aesd $dat1,q15 + aesd $dat2,q15 - mov $cnt,$rounds + add $cnt,$rounds,#2 veor $tmp0,$tmp0,$dat0 veor $tmp1,$tmp1,$dat1 + veor $dat2,$dat2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] vorr $dat0,$in0,$in0 - vst1.8 {$tmp0},[$out],#32 + vst1.8 {$tmp0},[$out],#16 vorr $dat1,$in1,$in1 - vst1.8 {$tmp1},[$out],#32 - b.hs .Loop2x_cbc_dec + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$dat2},[$out],#16 + vorr $dat2,$in2,$in2 + b.hs .Loop3x_cbc_dec - adds $len,$len,#32 + cmn $len,#0x30 b.eq .Lcbc_done + nop .Lcbc_dec_tail: - aesd $dat,q8 + aesd $dat1,q8 + aesd $dat2,q8 vld1.32 {q8},[$key_],#16 - aesimc $dat,$dat + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 subs $cnt,$cnt,#2 - aesd $dat,q9 + aesd $dat1,q9 + aesd $dat2,q9 vld1.32 {q9},[$key_],#16 - aesimc $dat,$dat + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 b.gt .Lcbc_dec_tail - aesd $dat,q8 - aesimc $dat,$dat - aesd $dat,q9 - aesimc $dat,$dat - veor $tmp,$ivec,$rndlast - aesd $dat,q10 - aesimc $dat,$dat - vorr $ivec,$in0,$in0 - aesd $dat,q11 - aesimc $dat,$dat - aesd $dat,q12 - aesimc $dat,$dat - aesd $dat,q13 - aesimc $dat,$dat - aesd $dat,q14 - aesimc $dat,$dat - aesd $dat,q15 - - veor $tmp,$tmp,$dat - vst1.8 {$tmp},[$out],#16 + aesd $dat1,q8 + aesd $dat2,q8 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q9 + aesd $dat2,q9 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + aesd $dat1,q12 + aesd $dat2,q12 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + cmn $len,#0x20 + 
aesd $dat1,q13 + aesd $dat2,q13 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp1,$ivec,$rndlast + aesd $dat1,q14 + aesd $dat2,q14 + aesimc $dat1,$dat1 + aesimc $dat2,$dat2 + veor $tmp2,$in1,$rndlast + aesd $dat1,q15 + aesd $dat2,q15 + b.eq .Lcbc_dec_one + veor $tmp1,$tmp1,$dat1 + veor $tmp2,$tmp2,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 + vst1.8 {$tmp2},[$out],#16 + b .Lcbc_done + +.Lcbc_dec_one: + veor $tmp1,$tmp1,$dat2 + vorr $ivec,$in2,$in2 + vst1.8 {$tmp1},[$out],#16 .Lcbc_done: vst1.8 {$ivec},[$ivp] .Lcbc_abort: ___ +} $code.=<<___ if ($flavour !~ /64/); vldmia sp!,{d8-d15} ldmia sp!,{r4-r8,pc} @@ -622,93 +612,329 @@ $code.=<<___; .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt ___ }}} +{{{ +my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); +my ($rounds,$cnt,$key_)=("w5","w6","x7"); +my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12)); +my $step="x12"; # aliases with $tctr2 + +my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); +my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); + +my ($dat,$tmp)=($dat0,$tmp0); + +### q8-q15 preloaded key schedule + +$code.=<<___; +.globl ${prefix}_ctr32_encrypt_blocks +.type ${prefix}_ctr32_encrypt_blocks,%function +.align 5 +${prefix}_ctr32_encrypt_blocks: +___ +$code.=<<___ if ($flavour =~ /64/); + stp x29,x30,[sp,#-16]! + add x29,sp,#0 +___ +$code.=<<___ if ($flavour !~ /64/); + mov ip,sp + stmdb sp!,{r4-r10,lr} + vstmdb sp!,{d8-d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg +___ +$code.=<<___; + ldr $rounds,[$key,#240] + + ldr $ctr, [$ivp, #12] + vld1.32 {$dat0},[$ivp] + + vld1.32 {q8-q9},[$key] // load key schedule... + sub $rounds,$rounds,#4 + mov $step,#16 + cmp $len,#2 + add $key_,$key,x5,lsl#4 // pointer to last 5 round keys + sub $rounds,$rounds,#2 + vld1.32 {q12-q13},[$key_],#32 + vld1.32 {q14-q15},[$key_],#32 + vld1.32 {$rndlast},[$key_] + add $key_,$key,#32 + mov $cnt,$rounds + cclr $step,lo +#ifndef __ARMEB__ + rev $ctr, $ctr +#endif + vorr $dat1,$dat0,$dat0 + add $tctr1, $ctr, #1 + vorr $dat2,$dat0,$dat0 + add $ctr, $ctr, #2 + vorr $ivec,$dat0,$dat0 + rev $tctr1, $tctr1 + vmov.32 ${dat1}[3],$tctr1 + b.ls .Lctr32_tail + rev $tctr2, $ctr + sub $len,$len,#3 // bias + vmov.32 ${dat2}[3],$tctr2 + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + aese $dat2,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aesmc $dat2,$dat2 + b.gt .Loop3x_ctr32 + + aese $dat0,q8 + aese $dat1,q8 + aese $dat2,q8 + mov $key_,$key + aesmc $tmp0,$dat0 + vld1.8 {$in0},[$inp],#16 + aesmc $tmp1,$dat1 + aesmc $dat2,$dat2 + vorr $dat0,$ivec,$ivec + aese $tmp0,q9 + vld1.8 {$in1},[$inp],#16 + aese $tmp1,q9 + aese $dat2,q9 + vorr $dat1,$ivec,$ivec + aesmc $tmp0,$tmp0 + vld1.8 {$in2},[$inp],#16 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$dat2 + vorr $dat2,$ivec,$ivec + add $tctr0,$ctr,#1 + aese $tmp0,q12 + aese $tmp1,q12 + aese $tmp2,q12 + veor $in0,$in0,$rndlast + add $tctr1,$ctr,#2 + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + veor $in1,$in1,$rndlast + add $ctr,$ctr,#3 + aese $tmp0,q13 + aese $tmp1,q13 + aese $tmp2,q13 + veor $in2,$in2,$rndlast + rev $tctr0,$tctr0 + aesmc $tmp0,$tmp0 + vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat0}[3], $tctr0 + rev $tctr1,$tctr1 + aese $tmp0,q14 + aese $tmp1,q14 + aese $tmp2,q14 + vmov.32 ${dat1}[3], $tctr1 
+ rev $tctr2,$ctr + aesmc $tmp0,$tmp0 + aesmc $tmp1,$tmp1 + aesmc $tmp2,$tmp2 + vmov.32 ${dat2}[3], $tctr2 + subs $len,$len,#3 + aese $tmp0,q15 + aese $tmp1,q15 + aese $tmp2,q15 + + mov $cnt,$rounds + veor $in0,$in0,$tmp0 + veor $in1,$in1,$tmp1 + veor $in2,$in2,$tmp2 + vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] + vst1.8 {$in0},[$out],#16 + vst1.8 {$in1},[$out],#16 + vst1.8 {$in2},[$out],#16 + b.hs .Loop3x_ctr32 + + adds $len,$len,#3 + b.eq .Lctr32_done + cmp $len,#1 + mov $step,#16 + cclr $step,eq + +.Lctr32_tail: + aese $dat0,q8 + aese $dat1,q8 + vld1.32 {q8},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + subs $cnt,$cnt,#2 + aese $dat0,q9 + aese $dat1,q9 + vld1.32 {q9},[$key_],#16 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + b.gt .Lctr32_tail + + aese $dat0,q8 + aese $dat1,q8 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q9 + aese $dat1,q9 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + vld1.8 {$in0},[$inp],$step + aese $dat0,q12 + aese $dat1,q12 + vld1.8 {$in1},[$inp] + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q13 + aese $dat1,q13 + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + aese $dat0,q14 + aese $dat1,q14 + veor $in0,$in0,$rndlast + aesmc $dat0,$dat0 + aesmc $dat1,$dat1 + veor $in1,$in1,$rndlast + aese $dat0,q15 + aese $dat1,q15 + + cmp $len,#1 + veor $in0,$in0,$dat0 + veor $in1,$in1,$dat1 + vst1.8 {$in0},[$out],#16 + b.eq .Lctr32_done + vst1.8 {$in1},[$out] + +.Lctr32_done: +___ +$code.=<<___ if ($flavour !~ /64/); + vldmia sp!,{d8-d15} + ldmia sp!,{r4-r10,pc} +___ +$code.=<<___ if ($flavour =~ /64/); + ldr x29,[sp],#16 + ret +___ +$code.=<<___; +.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks +___ +}}} +$code.=<<___; +#endif +___ ######################################## if ($flavour =~ /64/) { ######## 64-bit code my %opcode = ( "aesd" => 0x4e285800, "aese" => 0x4e284800, "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 ); - sub unaes { + local *unaes = sub { my ($mnemonic,$arg)=@_; $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && - sprintf ".long\t0x%08x\t//%s %s", + sprintf ".inst\t0x%08x\t//%s %s", $opcode{$mnemonic}|$1|($2<<5), $mnemonic,$arg; - } + }; foreach(split("\n",$code)) { - s/\`([^\`]*)\`/eval($1)/geo; + s/\`([^\`]*)\`/eval($1)/geo; s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers - s/@\s/\/\//o; # old->new style commentary + s/@\s/\/\//o; # old->new style commentary #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or - s/vmov\.i8/movi/o or # fix up legacy mnemonics - s/vext\.8/ext/o or - s/vrev32\.8/rev32/o or - s/vtst\.8/cmtst/o or - s/vshr/ushr/o or - s/^(\s+)v/$1/o or # strip off v prefix + s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or + s/vmov\.i8/movi/o or # fix up legacy mnemonics + s/vext\.8/ext/o or + s/vrev32\.8/rev32/o or + s/vtst\.8/cmtst/o or + s/vshr/ushr/o or + s/^(\s+)v/$1/o or # strip off v prefix s/\bbx\s+lr\b/ret/o; # fix up remainig legacy suffixes s/\.[ui]?8//o; m/\],#8/o and s/\.16b/\.8b/go; - s/\.[ui]?32//o and s/\.16b/\.4s/go; - s/\.[ui]?64//o and s/\.16b/\.2d/go; + s/\.[ui]?32//o and s/\.16b/\.4s/go; + s/\.[ui]?64//o and s/\.16b/\.2d/go; s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o; - print $_,"\n"; + print $_,"\n"; } } else { ######## 32-bit code my %opcode = ( "aesd" => 0xf3b00340, "aese" => 0xf3b00300, "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 ); - sub unaes { + local *unaes = sub { my ($mnemonic,$arg)=@_; - $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o && - sprintf ".long\t0x%08x\t@ %s %s", - 
$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) - |(($2&7)<<1) |(($2&8)<<2), + if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) { + my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19) + |(($2&7)<<1) |(($2&8)<<2); + # since ARMv7 instructions are always encoded little-endian. + # correct solution is to use .inst directive, but older + # assemblers don't implement it:-( + sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s", + $word&0xff,($word>>8)&0xff, + ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; - } + } + }; sub unvtbl { my $arg=shift; $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o && - sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1; + sprintf "vtbl.8 d%d,{q%d},d%d\n\t". + "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; } sub unvdup32 { my $arg=shift; $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o && - sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1; + sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1; + } + + sub unvmov32 { + my $arg=shift; + + $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o && + sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3; } foreach(split("\n",$code)) { - s/\`([^\`]*)\`/eval($1)/geo; + s/\`([^\`]*)\`/eval($1)/geo; s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers - s/\/\/\s?/@ /o; # new->old style commentary + s/\/\/\s?/@ /o; # new->old style commentary # fix up remainig new-style suffixes + s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or s/\],#[0-9]+/]!/o; s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or s/vtbl\.8\s+(.*)/unvtbl($1)/geo or s/vdup\.32\s+(.*)/unvdup32($1)/geo or + s/vmov\.32\s+(.*)/unvmov32($1)/geo or s/^(\s+)b\./$1b/o or + s/^(\s+)mov\./$1mov/o or s/^(\s+)ret/$1bx\tlr/o; - print $_,"\n"; + print $_,"\n"; } }
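
Note on the hand-encoded instructions (illustration, not part of the patch): in this version the 64-bit flavour assembles the AES mnemonics directly (".arch armv8-a+crypto" is emitted and the unaes() substitution stays commented out), while the 32-bit flavour hand-encodes them, and the patch switches that output from a single .long word to four .byte values because ARMv7 instructions are always encoded little-endian and older assemblers lack the .inst directive. The standalone Perl sketch below reproduces both encoders so the emitted words can be cross-checked against a disassembler; the register numbers passed in are arbitrary illustrative values, and only the opcode tables and bit layout are taken from the generator.

#!/usr/bin/env perl
# Sketch only: mirrors the unaes() helpers from aesv8-armx.pl.
use strict;
use warnings;

# AArch64 Crypto Extension base opcodes; Rd sits in bits 0-4, Rn in bits 5-9.
my %opcode64 = (
    "aesd"  => 0x4e285800, "aese"  => 0x4e284800,
    "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );

# AArch32 NEON base opcodes; the Qd index is split across bits 13-15 and 22,
# the Qm index across bits 1-3 and 5 (q registers map to even d registers).
my %opcode32 = (
    "aesd"  => 0xf3b00340, "aese"  => 0xf3b00300,
    "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );

sub unaes64 {
    my ($mnemonic,$d,$n) = @_;
    # .inst keeps the assembler from treating the word as swappable data.
    sprintf ".inst\t0x%08x\t//%s v%d.16b,v%d.16b",
        $opcode64{$mnemonic}|$d|($n<<5), $mnemonic,$d,$n;
}

sub unaes32 {
    my ($mnemonic,$d,$m) = @_;
    my $word = $opcode32{$mnemonic}|(($d&7)<<13)|(($d&8)<<19)
                                   |(($m&7)<<1) |(($m&8)<<2);
    # ARMv7 instructions are always little-endian, so emitting the word
    # byte-by-byte works even with assemblers that lack .inst.
    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t\@ %s q%d,q%d",
        $word&0xff,($word>>8)&0xff,($word>>16)&0xff,($word>>24)&0xff,
        $mnemonic,$d,$m;
}

print unaes64("aese",0,1),"\n";  # .inst 0x4e284820  //aese v0.16b,v1.16b
print unaes32("aese",0,1),"\n";  # .byte 0x02,0x03,0xb0,0xf3  @ aese q0,q1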
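
Likewise, the "dual-flavour" source relies on the per-flavour regex passes at the end of the file to lower the shared ARMv7-style syntax into either AArch32 or AArch64 assembly; the patch extends the 64-bit pass with a mov.cond-to-csel rule for the new "mov.lo x6,$len" idiom. The sketch below applies a trimmed subset of those 64-bit rules (register renaming, cclr/mov.cond to csel, stripping the v prefix and element-size suffix) to a few hypothetical sample lines; it only illustrates the mechanism and is not a replacement for the full translation loop.

#!/usr/bin/env perl
# Sketch only: trimmed 64-bit post-processing rules from aesv8-armx.pl.
use strict;
use warnings;

sub xlate64 {
    local $_ = shift;
    s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # q0-q15 -> v0-v7,v16-v23
    s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel\t$1$2,$1zr,$1$2,$3/o    or
    s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel\t$2,$3,$2,$1/o  or
    s/^(\s+)v/$1/o;                                 # strip off v prefix
    s/\.[ui]?8//o;                                  # drop element-size suffix
    return $_;
}

print xlate64("\tcclr\tx12,lo"),"\n";           # csel x12,xzr,x12,lo
print xlate64("\tmov.lo\tx6,x2"),"\n";          # csel x6,x2,x6,lo
print xlate64("\tvorr\tq6,q11,q11"),"\n";       # orr  v6.16b,v19.16b,v19.16b
print xlate64("\tvld1.8\t{q0},[x0],#16"),"\n";  # ld1  {v0.16b},[x0],#16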