# module is endian-agnostic in the sense that it supports both big- and
# little-endian cases. It likewise supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of utilized
-# registers to 16, which implies additional instructions. This has
-# no effect on mighty Apple A7, as results are literally equal to
-# the theoretical estimates based on instruction latencies and issue
-# rate. It remains to be seen how does it affect other platforms...
+# registers to 16, which implies additional NEON load and integer
+# instructions. This has no effect on the mighty Apple A7, where results
+# are literally equal to the theoretical estimates based on AES
+# instruction latencies and issue rates. On Cortex-A53, an in-order
+# execution core, this costs up to 10-15%, which is partially
+# compensated for by a dedicated code path for the 128-bit CBC
+# encrypt case. On Cortex-A57, parallelizable-mode performance
+# seems to be limited by the sheer number of NEON instructions...
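+# (CBC decrypt and CTR are the "parallelizable" modes: output blocks
+# don't depend on each other, so several aesd/aese chains can be
+# interleaved to hide instruction latency. CBC encrypt is inherently
+# serial, hence its higher cycles-per-byte figures below.)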
#
# Performance in cycles per byte processed with 128-bit key:
#
-# CBC enc CBC dec
-# Apple A7 2.39 1.20
+# CBC enc CBC dec CTR
+# Apple A7 2.39 1.20 1.20
+# Cortex-A53 2.45 1.87 1.94
+# Cortex-A57 3.64 1.34 1.32
$flavour = shift;
-$prefix="AES";
+open STDOUT,">".shift;
-$code=".text\n";
+$prefix="aes_v8";
+
+$code=<<___;
+#include "arm_arch.h"
+
+#if __ARM_ARCH__>=7
+.text
+___
$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
$code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
vst1.8 {$ivec},[$out],#16
b .Lcbc_done
-
-.align 5
-.Lcbc_dec128:
- vld1.32 {$tmp0-$tmp1},[$key_]
- veor $ivec,$ivec,$rndlast
- veor $in0,$dat0,$rndlast
- mov $step1,$step
-
-.Loop2x_cbc_dec128:
- aesd $dat0,q8
- aesd $dat1,q8
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- subs $len,$len,#32
- aesd $dat0,q9
- aesd $dat1,q9
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step,lo
- aesd $dat0,$tmp0
- aesd $dat1,$tmp0
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- cclr $step1,ls
- aesd $dat0,$tmp1
- aesd $dat1,$tmp1
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q12
- aesd $dat1,q12
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q13
- aesd $dat1,q13
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q14
- aesd $dat1,q14
- aesimc $dat0,$dat0
- aesimc $dat1,$dat1
- aesd $dat0,q15
- aesd $dat1,q15
-
- veor $ivec,$ivec,$dat0
- veor $in0,$in0,$dat1
- vld1.8 {$dat0},[$inp],$step
- vld1.8 {$dat1},[$inp],$step1
- vst1.8 {$ivec},[$out],#16
- veor $ivec,$in1,$rndlast
- vst1.8 {$in0},[$out],#16
- veor $in0,$dat0,$rndlast
- vorr $in1,$dat1,$dat1
- b.hs .Loop2x_cbc_dec128
-
- adds $len,$len,#32
- veor $ivec,$ivec,$rndlast
- b.eq .Lcbc_done
- veor $in0,$in0,$rndlast
- b .Lcbc_dec_tail
-
+___
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
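+# q10 and q11 serve as round-key registers on the other code paths; in
+# this 3x path they carry data ($dat2/$in2), and q9 doubles as scratch
+# ($tmp2) once its round key has been consumed. The two displaced rounds
+# are covered by biasing the round counter: add $cnt,$rounds,#2 below.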
+$code.=<<___;
.align 5
.Lcbc_dec:
- subs $len,$len,#16
- vorr $in0,$dat,$dat
+ vld1.8 {$dat2},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in2,$dat2,$dat2
b.lo .Lcbc_dec_tail
- cclr $step,eq
- cmp $rounds,#2
- vld1.8 {$dat1},[$inp],$step
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
- b.eq .Lcbc_dec128
+ vorr $in2,$dat2,$dat2
-.Loop2x_cbc_dec:
+.Loop3x_cbc_dec:
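+	// decrypt three blocks at a time; the interleaved aesd/aesimc
+	// chains are independent, which hides instruction latency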
aesd $dat0,q8
aesd $dat1,q8
+ aesd $dat2,q8
vld1.32 {q8},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
subs $cnt,$cnt,#2
aesd $dat0,q9
aesd $dat1,q9
+ aesd $dat2,q9
vld1.32 {q9},[$key_],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- b.gt .Loop2x_cbc_dec
+ aesimc $dat2,$dat2
+ b.gt .Loop3x_cbc_dec
aesd $dat0,q8
aesd $dat1,q8
+ aesd $dat2,q8
+ veor $tmp0,$ivec,$rndlast
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- veor $tmp0,$ivec,$rndlast
+ aesimc $dat2,$dat2
veor $tmp1,$in0,$rndlast
aesd $dat0,q9
aesd $dat1,q9
+ aesd $dat2,q9
+ veor $tmp2,$in1,$rndlast
+ subs $len,$len,#0x30
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vorr $ivec,$in1,$in1
- subs $len,$len,#32
- aesd $dat0,q10
- aesd $dat1,q10
- aesimc $dat0,$dat0
- cclr $step,lo
- aesimc $dat1,$dat1
- mov $key_,$key
- aesd $dat0,q11
- aesd $dat1,q11
- aesimc $dat0,$dat0
- vld1.8 {$in0},[$inp],$step
- aesimc $dat1,$dat1
- cclr $step,ls
+ aesimc $dat2,$dat2
+ vorr $ivec,$in2,$in2
+ mov.lo x6,$len // x6, $cnt, is zero at this point
aesd $dat0,q12
aesd $dat1,q12
+ aesd $dat2,q12
+	add	$inp,$inp,x6		// $inp is adjusted in such a way that
+					// at exit from the loop $dat1-$dat2
+					// are loaded with the last "words"
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.8 {$in1},[$inp],$step
+ aesimc $dat2,$dat2
+ mov $key_,$key
aesd $dat0,q13
aesd $dat1,q13
+ aesd $dat2,q13
+ vld1.8 {$in0},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesimc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
aesd $dat0,q14
aesd $dat1,q14
+ aesd $dat2,q14
+ vld1.8 {$in2},[$inp],#16
aesimc $dat0,$dat0
aesimc $dat1,$dat1
- vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
aesd $dat0,q15
aesd $dat1,q15
+ aesd $dat2,q15
- mov $cnt,$rounds
+ add $cnt,$rounds,#2
veor $tmp0,$tmp0,$dat0
veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
vorr $dat0,$in0,$in0
- vst1.8 {$tmp0},[$out],#32
+ vst1.8 {$tmp0},[$out],#16
vorr $dat1,$in1,$in1
- vst1.8 {$tmp1},[$out],#32
- b.hs .Loop2x_cbc_dec
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_cbc_dec
- adds $len,$len,#32
+ cmn $len,#0x30
b.eq .Lcbc_done
+ nop
.Lcbc_dec_tail:
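+	// last one or two blocks; both lanes are decrypted, and when
+	// only a single block remains just the $dat2 result is stored
+	// (.Lcbc_dec_one)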
- aesd $dat,q8
+ aesd $dat1,q8
+ aesd $dat2,q8
vld1.32 {q8},[$key_],#16
- aesimc $dat,$dat
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
subs $cnt,$cnt,#2
- aesd $dat,q9
+ aesd $dat1,q9
+ aesd $dat2,q9
vld1.32 {q9},[$key_],#16
- aesimc $dat,$dat
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
b.gt .Lcbc_dec_tail
- aesd $dat,q8
- aesimc $dat,$dat
- aesd $dat,q9
- aesimc $dat,$dat
- veor $tmp,$ivec,$rndlast
- aesd $dat,q10
- aesimc $dat,$dat
- vorr $ivec,$in0,$in0
- aesd $dat,q11
- aesimc $dat,$dat
- aesd $dat,q12
- aesimc $dat,$dat
- aesd $dat,q13
- aesimc $dat,$dat
- aesd $dat,q14
- aesimc $dat,$dat
- aesd $dat,q15
-
- veor $tmp,$tmp,$dat
- vst1.8 {$tmp},[$out],#16
+ aesd $dat1,q8
+ aesd $dat2,q8
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesd $dat2,q9
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesd $dat2,q12
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesd $dat2,q13
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ veor $tmp1,$ivec,$rndlast
+ aesd $dat1,q14
+ aesd $dat2,q14
+ aesimc $dat1,$dat1
+ aesimc $dat2,$dat2
+ veor $tmp2,$in1,$rndlast
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lcbc_dec_one
+ veor $tmp1,$tmp1,$dat1
+ veor $tmp2,$tmp2,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lcbc_done
+
+.Lcbc_dec_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $ivec,$in2,$in2
+ vst1.8 {$tmp1},[$out],#16
.Lcbc_done:
vst1.8 {$ivec},[$ivp]
.Lcbc_abort:
___
+}
$code.=<<___ if ($flavour !~ /64/);
vldmia sp!,{d8-d15}
ldmia sp!,{r4-r8,pc}
.size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
+my ($rounds,$cnt,$key_)=("w5","w6","x7");
+my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
+my $step="x12"; # aliases with $tctr2
+
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat,$tmp)=($dat0,$tmp0);
+
+### q8-q15 preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_ctr32_encrypt_blocks
+.type ${prefix}_ctr32_encrypt_blocks,%function
+.align 5
+${prefix}_ctr32_encrypt_blocks:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r10,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldr r4, [ip] @ load remaining arg
+___
+$code.=<<___;
+ ldr $rounds,[$key,#240]
+
+ ldr $ctr, [$ivp, #12]
+ vld1.32 {$dat0},[$ivp]
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#4
+ mov $step,#16
+ cmp $len,#2
+ add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+ add $key_,$key,#32
+ mov $cnt,$rounds
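+	// q12-q15 and $rndlast hold the final five round keys for the
+	// whole call; the remaining keys stream through q8/q9 in the loop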
+ cclr $step,lo
+#ifndef __ARMEB__
+ rev $ctr, $ctr
+#endif
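+	// only the low 32 bits of the counter block (lane 3) are ever
+	// incremented, per the ctr32 contract; they live big-endian in
+	// the block, so rev converts them to native order for the adds
+	// and back again before each vmov.32 insertion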
+ vorr $dat1,$dat0,$dat0
+ add $tctr1, $ctr, #1
+ vorr $dat2,$dat0,$dat0
+ add $ctr, $ctr, #2
+ vorr $ivec,$dat0,$dat0
+ rev $tctr1, $tctr1
+ vmov.32 ${dat1}[3],$tctr1
+ b.ls .Lctr32_tail
+ rev $tctr2, $ctr
+ sub $len,$len,#3 // bias
+ vmov.32 ${dat2}[3],$tctr2
+ b .Loop3x_ctr32
+
+.align 4
+.Loop3x_ctr32:
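+	// encrypt three counter blocks at a time; as in CBC decrypt,
+	// the three aese/aesmc chains are independent and hide latency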
+ aese $dat0,q8
+ aese $dat1,q8
+ aese $dat2,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aesmc $dat2,$dat2
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ aese $dat2,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aesmc $dat2,$dat2
+ b.gt .Loop3x_ctr32
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aese $dat2,q8
+ mov $key_,$key
+ aesmc $tmp0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aesmc $tmp1,$dat1
+ aesmc $dat2,$dat2
+ vorr $dat0,$ivec,$ivec
+ aese $tmp0,q9
+ vld1.8 {$in1},[$inp],#16
+ aese $tmp1,q9
+ aese $dat2,q9
+ vorr $dat1,$ivec,$ivec
+ aesmc $tmp0,$tmp0
+ vld1.8 {$in2},[$inp],#16
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$dat2
+ vorr $dat2,$ivec,$ivec
+ add $tctr0,$ctr,#1
+ aese $tmp0,q12
+ aese $tmp1,q12
+ aese $tmp2,q12
+ veor $in0,$in0,$rndlast
+ add $tctr1,$ctr,#2
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ veor $in1,$in1,$rndlast
+ add $ctr,$ctr,#3
+ aese $tmp0,q13
+ aese $tmp1,q13
+ aese $tmp2,q13
+ veor $in2,$in2,$rndlast
+ rev $tctr0,$tctr0
+ aesmc $tmp0,$tmp0
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat0}[3], $tctr0
+ rev $tctr1,$tctr1
+ aese $tmp0,q14
+ aese $tmp1,q14
+ aese $tmp2,q14
+ vmov.32 ${dat1}[3], $tctr1
+ rev $tctr2,$ctr
+ aesmc $tmp0,$tmp0
+ aesmc $tmp1,$tmp1
+ aesmc $tmp2,$tmp2
+ vmov.32 ${dat2}[3], $tctr2
+ subs $len,$len,#3
+ aese $tmp0,q15
+ aese $tmp1,q15
+ aese $tmp2,q15
+
+ mov $cnt,$rounds
+ veor $in0,$in0,$tmp0
+ veor $in1,$in1,$tmp1
+ veor $in2,$in2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$in0},[$out],#16
+ vst1.8 {$in1},[$out],#16
+ vst1.8 {$in2},[$out],#16
+ b.hs .Loop3x_ctr32
+
+ adds $len,$len,#3
+ b.eq .Lctr32_done
+ cmp $len,#1
+ mov $step,#16
+ cclr $step,eq
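+	// one or two blocks left; with one, $step is zero, so both
+	// lanes load the same block and only the first result is stored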
+
+.Lctr32_tail:
+ aese $dat0,q8
+ aese $dat1,q8
+ vld1.32 {q8},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aese $dat1,q9
+ vld1.32 {q9},[$key_],#16
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ b.gt .Lctr32_tail
+
+ aese $dat0,q8
+ aese $dat1,q8
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q9
+ aese $dat1,q9
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ vld1.8 {$in0},[$inp],$step
+ aese $dat0,q12
+ aese $dat1,q12
+ vld1.8 {$in1},[$inp]
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q13
+ aese $dat1,q13
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ aese $dat0,q14
+ aese $dat1,q14
+ veor $in0,$in0,$rndlast
+ aesmc $dat0,$dat0
+ aesmc $dat1,$dat1
+ veor $in1,$in1,$rndlast
+ aese $dat0,q15
+ aese $dat1,q15
+
+ cmp $len,#1
+ veor $in0,$in0,$dat0
+ veor $in1,$in1,$dat1
+ vst1.8 {$in0},[$out],#16
+ b.eq .Lctr32_done
+ vst1.8 {$in1},[$out]
+
+.Lctr32_done:
+___
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r10,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
+___
+}}}
+$code.=<<___;
+#endif
+___
########################################
if ($flavour =~ /64/) { ######## 64-bit code
my %opcode = (
"aesd" => 0x4e285800, "aese" => 0x4e284800,
"aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
- sub unaes {
+ local *unaes = sub {
my ($mnemonic,$arg)=@_;
$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
- sprintf ".long\t0x%08x\t//%s %s",
+ sprintf ".inst\t0x%08x\t//%s %s",
$opcode{$mnemonic}|$1|($2<<5),
$mnemonic,$arg;
- }
+ };
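+    # .inst, unlike .long, marks the emitted word as an instruction,
+    # so it is byte-ordered correctly on big-endian targets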
foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
+ s/\`([^\`]*)\`/eval($1)/geo;
s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
- s/@\s/\/\//o; # old->new style commentary
+ s/@\s/\/\//o; # old->new style commentary
#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
- s/vmov\.i8/movi/o or # fix up legacy mnemonics
- s/vext\.8/ext/o or
- s/vrev32\.8/rev32/o or
- s/vtst\.8/cmtst/o or
- s/vshr/ushr/o or
- s/^(\s+)v/$1/o or # strip off v prefix
+ s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
+ s/vmov\.i8/movi/o or # fix up legacy mnemonics
+ s/vext\.8/ext/o or
+ s/vrev32\.8/rev32/o or
+ s/vtst\.8/cmtst/o or
+ s/vshr/ushr/o or
+ s/^(\s+)v/$1/o or # strip off v prefix
s/\bbx\s+lr\b/ret/o;
    # fix up remaining legacy suffixes
s/\.[ui]?8//o;
m/\],#8/o and s/\.16b/\.8b/go;
- s/\.[ui]?32//o and s/\.16b/\.4s/go;
- s/\.[ui]?64//o and s/\.16b/\.2d/go;
+ s/\.[ui]?32//o and s/\.16b/\.4s/go;
+ s/\.[ui]?64//o and s/\.16b/\.2d/go;
s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
- print $_,"\n";
+ print $_,"\n";
}
} else { ######## 32-bit code
my %opcode = (
"aesd" => 0xf3b00340, "aese" => 0xf3b00300,
"aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
- sub unaes {
+ local *unaes = sub {
my ($mnemonic,$arg)=@_;
- $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
- sprintf ".long\t0x%08x\t@ %s %s",
- $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
- |(($2&7)<<1) |(($2&8)<<2),
+ if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
+ my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+ |(($2&7)<<1) |(($2&8)<<2);
+	    # ARMv7 instructions are always encoded little-endian, so
+	    # emit the raw bytes here. The correct solution would be the
+	    # .inst directive, but older assemblers don't implement it:-(
+ sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ $word&0xff,($word>>8)&0xff,
+ ($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
- }
+ }
+ };
sub unvtbl {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
- sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
+ sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
+ "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
}
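+    # e.g. "vtbl.8 q0,{q8},q1" becomes
+    #      "vtbl.8 d0,{q8},d2" + "vtbl.8 d1,{q8},d3"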
sub unvdup32 {
my $arg=shift;
$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
- sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
+ sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
+ }
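+    # e.g. "vdup.32 q0,q1[3]" becomes "vdup.32 q0,d3[1]"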
+
+ sub unvmov32 {
+ my $arg=shift;
+
+ $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
+ sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
}
foreach(split("\n",$code)) {
- s/\`([^\`]*)\`/eval($1)/geo;
+ s/\`([^\`]*)\`/eval($1)/geo;
s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
- s/\/\/\s?/@ /o; # new->old style commentary
+ s/\/\/\s?/@ /o; # new->old style commentary
    # fix up remaining new-style suffixes
+ s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
+ s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
+ s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
- print $_,"\n";
+ print $_,"\n";
}
}