Add "teaser" AES module for ARMv8.
author	Andy Polyakov <appro@openssl.org>
Mon, 19 May 2014 06:46:44 +0000 (08:46 +0200)
committer	Andy Polyakov <appro@openssl.org>
Mon, 19 May 2014 06:46:44 +0000 (08:46 +0200)
"Teaser" means that it's initial proof-of-concept to build EVP module
upon.

crypto/aes/asm/aesv8-armx.pl [new file with mode: 0755]

diff --git a/crypto/aes/asm/aesv8-armx.pl b/crypto/aes/asm/aesv8-armx.pl
new file mode 100755 (executable)
index 0000000..935f52e
--- /dev/null
+++ b/crypto/aes/asm/aesv8-armx.pl
@@ -0,0 +1,604 @@
+#!/usr/bin/env perl
+#
+# ====================================================================
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
+# project. The module is, however, dual licensed under OpenSSL and
+# CRYPTOGAMS licenses depending on where you obtain it. For further
+# details see http://www.openssl.org/~appro/cryptogams/.
+# ====================================================================
+#
+# This module implements support for the ARMv8 AES instructions. The
+# module is endian-agnostic in the sense that it supports both big- and
+# little-endian cases, and it supports both 32- and 64-bit modes of
+# operation. The latter is achieved by limiting the number of utilized
+# registers to 16, which implies additional instructions. This has
+# no effect on the mighty Apple A7, where results literally match the
+# theoretical estimates. It remains to be seen how it affects other
+# platforms...
+#
+# Performance in cycles per byte processed with 128-bit key:
+#
+#              CBC enc         CBC dec
+# Apple A7     2.39            1.20
+
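+# Usage sketch (the exact flavour strings are build-dependent; any
+# value containing "64" selects the 64-bit code path, anything else the
+# 32-bit one): the script takes a single flavour argument and writes
+# the generated assembly to stdout, e.g.
+#
+#	perl aesv8-armx.pl linux64 > aesv8-armx.S
+#	perl aesv8-armx.pl linux32 > aesv8-armx.S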
+$flavour = shift;
+$prefix="AES";
+
+$code=".text\n";
+$code.=".arch  armv8-a+crypto\n"       if ($flavour =~ /64/);
+$code.=".fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
+
+# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
+# NEON is expressed mostly with 32-bit mnemonics, integer operations
+# mostly with 64-bit ones. The goal is to maintain both 32- and 64-bit
+# code within a single module and transliterate the common code to
+# either flavour with regex voodoo.
+#
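+# For example, the 64-bit transliteration pass at the bottom of this
+# file turns the 32-bit-style line
+#
+#	vld1.32	{q8},[x3],#16
+#
+# into
+#
+#	ld1	{v16.4s},[x3],#16
+#
+# q8 is renamed to v16 (q8-q15 map to v16-v23, presumably to keep the
+# callee-saved v8-v15 untouched), the ".32" size suffix becomes a ".4s"
+# arrangement, and the legacy "v" mnemonic prefix is dropped. A
+# condensed Perl sketch of just that subset of the rules:
+#
+#	my $line = "\tvld1.32\t{q8},[x3],#16";
+#	$line =~ s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/ge;
+#	$line =~ s/\.[ui]?32// and $line =~ s/\.16b/.4s/g;
+#	$line =~ s/^(\s+)v/$1/;
+#	print "$line\n";		# prints "	ld1	{v16.4s},[x3],#16"
+#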
+{{{
+my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
+my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
+       $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
+
+
+$code.=<<___;
+.align 5
+rcon:
+.long  0x01,0x01,0x01,0x01
+.long  0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
+.long  0x1b,0x1b,0x1b,0x1b
+
+.globl ${prefix}_set_encrypt_key
+.type  ${prefix}_set_encrypt_key,%function
+.align 5
+${prefix}_set_encrypt_key:
+.Lenc_key:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+___
+$code.=<<___;
+       adr     $ptr,rcon
+       cmp     $bits,#192
+
+       veor    $zero,$zero,$zero
+       vld1.8  {$in0},[$inp],#16
+       mov     $bits,#8                // reuse $bits
+       vld1.32 {$rcon,$mask},[$ptr],#32
+
+       b.lt    .Loop128
+       b.eq    .L192
+       b       .L256
+
+.align 4
+.Loop128:
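+	// Each iteration computes one 128-bit round key. vtbl with the
+	// "rotate-n-splat" mask replicates the rotated last word of the
+	// previous key into all four lanes; with every column identical,
+	// the ShiftRows step inside aese is a no-op, so aese against an
+	// all-zero round key amounts to a plain SubWord. The vext/veor
+	// chain then forms the running XOR of the previous key's words,
+	// with the round constant folded in along the way.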
+       vtbl.8  $key,{$in0},$mask
+       vext.8  $tmp,$zero,$in0,#12
+       vst1.32 {$in0},[$out],#16
+       aese    $key,$zero
+       subs    $bits,$bits,#1
+
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+        veor   $key,$key,$rcon
+       veor    $in0,$in0,$tmp
+       vshl.u8 $rcon,$rcon,#1
+       veor    $in0,$in0,$key
+       b.ne    .Loop128
+
+       vld1.32 {$rcon},[$ptr]
+
+       vtbl.8  $key,{$in0},$mask
+       vext.8  $tmp,$zero,$in0,#12
+       vst1.32 {$in0},[$out],#16
+       aese    $key,$zero
+
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+        veor   $key,$key,$rcon
+       veor    $in0,$in0,$tmp
+       vshl.u8 $rcon,$rcon,#1
+       veor    $in0,$in0,$key
+
+       vtbl.8  $key,{$in0},$mask
+       vext.8  $tmp,$zero,$in0,#12
+       vst1.32 {$in0},[$out],#16
+       aese    $key,$zero
+
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+        veor   $key,$key,$rcon
+       veor    $in0,$in0,$tmp
+       veor    $in0,$in0,$key
+       vst1.32 {$in0},[$out]
+       add     $out,$out,#0x50
+
+       mov     $rounds,#10
+       b       .Ldone
+
+.align 4
+.L192:
+       vld1.8  {$in1},[$inp],#8
+       vmov.i8 $key,#8                 // borrow $key
+       vst1.32 {$in0},[$out],#16
+       vsub.i8 $mask,$mask,$key        // adjust the mask
+
+.Loop192:
+       vtbl.8  $key,{$in1},$mask
+       vext.8  $tmp,$zero,$in0,#12
+       vst1.32 {$in1},[$out],#8
+       aese    $key,$zero
+       subs    $bits,$bits,#1
+
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+
+       vdup.32 $tmp,${in0}[3]
+       veor    $tmp,$tmp,$in1
+        veor   $key,$key,$rcon
+       vext.8  $in1,$zero,$in1,#12
+       vshl.u8 $rcon,$rcon,#1
+       veor    $in1,$in1,$tmp
+       veor    $in0,$in0,$key
+       veor    $in1,$in1,$key
+       vst1.32 {$in0},[$out],#16
+       b.ne    .Loop192
+
+       mov     $rounds,#12
+       add     $out,$out,#0x20
+       b       .Ldone
+
+.align 4
+.L256:
+       vld1.8  {$in1},[$inp]
+       mov     $bits,#7
+       mov     $rounds,#14
+       vst1.32 {$in0},[$out],#16
+
+.Loop256:
+       vtbl.8  $key,{$in1},$mask
+       vext.8  $tmp,$zero,$in0,#12
+       vst1.32 {$in1},[$out],#16
+       aese    $key,$zero
+       subs    $bits,$bits,#1
+
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in0,$in0,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+        veor   $key,$key,$rcon
+       veor    $in0,$in0,$tmp
+       vshl.u8 $rcon,$rcon,#1
+       veor    $in0,$in0,$key
+       vst1.32 {$in0},[$out],#16
+       b.eq    .Ldone
+
+       vdup.32 $key,${in0}[3]          // just splat
+       vext.8  $tmp,$zero,$in1,#12
+       aese    $key,$zero
+
+       veor    $in1,$in1,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in1,$in1,$tmp
+       vext.8  $tmp,$zero,$tmp,#12
+       veor    $in1,$in1,$tmp
+
+       veor    $in1,$in1,$key
+       b       .Loop256
+
+.Ldone:
+       str     $rounds,[$out]
+
+       eor     x0,x0,x0                // return value
+       `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
+       ret
+.size  ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
+
+.globl ${prefix}_set_decrypt_key
+.type  ${prefix}_set_decrypt_key,%function
+.align 5
+${prefix}_set_decrypt_key:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+___
+$code.=<<___   if ($flavour !~ /64/);
+       stmdb   sp!,{r4,lr}
+___
+$code.=<<___;
+       bl      .Lenc_key
+
+       sub     $out,$out,#240          // restore original $out
+       mov     x4,#-16
+       add     $inp,$out,x12,lsl#4     // end of key schedule
+
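+	// Convert the just-generated encryption schedule into the one for
+	// the equivalent inverse cipher, in place: swap the round keys
+	// end-for-end and apply InvMixColumns (aesimc) to every round key
+	// except the first and the last.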
+       vld1.32 {v0.16b},[$out]
+       vld1.32 {v1.16b},[$inp]
+       vst1.32 {v0.16b},[$inp],x4
+       vst1.32 {v1.16b},[$out],#16
+
+.Loop_imc:
+       vld1.32 {v0.16b},[$out]
+       vld1.32 {v1.16b},[$inp]
+       aesimc  v0.16b,v0.16b
+       aesimc  v1.16b,v1.16b
+       vst1.32 {v0.16b},[$inp],x4
+       vst1.32 {v1.16b},[$out],#16
+       cmp     $inp,$out
+       b.hi    .Loop_imc
+
+       vld1.32 {v0.16b},[$out]
+       aesimc  v0.16b,v0.16b
+       vst1.32 {v0.16b},[$inp]
+
+       eor     x0,x0,x0                // return value
+___
+$code.=<<___   if ($flavour !~ /64/);
+       ldmia   sp!,{r4,pc}
+___
+$code.=<<___   if ($flavour =~ /64/);
+       ldp     x29,x30,[sp],#16
+       ret
+___
+$code.=<<___;
+.size  ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
+___
+}}}
+{{{
+sub gen_block () {
+my $dir = shift;
+my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
+my ($inp,$out,$key)=map("x$_",(0..2));
+my $rounds="w3";
+my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
+
+$code.=<<___;
+.globl ${prefix}_${dir}crypt
+.type  ${prefix}_${dir}crypt,%function
+.align 5
+${prefix}_${dir}crypt:
+       ldr     $rounds,[$key,#240]
+       vld1.32 {$rndkey0},[$key],#16
+       vld1.8  {$inout},[$inp]
+       sub     $rounds,$rounds,#2
+       vld1.32 {$rndkey1},[$key],#16
+
+.Loop_${dir}c:
+       aes$e   $inout,$rndkey0
+       aes$mc  $inout,$inout
+       vld1.32 {$rndkey0},[$key],#16
+       subs    $rounds,$rounds,#2
+       aes$e   $inout,$rndkey1
+       aes$mc  $inout,$inout
+       vld1.32 {$rndkey1},[$key],#16
+       b.gt    .Loop_${dir}c
+
+       aes$e   $inout,$rndkey0
+       aes$mc  $inout,$inout
+       vld1.32 {$rndkey0},[$key]
+       aes$e   $inout,$rndkey1
+       veor    $inout,$inout,$rndkey0
+
+       vst1.8  {$inout},[$out]
+       ret
+.size  ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
+___
+}
+&gen_block("en");
+&gen_block("de");
+}}}
+{{{
+my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
+my ($rounds,$cnt,$key_,$step)=($enc,"w6","x7","x8");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q8-q15     preloaded key schedule
+
+$code.=<<___;
+.globl ${prefix}_cbc_encrypt
+.type  ${prefix}_cbc_encrypt,%function
+.align 5
+${prefix}_cbc_encrypt:
+___
+$code.=<<___   if ($flavour =~ /64/);
+       stp     x29,x30,[sp,#-16]!
+       add     x29,sp,#0
+___
+$code.=<<___   if ($flavour !~ /64/);
+       mov     ip,sp
+       stmdb   sp!,{r4-r8,lr}
+       vstmdb  sp!,{d8-d15}            @ ABI specification says so
+       ldmia   ip,{r4-r5}              @ load remaining args
+___
+$code.=<<___;
+       subs    $len,$len,#16
+       mov     $step,#16
+       b.lo    .Lcbc_abort
+       cclr    $step,eq
+
+       cmp     $enc,#0                 // en- or decrypting?
+       ldr     $rounds,[$key,#240]
+       and     $len,$len,#-16
+       vld1.8  {$ivec},[$ivp]
+       vld1.8  {$dat},[$inp],$step
+
+       vld1.32 {q8-q9},[$key]          // load key schedule...
+       sub     $rounds,$rounds,#6
+       add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
+       sub     $rounds,$rounds,#2
+       vld1.32 {q10-q11},[$key_],#32
+       vld1.32 {q12-q13},[$key_],#32
+       vld1.32 {q14-q15},[$key_],#32
+       vld1.32 {$rndlast},[$key_]
+
+       add     $key_,$key,#32
+       mov     $cnt,$rounds
+       b.eq    .Lcbc_dec
+
+       veor    $dat,$dat,$ivec
+       veor    $rndzero_n_last,q8,$rndlast
+.Loop_cbc_enc:
+       aese    $dat,q8
+       aesmc   $dat,$dat
+       vld1.32 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aese    $dat,q9
+       aesmc   $dat,$dat
+       vld1.32 {q9},[$key_],#16
+       b.gt    .Loop_cbc_enc
+
+       aese    $dat,q8
+       aesmc   $dat,$dat
+        subs   $len,$len,#16
+       aese    $dat,q9
+       aesmc   $dat,$dat
+        cclr   $step,eq
+       aese    $dat,q10
+       aesmc   $dat,$dat
+        add    $key_,$key,#16
+       aese    $dat,q11
+       aesmc   $dat,$dat
+        vld1.8 {q8},[$inp],$step
+       aese    $dat,q12
+       aesmc   $dat,$dat
+        veor   q8,q8,$rndzero_n_last
+       aese    $dat,q13
+       aesmc   $dat,$dat
+        vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
+       aese    $dat,q14
+       aesmc   $dat,$dat
+       aese    $dat,q15
+
+        mov    $cnt,$rounds
+       veor    $ivec,$dat,$rndlast
+       vst1.8  {$ivec},[$out],#16
+       b.hs    .Loop_cbc_enc
+
+       b       .Lcbc_done
+
+.align 5
+.Lcbc_dec:
+       subs    $len,$len,#16
+       vorr    $in0,$dat,$dat
+       b.lo    .Lcbc_dec_tail
+
+       cclr    $step,eq
+       vld1.8  {$dat1},[$inp],$step
+       vorr    $in1,$dat1,$dat1
+
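+// Decrypt two blocks per iteration, interleaving the two aesd/aesimc
+// dependency chains so that their latencies overlap; copies of the
+// ciphertext blocks are kept to serve as the CBC chaining values.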
+.Loop2x_cbc_dec:
+       aesd    $dat0,q8
+       aesd    $dat1,q8
+       aesimc  $dat0,$dat0
+       aesimc  $dat1,$dat1
+       vld1.64 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat0,q9
+       aesd    $dat1,q9
+       aesimc  $dat0,$dat0
+       aesimc  $dat1,$dat1
+       vld1.64 {q9},[$key_],#16
+       b.gt    .Loop2x_cbc_dec
+
+       aesd    $dat0,q8
+       aesd    $dat1,q8
+       aesimc  $dat0,$dat0
+        veor   $tmp0,$ivec,$rndlast
+       aesimc  $dat1,$dat1
+        veor   $tmp1,$in0,$rndlast
+       aesd    $dat0,q9
+       aesd    $dat1,q9
+       aesimc  $dat0,$dat0
+        vorr   $ivec,$in1,$in1
+       aesimc  $dat1,$dat1
+        subs   $len,$len,#32
+       aesd    $dat0,q10
+       aesd    $dat1,q10
+       aesimc  $dat0,$dat0
+        cclr   $step,lo
+       aesimc  $dat1,$dat1
+        mov    $key_,$key
+       aesd    $dat0,q11
+       aesd    $dat1,q11
+       aesimc  $dat0,$dat0
+        vld1.8 {$in0},[$inp],$step
+       aesimc  $dat1,$dat1
+        cclr   $step,ls
+       aesd    $dat0,q12
+       aesd    $dat1,q12
+       aesimc  $dat0,$dat0
+       aesimc  $dat1,$dat1
+        vld1.8 {$in1},[$inp],$step
+       aesd    $dat0,q13
+       aesd    $dat1,q13
+       aesimc  $dat0,$dat0
+       aesimc  $dat1,$dat1
+        vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
+       aesd    $dat0,q14
+       aesd    $dat1,q14
+       aesimc  $dat0,$dat0
+       aesimc  $dat1,$dat1
+        vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
+       aesd    $dat0,q15
+       aesd    $dat1,q15
+
+        mov    $cnt,$rounds
+       veor    $tmp0,$tmp0,$dat0
+        vorr   $dat0,$in0,$in0
+       veor    $tmp1,$tmp1,$dat1
+        vorr   $dat1,$in1,$in1
+       vst1.8  {$tmp0-$tmp1},[$out],#32
+       b.hs    .Loop2x_cbc_dec
+
+       adds    $len,$len,#32
+       b.eq    .Lcbc_done
+
+.Lcbc_dec_tail:
+       aesd    $dat,q8
+       aesimc  $dat,$dat
+       vld1.64 {q8},[$key_],#16
+       subs    $cnt,$cnt,#2
+       aesd    $dat,q9
+       aesimc  $dat,$dat
+       vld1.64 {q9},[$key_],#16
+       b.gt    .Lcbc_dec_tail
+
+       aesd    $dat,q8
+       aesimc  $dat,$dat
+       aesd    $dat,q9
+       aesimc  $dat,$dat
+        veor   $tmp,$ivec,$rndlast
+       aesd    $dat,q10
+       aesimc  $dat,$dat
+        vorr   $ivec,$in0,$in0
+       aesd    $dat,q11
+       aesimc  $dat,$dat
+       aesd    $dat,q12
+       aesimc  $dat,$dat
+       aesd    $dat,q13
+       aesimc  $dat,$dat
+       aesd    $dat,q14
+       aesimc  $dat,$dat
+       aesd    $dat,q15
+
+       veor    $tmp,$tmp,$dat
+       vst1.8  {$tmp},[$out],#16
+
+.Lcbc_done:
+       vst1.8  {$ivec},[$ivp]
+.Lcbc_abort:
+___
+$code.=<<___   if ($flavour !~ /64/);
+       vldmia  sp!,{d8-d15}
+       ldmia   sp!,{r4-r8,pc}
+___
+$code.=<<___   if ($flavour =~ /64/);
+       ldr     x29,[sp],#16
+       ret
+___
+$code.=<<___;
+.size  ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
+___
+}}}
+########################################
+if ($flavour =~ /64/) {                        ######## 64-bit code
+    my %opcode = (
+       "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
+       "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
+
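+    # Helper that encodes AES instructions as raw .long constants
+    # (useful when the assembler does not recognize the crypto
+    # extensions); e.g. "aese v0.16b,v1.16b" would be emitted as
+    # ".long 0x4e284820". The corresponding substitution below is
+    # currently commented out.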
+    sub unaes {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
+       sprintf ".long\t0x%08x\t//%s %s",
+                       $opcode{$mnemonic}|$1|($2<<5),
+                       $mnemonic,$arg;
+    }
+
+    foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+       s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
+        s/@\s/\/\//o;                  # old->new style commentary
+
+       #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
+       s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
+        s/vmov\.i8/movi/o      or      # fix up legacy mnemonics
+        s/vext\.8/ext/o                or
+        s/vrev32\.8/rev32/o    or
+        s/vtst\.8/cmtst/o      or
+        s/vshr/ushr/o          or
+        s/^(\s+)v/$1/o         or      # strip off v prefix
+       s/\bbx\s+lr\b/ret/o;
+
+	# fix up remaining legacy suffixes
+       s/\.[ui]?8//o;
+       m/\],#8/o and s/\.16b/\.8b/go;
+        s/\.[ui]?32//o and s/\.16b/\.4s/go;
+        s/\.[ui]?64//o and s/\.16b/\.2d/go;
+       s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
+
+        print $_,"\n";
+    }
+} else {                               ######## 32-bit code
+    my %opcode = (
+       "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
+       "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
+
+    sub unaes {
+       my ($mnemonic,$arg)=@_;
+
+       $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
+       sprintf ".long\t0x%08x\t@ %s %s",
+                       $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
+                                         |(($2&7)<<1) |(($2&8)<<2),
+                       $mnemonic,$arg;
+    }
+
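+    # 32-bit VTBL takes d-register destination and index operands, so a
+    # q-register table lookup is rewritten as two vtbl.8 instructions,
+    # one for each d-register half of the destination and index.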
+    sub unvtbl {
+       my $arg=shift;
+
+       $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
+       sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;   
+    }
+
+    sub unvdup32 {
+       my $arg=shift;
+
+       $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
+       sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;        
+    }
+
+    foreach(split("\n",$code)) {
+        s/\`([^\`]*)\`/eval($1)/geo;
+
+       s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
+       s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
+        s/\/\/\s?/@ /o;                                # new->old style commentary
+
+	# fix up remaining new-style suffixes
+       s/\],#[0-9]+/]!/o;
+
+       s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
+       s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
+       s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
+       s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
+       s/^(\s+)b\./$1b/o                               or
+       s/^(\s+)ret/$1bx\tlr/o;
+
+        print $_,"\n";
+    }
+}
+
+close STDOUT;