3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in the sense that it supports both big- and
12 # little-endian cases. It likewise supports both 32- and 64-bit modes
13 # of operation. The latter is achieved by limiting the number of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
23 # Performance in cycles per byte processed with 128-bit key:
26 # Apple A7 2.39 1.20 1.20
27 # Cortex-A53 1.32 1.29 1.46
28 # Cortex-A57(*) 1.95 0.85 0.93
29 # Denver 1.96 0.86 0.80
31 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
32 # and are still the same even for the updated module;
# Locate arm-xlate.pl either next to this script or under ../../perlasm/
# relative to it; give up if neither candidate is a regular file.
37 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
38 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
39 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
40 die "can't locate arm-xlate.pl";
# Pipe the generated code through the translator. Check the result of the
# piped open so that a failure to start the child is reported immediately
# instead of silently producing no output.
42 open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
50 #if __ARM_MAX_ARCH__>=7
53 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
54 $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
55 #^^^^^^ this is done to simplify adoption by not depending
58 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
59 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
60 # maintain both 32- and 64-bit codes within single module and
61 # transliterate common code to either flavour with regex voodoo.
64 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
65 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
66 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
72 .long 0x01,0x01,0x01,0x01
73 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
74 .long 0x1b,0x1b,0x1b,0x1b
76 .globl ${prefix}_set_encrypt_key
77 .type ${prefix}_set_encrypt_key,%function
79 ${prefix}_set_encrypt_key:
82 $code.=<<___ if ($flavour =~ /64/);
83 stp x29,x30,[sp,#-16]!
103 veor $zero,$zero,$zero
104 vld1.8 {$in0},[$inp],#16
105 mov $bits,#8 // reuse $bits
106 vld1.32 {$rcon,$mask},[$ptr],#32
114 vtbl.8 $key,{$in0},$mask
115 vext.8 $tmp,$zero,$in0,#12
116 vst1.32 {$in0},[$out],#16
121 vext.8 $tmp,$zero,$tmp,#12
123 vext.8 $tmp,$zero,$tmp,#12
126 vshl.u8 $rcon,$rcon,#1
130 vld1.32 {$rcon},[$ptr]
132 vtbl.8 $key,{$in0},$mask
133 vext.8 $tmp,$zero,$in0,#12
134 vst1.32 {$in0},[$out],#16
138 vext.8 $tmp,$zero,$tmp,#12
140 vext.8 $tmp,$zero,$tmp,#12
143 vshl.u8 $rcon,$rcon,#1
146 vtbl.8 $key,{$in0},$mask
147 vext.8 $tmp,$zero,$in0,#12
148 vst1.32 {$in0},[$out],#16
152 vext.8 $tmp,$zero,$tmp,#12
154 vext.8 $tmp,$zero,$tmp,#12
158 vst1.32 {$in0},[$out]
166 vld1.8 {$in1},[$inp],#8
167 vmov.i8 $key,#8 // borrow $key
168 vst1.32 {$in0},[$out],#16
169 vsub.i8 $mask,$mask,$key // adjust the mask
172 vtbl.8 $key,{$in1},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in1},[$out],#8
179 vext.8 $tmp,$zero,$tmp,#12
181 vext.8 $tmp,$zero,$tmp,#12
184 vdup.32 $tmp,${in0}[3]
187 vext.8 $in1,$zero,$in1,#12
188 vshl.u8 $rcon,$rcon,#1
192 vst1.32 {$in0},[$out],#16
204 vst1.32 {$in0},[$out],#16
207 vtbl.8 $key,{$in1},$mask
208 vext.8 $tmp,$zero,$in0,#12
209 vst1.32 {$in1},[$out],#16
214 vext.8 $tmp,$zero,$tmp,#12
216 vext.8 $tmp,$zero,$tmp,#12
219 vshl.u8 $rcon,$rcon,#1
221 vst1.32 {$in0},[$out],#16
224 vdup.32 $key,${in0}[3] // just splat
225 vext.8 $tmp,$zero,$in1,#12
229 vext.8 $tmp,$zero,$tmp,#12
231 vext.8 $tmp,$zero,$tmp,#12
242 mov x0,$ptr // return value
243 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
245 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
247 .globl ${prefix}_set_decrypt_key
248 .type ${prefix}_set_decrypt_key,%function
250 ${prefix}_set_decrypt_key:
252 $code.=<<___ if ($flavour =~ /64/);
253 stp x29,x30,[sp,#-16]!
256 $code.=<<___ if ($flavour !~ /64/);
265 sub $out,$out,#240 // restore original $out
267 add $inp,$out,x12,lsl#4 // end of key schedule
269 vld1.32 {v0.16b},[$out]
270 vld1.32 {v1.16b},[$inp]
271 vst1.32 {v0.16b},[$inp],x4
272 vst1.32 {v1.16b},[$out],#16
275 vld1.32 {v0.16b},[$out]
276 vld1.32 {v1.16b},[$inp]
279 vst1.32 {v0.16b},[$inp],x4
280 vst1.32 {v1.16b},[$out],#16
284 vld1.32 {v0.16b},[$out]
286 vst1.32 {v0.16b},[$inp]
288 eor x0,x0,x0 // return value
291 $code.=<<___ if ($flavour !~ /64/);
294 $code.=<<___ if ($flavour =~ /64/);
299 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
305 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
306 my ($inp,$out,$key)=map("x$_",(0..2));
308 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
311 .globl ${prefix}_${dir}crypt
312 .type ${prefix}_${dir}crypt,%function
314 ${prefix}_${dir}crypt:
315 ldr $rounds,[$key,#240]
316 vld1.32 {$rndkey0},[$key],#16
317 vld1.8 {$inout},[$inp]
318 sub $rounds,$rounds,#2
319 vld1.32 {$rndkey1},[$key],#16
322 aes$e $inout,$rndkey0
324 vld1.32 {$rndkey0},[$key],#16
325 subs $rounds,$rounds,#2
326 aes$e $inout,$rndkey1
328 vld1.32 {$rndkey1},[$key],#16
331 aes$e $inout,$rndkey0
333 vld1.32 {$rndkey0},[$key]
334 aes$e $inout,$rndkey1
335 veor $inout,$inout,$rndkey0
337 vst1.8 {$inout},[$out]
339 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
346 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
347 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
348 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
350 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
351 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
353 ### q8-q15 preloaded key schedule
356 .globl ${prefix}_cbc_encrypt
357 .type ${prefix}_cbc_encrypt,%function
359 ${prefix}_cbc_encrypt:
361 $code.=<<___ if ($flavour =~ /64/);
362 stp x29,x30,[sp,#-16]!
365 $code.=<<___ if ($flavour !~ /64/);
368 vstmdb sp!,{d8-d15} @ ABI specification says so
369 ldmia ip,{r4-r5} @ load remaining args
377 cmp $enc,#0 // en- or decrypting?
378 ldr $rounds,[$key,#240]
380 vld1.8 {$ivec},[$ivp]
381 vld1.8 {$dat},[$inp],$step
383 vld1.32 {q8-q9},[$key] // load key schedule...
384 sub $rounds,$rounds,#6
385 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
386 sub $rounds,$rounds,#2
387 vld1.32 {q10-q11},[$key_],#32
388 vld1.32 {q12-q13},[$key_],#32
389 vld1.32 {q14-q15},[$key_],#32
390 vld1.32 {$rndlast},[$key_]
398 veor $rndzero_n_last,q8,$rndlast
401 vld1.32 {$in0-$in1},[$key_]
415 vst1.8 {$ivec},[$out],#16
447 vld1.8 {q8},[$inp],$step
450 veor q8,q8,$rndzero_n_last
453 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
457 veor $ivec,$dat,$rndlast
460 vst1.8 {$ivec},[$out],#16
465 vld1.32 {$in0-$in1},[$key_]
472 vst1.8 {$ivec},[$out],#16
486 vld1.8 {q8},[$inp],$step
493 veor q8,q8,$rndzero_n_last
495 veor $ivec,$dat,$rndlast
496 b.hs .Loop_cbc_enc128
498 vst1.8 {$ivec},[$out],#16
502 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
506 vld1.8 {$dat2},[$inp],#16
507 subs $len,$len,#32 // bias
511 vorr $in2,$dat2,$dat2
514 vorr $dat1,$dat2,$dat2
515 vld1.8 {$dat2},[$inp],#16
517 vorr $in1,$dat1,$dat1
518 vorr $in2,$dat2,$dat2
527 vld1.32 {q8},[$key_],#16
535 vld1.32 {q9},[$key_],#16
544 veor $tmp0,$ivec,$rndlast
546 veor $tmp1,$in0,$rndlast
547 mov.lo x6,$len // x6, $cnt, is zero at this point
554 veor $tmp2,$in1,$rndlast
555 add $inp,$inp,x6 // $inp is adjusted in such way that
556 // at exit from the loop $dat1-$dat2
557 // are loaded with last "words"
566 vld1.8 {$in0},[$inp],#16
573 vld1.8 {$in1},[$inp],#16
580 vld1.8 {$in2},[$inp],#16
584 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
586 veor $tmp0,$tmp0,$dat0
587 veor $tmp1,$tmp1,$dat1
588 veor $dat2,$dat2,$tmp2
589 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
590 vst1.8 {$tmp0},[$out],#16
592 vst1.8 {$tmp1},[$out],#16
594 vst1.8 {$dat2},[$out],#16
607 vld1.32 {q8},[$key_],#16
613 vld1.32 {q9},[$key_],#16
633 veor $tmp1,$ivec,$rndlast
638 veor $tmp2,$in1,$rndlast
642 veor $tmp1,$tmp1,$dat1
643 veor $tmp2,$tmp2,$dat2
645 vst1.8 {$tmp1},[$out],#16
646 vst1.8 {$tmp2},[$out],#16
650 veor $tmp1,$tmp1,$dat2
652 vst1.8 {$tmp1},[$out],#16
655 vst1.8 {$ivec},[$ivp]
659 $code.=<<___ if ($flavour !~ /64/);
663 $code.=<<___ if ($flavour =~ /64/);
668 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
672 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
673 my ($rounds,$cnt,$key_)=("w5","w6","x7");
674 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
675 my $step="x12"; # aliases with $tctr2
677 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
678 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
680 my ($dat,$tmp)=($dat0,$tmp0);
682 ### q8-q15 preloaded key schedule
685 .globl ${prefix}_ctr32_encrypt_blocks
686 .type ${prefix}_ctr32_encrypt_blocks,%function
688 ${prefix}_ctr32_encrypt_blocks:
690 $code.=<<___ if ($flavour =~ /64/);
691 stp x29,x30,[sp,#-16]!
694 $code.=<<___ if ($flavour !~ /64/);
696 stmdb sp!,{r4-r10,lr}
697 vstmdb sp!,{d8-d15} @ ABI specification says so
698 ldr r4, [ip] @ load remaining arg
701 ldr $rounds,[$key,#240]
703 ldr $ctr, [$ivp, #12]
704 vld1.32 {$dat0},[$ivp]
706 vld1.32 {q8-q9},[$key] // load key schedule...
707 sub $rounds,$rounds,#4
710 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
711 sub $rounds,$rounds,#2
712 vld1.32 {q12-q13},[$key_],#32
713 vld1.32 {q14-q15},[$key_],#32
714 vld1.32 {$rndlast},[$key_]
721 vorr $dat1,$dat0,$dat0
723 vorr $dat2,$dat0,$dat0
725 vorr $ivec,$dat0,$dat0
727 vmov.32 ${dat1}[3],$tctr1
730 sub $len,$len,#3 // bias
731 vmov.32 ${dat2}[3],$tctr2
742 vld1.32 {q8},[$key_],#16
750 vld1.32 {q9},[$key_],#16
757 vld1.8 {$in0},[$inp],#16
758 vorr $dat0,$ivec,$ivec
761 vld1.8 {$in1},[$inp],#16
762 vorr $dat1,$ivec,$ivec
767 vld1.8 {$in2},[$inp],#16
771 vorr $dat2,$ivec,$ivec
777 veor $in0,$in0,$rndlast
781 veor $in1,$in1,$rndlast
787 veor $in2,$in2,$rndlast
791 vmov.32 ${dat0}[3], $tctr0
797 vmov.32 ${dat1}[3], $tctr1
801 vmov.32 ${dat2}[3], $tctr2
808 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
809 vst1.8 {$in0},[$out],#16
812 vst1.8 {$in1},[$out],#16
814 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
815 vst1.8 {$in2},[$out],#16
829 vld1.32 {q8},[$key_],#16
835 vld1.32 {q9},[$key_],#16
846 vld1.8 {$in0},[$inp],$step
856 veor $in0,$in0,$rndlast
861 veor $in1,$in1,$rndlast
868 vst1.8 {$in0},[$out],#16
874 $code.=<<___ if ($flavour !~ /64/);
876 ldmia sp!,{r4-r10,pc}
878 $code.=<<___ if ($flavour =~ /64/);
883 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
889 ########################################
890 if ($flavour =~ /64/) { ######## 64-bit code
892 "aesd" => 0x4e285800, "aese" => 0x4e284800,
893 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
896 my ($mnemonic,$arg)=@_;
898 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
899 sprintf ".inst\t0x%08x\t//%s %s",
900 $opcode{$mnemonic}|$1|($2<<5),
904 foreach(split("\n",$code)) {
905 s/\`([^\`]*)\`/eval($1)/geo;
907 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
908 s/@\s/\/\//o; # old->new style commentary
910 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
911 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
912 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
913 s/vmov\.i8/movi/o or # fix up legacy mnemonics
915 s/vrev32\.8/rev32/o or
918 s/^(\s+)v/$1/o or # strip off v prefix
921 # fix up remaining legacy suffixes
923 m/\],#8/o and s/\.16b/\.8b/go;
924 s/\.[ui]?32//o and s/\.16b/\.4s/go;
925 s/\.[ui]?64//o and s/\.16b/\.2d/go;
926 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
930 } else { ######## 32-bit code
932 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
933 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
936 my ($mnemonic,$arg)=@_;
938 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
939 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
940 |(($2&7)<<1) |(($2&8)<<2);
941 # since ARMv7 instructions are always encoded little-endian.
942 # correct solution is to use .inst directive, but older
943 # assemblers don't implement it:-(
944 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
945 $word&0xff,($word>>8)&0xff,
946 ($word>>16)&0xff,($word>>24)&0xff,
954 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
955 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
956 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
962 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
963 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
969 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
970 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
973 foreach(split("\n",$code)) {
974 s/\`([^\`]*)\`/eval($1)/geo;
976 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
977 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
978 s/\/\/\s?/@ /o; # new->old style commentary
980 # fix up remaining new-style suffixes
981 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
984 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
985 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
986 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
987 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
988 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
990 s/^(\s+)mov\./$1mov/o or
991 s/^(\s+)ret/$1bx\tlr/o;