3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in the sense that it supports both big- and
12 # little-endian cases. It likewise supports both 32- and 64-bit modes
13 # of operation. The latter is achieved by limiting the number of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
23 # Performance in cycles per byte processed with 128-bit key:
26 # Apple A7 2.39 1.20 1.20
27 # Cortex-A53 2.45 1.87 1.94
28 # Cortex-A57 3.64 1.34 1.32
# Take the directory part of $0 (everything up to and including the last
# '/' or '\') and look for the arm-xlate.pl translator, first next to this
# script and then under ../../perlasm/ relative to it. Abort if neither
# candidate exists as a plain file.
33 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
34 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
35 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
36 die "can't locate arm-xlate.pl";
# Open OUT as a pipe into arm-xlate.pl, run under the same perl interpreter
# ($^X) and given $flavour and $output (both set outside this excerpt).
# NOTE(review): the return value of open is not checked on this line, so a
# failure to start the translator would go unreported here.
38 open OUT,"| \"$^X\" $xlate $flavour $output";
46 #if __ARM_MAX_ARCH__>=7
49 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
50 $code.=".arch armv7-a\n.fpu neon\n.code 32\n" if ($flavour !~ /64/);
51 #^^^^^^ this is done to simplify adoption by not depending
54 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
55 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
56 # maintain both 32- and 64-bit codes within single module and
57 # transliterate common code to either flavour with regex voodoo.
# Register names interpolated into the key-setup assembly that follows:
# $inp, $out and $ptr are x0, x2 and x3; $bits and $rounds are w1 and w12.
60 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# Seven NEON register names: q0-q6 when $flavour contains "64", otherwise
# q0-q3 followed by q8-q10.
61 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
62 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
68 .long 0x01,0x01,0x01,0x01
69 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
70 .long 0x1b,0x1b,0x1b,0x1b
72 .globl ${prefix}_set_encrypt_key
73 .type ${prefix}_set_encrypt_key,%function
75 ${prefix}_set_encrypt_key:
78 $code.=<<___ if ($flavour =~ /64/);
79 stp x29,x30,[sp,#-16]!
99 veor $zero,$zero,$zero
100 vld1.8 {$in0},[$inp],#16
101 mov $bits,#8 // reuse $bits
102 vld1.32 {$rcon,$mask},[$ptr],#32
110 vtbl.8 $key,{$in0},$mask
111 vext.8 $tmp,$zero,$in0,#12
112 vst1.32 {$in0},[$out],#16
117 vext.8 $tmp,$zero,$tmp,#12
119 vext.8 $tmp,$zero,$tmp,#12
122 vshl.u8 $rcon,$rcon,#1
126 vld1.32 {$rcon},[$ptr]
128 vtbl.8 $key,{$in0},$mask
129 vext.8 $tmp,$zero,$in0,#12
130 vst1.32 {$in0},[$out],#16
134 vext.8 $tmp,$zero,$tmp,#12
136 vext.8 $tmp,$zero,$tmp,#12
139 vshl.u8 $rcon,$rcon,#1
142 vtbl.8 $key,{$in0},$mask
143 vext.8 $tmp,$zero,$in0,#12
144 vst1.32 {$in0},[$out],#16
148 vext.8 $tmp,$zero,$tmp,#12
150 vext.8 $tmp,$zero,$tmp,#12
154 vst1.32 {$in0},[$out]
162 vld1.8 {$in1},[$inp],#8
163 vmov.i8 $key,#8 // borrow $key
164 vst1.32 {$in0},[$out],#16
165 vsub.i8 $mask,$mask,$key // adjust the mask
168 vtbl.8 $key,{$in1},$mask
169 vext.8 $tmp,$zero,$in0,#12
170 vst1.32 {$in1},[$out],#8
175 vext.8 $tmp,$zero,$tmp,#12
177 vext.8 $tmp,$zero,$tmp,#12
180 vdup.32 $tmp,${in0}[3]
183 vext.8 $in1,$zero,$in1,#12
184 vshl.u8 $rcon,$rcon,#1
188 vst1.32 {$in0},[$out],#16
200 vst1.32 {$in0},[$out],#16
203 vtbl.8 $key,{$in1},$mask
204 vext.8 $tmp,$zero,$in0,#12
205 vst1.32 {$in1},[$out],#16
210 vext.8 $tmp,$zero,$tmp,#12
212 vext.8 $tmp,$zero,$tmp,#12
215 vshl.u8 $rcon,$rcon,#1
217 vst1.32 {$in0},[$out],#16
220 vdup.32 $key,${in0}[3] // just splat
221 vext.8 $tmp,$zero,$in1,#12
225 vext.8 $tmp,$zero,$tmp,#12
227 vext.8 $tmp,$zero,$tmp,#12
238 mov x0,$ptr // return value
239 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
241 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
243 .globl ${prefix}_set_decrypt_key
244 .type ${prefix}_set_decrypt_key,%function
246 ${prefix}_set_decrypt_key:
248 $code.=<<___ if ($flavour =~ /64/);
249 stp x29,x30,[sp,#-16]!
252 $code.=<<___ if ($flavour !~ /64/);
261 sub $out,$out,#240 // restore original $out
263 add $inp,$out,x12,lsl#4 // end of key schedule
265 vld1.32 {v0.16b},[$out]
266 vld1.32 {v1.16b},[$inp]
267 vst1.32 {v0.16b},[$inp],x4
268 vst1.32 {v1.16b},[$out],#16
271 vld1.32 {v0.16b},[$out]
272 vld1.32 {v1.16b},[$inp]
275 vst1.32 {v0.16b},[$inp],x4
276 vst1.32 {v1.16b},[$out],#16
280 vld1.32 {v0.16b},[$out]
282 vst1.32 {v0.16b},[$inp]
284 eor x0,x0,x0 // return value
287 $code.=<<___ if ($flavour !~ /64/);
290 $code.=<<___ if ($flavour =~ /64/);
295 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Pick the mnemonic suffixes for the requested direction: "e"/"mc" when $dir
# is "en", otherwise "d"/"imc". $e is interpolated into the aes$e
# instructions below.
301 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# $inp, $out and $key name x0, x1 and x2.
302 my ($inp,$out,$key)=map("x$_",(0..2));
# NOTE(review): the map yields four names (q0..q3) but only three variables
# are bound, so "q3" is silently discarded.
304 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
307 .globl ${prefix}_${dir}crypt
308 .type ${prefix}_${dir}crypt,%function
310 ${prefix}_${dir}crypt:
311 ldr $rounds,[$key,#240]
312 vld1.32 {$rndkey0},[$key],#16
313 vld1.8 {$inout},[$inp]
314 sub $rounds,$rounds,#2
315 vld1.32 {$rndkey1},[$key],#16
318 aes$e $inout,$rndkey0
319 vld1.32 {$rndkey0},[$key],#16
321 subs $rounds,$rounds,#2
322 aes$e $inout,$rndkey1
323 vld1.32 {$rndkey1},[$key],#16
327 aes$e $inout,$rndkey0
328 vld1.32 {$rndkey0},[$key]
330 aes$e $inout,$rndkey1
331 veor $inout,$inout,$rndkey0
333 vst1.8 {$inout},[$out]
335 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
# Register names for the CBC assembly that follows: $inp, $out, $len, $key
# and $ivp are x0-x4, and $enc is w5.
342 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
# $rounds shares w5 with $enc; $cnt is w6, $key_ is x7, $step is x8 and
# $step1 is x12.
343 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
# NEON names q0-q7, in the order listed.
344 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Aliases: $dat is $dat0 (q0), $tmp is $tmp0 (q4), and $rndzero_n_last shares
# q5 with $tmp1.
346 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
348 ### q8-q15 preloaded key schedule
351 .globl ${prefix}_cbc_encrypt
352 .type ${prefix}_cbc_encrypt,%function
354 ${prefix}_cbc_encrypt:
356 $code.=<<___ if ($flavour =~ /64/);
357 stp x29,x30,[sp,#-16]!
360 $code.=<<___ if ($flavour !~ /64/);
363 vstmdb sp!,{d8-d15} @ ABI specification says so
364 ldmia ip,{r4-r5} @ load remaining args
372 cmp $enc,#0 // en- or decrypting?
373 ldr $rounds,[$key,#240]
375 vld1.8 {$ivec},[$ivp]
376 vld1.8 {$dat},[$inp],$step
378 vld1.32 {q8-q9},[$key] // load key schedule...
379 sub $rounds,$rounds,#6
380 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
381 sub $rounds,$rounds,#2
382 vld1.32 {q10-q11},[$key_],#32
383 vld1.32 {q12-q13},[$key_],#32
384 vld1.32 {q14-q15},[$key_],#32
385 vld1.32 {$rndlast},[$key_]
393 veor $rndzero_n_last,q8,$rndlast
398 vld1.32 {q8},[$key_],#16
402 vld1.32 {q9},[$key_],#16
417 vld1.8 {q8},[$inp],$step
420 veor q8,q8,$rndzero_n_last
423 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
429 veor $ivec,$dat,$rndlast
430 vst1.8 {$ivec},[$out],#16
437 vld1.32 {$in0-$in1},[$key_]
444 vst1.8 {$ivec},[$out],#16
458 vld1.8 {q8},[$inp],$step
465 veor q8,q8,$rndzero_n_last
467 veor $ivec,$dat,$rndlast
468 b.hs .Loop_cbc_enc128
470 vst1.8 {$ivec},[$out],#16
# Three further NEON names used alongside $dat0/$dat1, $in0/$in1 and
# $tmp0/$tmp1: $dat2 is q10, $in2 is q11 and $tmp2 is q9.
474 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
478 vld1.8 {$dat2},[$inp],#16
479 subs $len,$len,#32 // bias
483 vorr $in2,$dat2,$dat2
486 vorr $dat1,$dat2,$dat2
487 vld1.8 {$dat2},[$inp],#16
489 vorr $in1,$dat1,$dat1
490 vorr $in2,$dat2,$dat2
496 vld1.32 {q8},[$key_],#16
504 vld1.32 {q9},[$key_],#16
513 veor $tmp0,$ivec,$rndlast
517 veor $tmp1,$in0,$rndlast
521 veor $tmp2,$in1,$rndlast
527 mov.lo x6,$len // x6, $cnt, is zero at this point
531 add $inp,$inp,x6 // $inp is adjusted in such way that
532 // at exit from the loop $dat1-$dat2
533 // are loaded with last "words"
541 vld1.8 {$in0},[$inp],#16
545 vld1.8 {$in1},[$inp],#16
549 vld1.8 {$in2},[$inp],#16
553 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
559 veor $tmp0,$tmp0,$dat0
560 veor $tmp1,$tmp1,$dat1
561 veor $dat2,$dat2,$tmp2
562 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
564 vst1.8 {$tmp0},[$out],#16
566 vst1.8 {$tmp1},[$out],#16
567 vst1.8 {$dat2},[$out],#16
578 vld1.32 {q8},[$key_],#16
584 vld1.32 {q9},[$key_],#16
606 veor $tmp1,$ivec,$rndlast
611 veor $tmp2,$in1,$rndlast
615 veor $tmp1,$tmp1,$dat1
616 veor $tmp2,$tmp2,$dat2
618 vst1.8 {$tmp1},[$out],#16
619 vst1.8 {$tmp2},[$out],#16
623 veor $tmp1,$tmp1,$dat2
625 vst1.8 {$tmp1},[$out],#16
628 vst1.8 {$ivec},[$ivp]
632 $code.=<<___ if ($flavour !~ /64/);
636 $code.=<<___ if ($flavour =~ /64/);
641 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register names for the CTR32 assembly that follows: $inp, $out, $len, $key
# and $ivp are x0-x4.
645 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
# $rounds is w5, $cnt is w6 and $key_ is x7.
646 my ($rounds,$cnt,$key_)=("w5","w6","x7");
# $ctr is w8; $tctr0, $tctr1 and $tctr2 are w9, w10 and w12.
647 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
648 my $step="x12"; # aliases with $tctr2
# NEON names q0-q7 in the order listed, plus $dat2=q10, $in2=q11, $tmp2=q9.
650 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
651 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# Aliases: $dat is $dat0 (q0) and $tmp is $tmp0 (q4).
653 my ($dat,$tmp)=($dat0,$tmp0);
655 ### q8-q15 preloaded key schedule
658 .globl ${prefix}_ctr32_encrypt_blocks
659 .type ${prefix}_ctr32_encrypt_blocks,%function
661 ${prefix}_ctr32_encrypt_blocks:
663 $code.=<<___ if ($flavour =~ /64/);
664 stp x29,x30,[sp,#-16]!
667 $code.=<<___ if ($flavour !~ /64/);
669 stmdb sp!,{r4-r10,lr}
670 vstmdb sp!,{d8-d15} @ ABI specification says so
671 ldr r4, [ip] @ load remaining arg
674 ldr $rounds,[$key,#240]
676 ldr $ctr, [$ivp, #12]
677 vld1.32 {$dat0},[$ivp]
679 vld1.32 {q8-q9},[$key] // load key schedule...
680 sub $rounds,$rounds,#4
683 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
684 sub $rounds,$rounds,#2
685 vld1.32 {q12-q13},[$key_],#32
686 vld1.32 {q14-q15},[$key_],#32
687 vld1.32 {$rndlast},[$key_]
694 vorr $dat1,$dat0,$dat0
696 vorr $dat2,$dat0,$dat0
698 vorr $ivec,$dat0,$dat0
700 vmov.32 ${dat1}[3],$tctr1
703 sub $len,$len,#3 // bias
704 vmov.32 ${dat2}[3],$tctr2
712 vld1.32 {q8},[$key_],#16
720 vld1.32 {q9},[$key_],#16
731 vld1.8 {$in0},[$inp],#16
734 vorr $dat0,$ivec,$ivec
736 vld1.8 {$in1},[$inp],#16
739 vorr $dat1,$ivec,$ivec
741 vld1.8 {$in2},[$inp],#16
744 vorr $dat2,$ivec,$ivec
749 veor $in0,$in0,$rndlast
754 veor $in1,$in1,$rndlast
759 veor $in2,$in2,$rndlast
762 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
765 vmov.32 ${dat0}[3], $tctr0
770 vmov.32 ${dat1}[3], $tctr1
775 vmov.32 ${dat2}[3], $tctr2
785 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
786 vst1.8 {$in0},[$out],#16
787 vst1.8 {$in1},[$out],#16
788 vst1.8 {$in2},[$out],#16
800 vld1.32 {q8},[$key_],#16
806 vld1.32 {q9},[$key_],#16
819 vld1.8 {$in0},[$inp],$step
831 veor $in0,$in0,$rndlast
834 veor $in1,$in1,$rndlast
841 vst1.8 {$in0},[$out],#16
847 $code.=<<___ if ($flavour !~ /64/);
849 ldmia sp!,{r4-r10,pc}
851 $code.=<<___ if ($flavour =~ /64/);
856 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
862 ########################################
# Flavour-specific post-processing of the accumulated $code text: one branch
# for flavours whose name contains "64", one for everything else.
863 if ($flavour =~ /64/) { ######## 64-bit code
# Base instruction words for the four AES mnemonics; looked up through
# $opcode{$mnemonic} when an instruction is hand-encoded below.
865 "aesd" => 0x4e285800, "aese" => 0x4e284800,
866 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
# Hand-encoder arguments: the mnemonic and its operand text.
869 my ($mnemonic,$arg)=@_;
# Pull the first two register numbers (q- or v-prefixed) out of the operands
# and OR them into the base word: the first as-is, the second shifted left
# by 5. The result is formatted as a .inst directive followed by a // comment.
871 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
872 sprintf ".inst\t0x%08x\t//%s %s",
873 $opcode{$mnemonic}|$1|($2<<5),
# Rewrite $code one line at a time for the 64-bit flavour.
877 foreach(split("\n",$code)) {
# Replace each `...` span with the result of evaluating its contents as Perl.
878 s/\`([^\`]*)\`/eval($1)/geo;
# qN becomes vN.16b, except that N >= 8 is renumbered to N+8 (q8 -> v16.16b).
880 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
# Only the first "@ " on the line is turned into "//".
881 s/@\s/\/\//o; # old->new style commentary
# Disabled: would pass aes* instructions through unaes() in this branch.
883 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
# cclr <reg>,<cond>  ->  csel <reg>,<w|x>zr,<reg>,<cond>
884 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
# mov.<cond> <dst>,<src>  ->  csel <dst>,<src>,<dst>,<cond>
885 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
886 s/vmov\.i8/movi/o or # fix up legacy mnemonics
888 s/vrev32\.8/rev32/o or
891 s/^(\s+)v/$1/o or # strip off v prefix
894 # fix up remaining legacy suffixes
# Lines containing "],#8" get every .16b turned into .8b.
896 m/\],#8/o and s/\.16b/\.8b/go;
# Drop the first .32/.u32/.i32 (resp. .64/.u64/.i64) suffix; if one was
# removed, every .16b on the line becomes .4s (resp. .2d).
897 s/\.[ui]?32//o and s/\.16b/\.4s/go;
898 s/\.[ui]?64//o and s/\.16b/\.2d/go;
# A lane reference such as .4s[3] or .2d[1] loses its element count and
# becomes .s[3] or .d[1].
899 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
903 } else { ######## 32-bit code
# Base instruction words for the four AES mnemonics in this flavour.
905 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
906 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
# Hand-encoder arguments: the mnemonic and its operand text.
909 my ($mnemonic,$arg)=@_;
# Fold the two register numbers into the base word. For the first register,
# the low three bits land at bits 13-15 and bit 3 lands at bit 22; for the
# second, the low three bits land at bits 1-3 and bit 3 lands at bit 5.
911 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
912 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
913 |(($2&7)<<1) |(($2&8)<<2);
914 # since ARMv7 instructions are always encoded little-endian.
915 # correct solution is to use .inst directive, but older
916 # assemblers don't implement it:-(
# The word is written out as four .byte values, least significant byte
# first, followed by an @ comment.
917 sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
918 $word&0xff,($word>>8)&0xff,
919 ($word>>16)&0xff,($word>>24)&0xff,
# A vtbl.8 with q-register operands is split into two d-register vtbl.8
# instructions: d(2a),{qb},d(2c) followed by d(2a+1),{qb},d(2c+1).
927 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
928 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
929 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
# vdup.32 qA,qB[i] is re-expressed with a d-register lane as its source:
# d(2B + i/2)[i mod 2].
935 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
936 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
# vmov.32 qA[i],<rest> is re-expressed with a d-register lane as its
# destination: d(2A + i/2)[i mod 2].
942 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
943 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
# Rewrite $code one line at a time for the 32-bit flavour.
946 foreach(split("\n",$code)) {
# Replace each `...` span with the result of evaluating its contents as Perl.
947 s/\`([^\`]*)\`/eval($1)/geo;
949 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
# NOTE(review): this pattern accepts exactly one digit after "v", so only
# v0-v9 with an arrangement suffix are turned into q registers here.
950 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
# Only the first "//" (and one following space, if any) becomes "@ ".
951 s/\/\/\s?/@ /o; # new->old style commentary
953 # fix up remaining new-style suffixes
# {qN},[addr],#8  ->  {d(2N)},[addr]!
954 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
# aes* instructions are replaced by the output of unaes().
957 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
# cclr <reg>,<cond>  ->  mov<cond> <reg>,#0
958 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
# vtbl.8 / vdup.32 / vmov.32 operands are handed to unvtbl(), unvdup32() and
# unvmov32() respectively, and the whole instruction is replaced by the result.
959 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
960 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
961 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
# A leading "mov." loses its dot (mov.lo -> movlo); a leading "ret" becomes
# "bx lr".
963 s/^(\s+)mov\./$1mov/o or
964 s/^(\s+)ret/$1bx\tlr/o;