3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
12 little-endian cases. It also supports both 32- and 64-bit modes
13 # of operation. Latter is achieved by limiting amount of utilized
14 # registers to 16, which implies additional instructions. This has
15 # no effect on mighty Apple A7, as results are literally equal to
16 # the theoretical estimates based on instruction latencies and issue
17 rate. It remains to be seen how it affects other platforms...
19 # Performance in cycles per byte processed with 128-bit key:
# Redirect STDOUT to the output file named by the first argument.
# Three-arg open avoids mode injection via the filename, and the
# explicit check stops us from silently generating assembly into
# a handle that failed to open.
26 open STDOUT,">",shift or die "can't open output file: $!";
36 $code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
37 $code.=".fpu neon\n.code 32\n" if ($flavour !~ /64/);
39 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
40 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
41 # maintain both 32- and 64-bit codes within single module and
42 transliterate common code to either flavour with regex voodoo.
# Scalar argument/scratch registers for the key-setup routine, written in
# 64-bit (x/w) form; the 32-bit flavour is produced by the new->old
# register regex pass further down.
45 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON working registers. The 32-bit flavour substitutes q8-q10 for q4-q6
# so the routine stays out of q4-q7 (d8-d15), which the 32-bit ABI makes
# callee-saved (cf. the "vstmdb sp!,{d8-d15} @ ABI specification says so"
# prologues elsewhere in this module); 64-bit code can use q0-q6 freely.
46 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
47 	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
53 .long 0x01,0x01,0x01,0x01
54 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
55 .long 0x1b,0x1b,0x1b,0x1b
57 .globl ${prefix}_set_encrypt_key
58 .type ${prefix}_set_encrypt_key,%function
60 ${prefix}_set_encrypt_key:
63 $code.=<<___ if ($flavour =~ /64/);
64 stp x29,x30,[sp,#-16]!
71 veor $zero,$zero,$zero
72 vld1.8 {$in0},[$inp],#16
73 mov $bits,#8 // reuse $bits
74 vld1.32 {$rcon,$mask},[$ptr],#32
82 vtbl.8 $key,{$in0},$mask
83 vext.8 $tmp,$zero,$in0,#12
84 vst1.32 {$in0},[$out],#16
89 vext.8 $tmp,$zero,$tmp,#12
91 vext.8 $tmp,$zero,$tmp,#12
94 vshl.u8 $rcon,$rcon,#1
98 vld1.32 {$rcon},[$ptr]
100 vtbl.8 $key,{$in0},$mask
101 vext.8 $tmp,$zero,$in0,#12
102 vst1.32 {$in0},[$out],#16
106 vext.8 $tmp,$zero,$tmp,#12
108 vext.8 $tmp,$zero,$tmp,#12
111 vshl.u8 $rcon,$rcon,#1
114 vtbl.8 $key,{$in0},$mask
115 vext.8 $tmp,$zero,$in0,#12
116 vst1.32 {$in0},[$out],#16
120 vext.8 $tmp,$zero,$tmp,#12
122 vext.8 $tmp,$zero,$tmp,#12
126 vst1.32 {$in0},[$out]
134 vld1.8 {$in1},[$inp],#8
135 vmov.i8 $key,#8 // borrow $key
136 vst1.32 {$in0},[$out],#16
137 vsub.i8 $mask,$mask,$key // adjust the mask
140 vtbl.8 $key,{$in1},$mask
141 vext.8 $tmp,$zero,$in0,#12
142 vst1.32 {$in1},[$out],#8
147 vext.8 $tmp,$zero,$tmp,#12
149 vext.8 $tmp,$zero,$tmp,#12
152 vdup.32 $tmp,${in0}[3]
155 vext.8 $in1,$zero,$in1,#12
156 vshl.u8 $rcon,$rcon,#1
160 vst1.32 {$in0},[$out],#16
172 vst1.32 {$in0},[$out],#16
175 vtbl.8 $key,{$in1},$mask
176 vext.8 $tmp,$zero,$in0,#12
177 vst1.32 {$in1},[$out],#16
182 vext.8 $tmp,$zero,$tmp,#12
184 vext.8 $tmp,$zero,$tmp,#12
187 vshl.u8 $rcon,$rcon,#1
189 vst1.32 {$in0},[$out],#16
192 vdup.32 $key,${in0}[3] // just splat
193 vext.8 $tmp,$zero,$in1,#12
197 vext.8 $tmp,$zero,$tmp,#12
199 vext.8 $tmp,$zero,$tmp,#12
208 eor x0,x0,x0 // return value
209 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
211 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
213 .globl ${prefix}_set_decrypt_key
214 .type ${prefix}_set_decrypt_key,%function
216 ${prefix}_set_decrypt_key:
218 $code.=<<___ if ($flavour =~ /64/);
219 stp x29,x30,[sp,#-16]!
222 $code.=<<___ if ($flavour !~ /64/);
228 sub $out,$out,#240 // restore original $out
230 add $inp,$out,x12,lsl#4 // end of key schedule
232 vld1.32 {v0.16b},[$out]
233 vld1.32 {v1.16b},[$inp]
234 vst1.32 {v0.16b},[$inp],x4
235 vst1.32 {v1.16b},[$out],#16
238 vld1.32 {v0.16b},[$out]
239 vld1.32 {v1.16b},[$inp]
242 vst1.32 {v0.16b},[$inp],x4
243 vst1.32 {v1.16b},[$out],#16
247 vld1.32 {v0.16b},[$out]
249 vst1.32 {v0.16b},[$inp]
251 eor x0,x0,x0 // return value
253 $code.=<<___ if ($flavour !~ /64/);
256 $code.=<<___ if ($flavour =~ /64/);
261 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
267 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");	# aes$e/aes$mc pick AESE+AESMC for encrypt, AESD+AESIMC for decrypt
268 my ($inp,$out,$key)=map("x$_",(0..2));
270 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
273 .globl ${prefix}_${dir}crypt
274 .type ${prefix}_${dir}crypt,%function
276 ${prefix}_${dir}crypt:
277 ldr $rounds,[$key,#240]
278 vld1.32 {$rndkey0},[$key],#16
279 vld1.8 {$inout},[$inp]
280 sub $rounds,$rounds,#2
281 vld1.32 {$rndkey1},[$key],#16
284 aes$e $inout,$rndkey0
285 vld1.32 {$rndkey0},[$key],#16
287 subs $rounds,$rounds,#2
288 aes$e $inout,$rndkey1
289 vld1.32 {$rndkey1},[$key],#16
293 aes$e $inout,$rndkey0
294 vld1.32 {$rndkey0},[$key]
296 aes$e $inout,$rndkey1
297 veor $inout,$inout,$rndkey0
299 vst1.8 {$inout},[$out]
301 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
308 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
309 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
310 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
312 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
314 ### q8-q15 preloaded key schedule
317 .globl ${prefix}_cbc_encrypt
318 .type ${prefix}_cbc_encrypt,%function
320 ${prefix}_cbc_encrypt:
322 $code.=<<___ if ($flavour =~ /64/);
323 stp x29,x30,[sp,#-16]!
326 $code.=<<___ if ($flavour !~ /64/);
329 vstmdb sp!,{d8-d15} @ ABI specification says so
330 ldmia ip,{r4-r5} @ load remaining args
338 cmp $enc,#0 // en- or decrypting?
339 ldr $rounds,[$key,#240]
341 vld1.8 {$ivec},[$ivp]
342 vld1.8 {$dat},[$inp],$step
344 vld1.32 {q8-q9},[$key] // load key schedule...
345 sub $rounds,$rounds,#6
346 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
347 sub $rounds,$rounds,#2
348 vld1.32 {q10-q11},[$key_],#32
349 vld1.32 {q12-q13},[$key_],#32
350 vld1.32 {q14-q15},[$key_],#32
351 vld1.32 {$rndlast},[$key_]
359 veor $rndzero_n_last,q8,$rndlast
364 vld1.32 {q8},[$key_],#16
368 vld1.32 {q9},[$key_],#16
383 vld1.8 {q8},[$inp],$step
386 veor q8,q8,$rndzero_n_last
389 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
395 veor $ivec,$dat,$rndlast
396 vst1.8 {$ivec},[$out],#16
403 vld1.32 {$in0-$in1},[$key_]
410 vst1.8 {$ivec},[$out],#16
424 vld1.8 {q8},[$inp],$step
431 veor q8,q8,$rndzero_n_last
433 veor $ivec,$dat,$rndlast
434 b.hs .Loop_cbc_enc128
436 vst1.8 {$ivec},[$out],#16
441 vld1.32 {$tmp0-$tmp1},[$key_]
442 veor $ivec,$ivec,$rndlast
443 veor $in0,$dat0,$rndlast
489 veor $ivec,$ivec,$dat0
490 vld1.8 {$dat0},[$inp],$step
492 vld1.8 {$dat1},[$inp],$step1
493 vst1.8 {$ivec},[$out],#16
494 veor $ivec,$in1,$rndlast
495 vst1.8 {$in0},[$out],#16
496 veor $in0,$dat0,$rndlast
497 vorr $in1,$dat1,$dat1
498 b.hs .Loop2x_cbc_dec128
501 veor $ivec,$ivec,$rndlast
503 veor $in0,$in0,$rndlast
514 vld1.8 {$dat1},[$inp],$step
515 vorr $in1,$dat1,$dat1
521 vld1.32 {q8},[$key_],#16
527 vld1.32 {q9},[$key_],#16
536 veor $tmp0,$ivec,$rndlast
537 veor $tmp1,$in0,$rndlast
553 vld1.8 {$in0},[$inp],$step
560 vld1.8 {$in1},[$inp],$step
565 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
570 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
575 veor $tmp0,$tmp0,$dat0
576 veor $tmp1,$tmp1,$dat1
578 vst1.8 {$tmp0},[$out],#16
580 vst1.8 {$tmp1},[$out],#16
588 vld1.32 {q8},[$key_],#16
592 vld1.32 {q9},[$key_],#16
600 veor $tmp,$ivec,$rndlast
615 vst1.8 {$tmp},[$out],#16
618 vst1.8 {$ivec},[$ivp]
621 $code.=<<___ if ($flavour !~ /64/);
625 $code.=<<___ if ($flavour =~ /64/);
630 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
634 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
635 my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
636 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
638 my ($dat,$tmp)=($dat0,$tmp0);
640 ### q8-q15 preloaded key schedule
643 .globl ${prefix}_ctr32_encrypt_blocks
644 .type ${prefix}_ctr32_encrypt_blocks,%function
646 ${prefix}_ctr32_encrypt_blocks:
648 $code.=<<___ if ($flavour =~ /64/);
649 stp x29,x30,[sp,#-16]!
652 $code.=<<___ if ($flavour !~ /64/);
654 stmdb sp!,{r4-r10,lr}
655 vstmdb sp!,{d8-d15} @ ABI specification says so
656 ldr r4, [ip] @ load remaining arg
659 ldr $rounds,[$key,#240]
661 ldr $ctr, [$ivp, #12]
662 vld1.32 {$dat0},[$ivp]
664 vld1.32 {q8-q9},[$key] // load key schedule...
665 sub $rounds,$rounds,#6
666 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
667 sub $rounds,$rounds,#2
668 vld1.32 {q10-q11},[$key_],#32
669 vld1.32 {q12-q13},[$key_],#32
670 vld1.32 {q14-q15},[$key_],#32
671 vld1.32 {$rndlast},[$key_]
682 vorr $dat1,$dat0,$dat0
684 vorr $ivec,$dat0,$dat0
687 vmov.32 ${dat1}[3],$tctr1
693 vld1.32 {q8},[$key_],#16
699 vld1.32 {q9},[$key_],#16
707 vorr $dat0,$ivec,$ivec
709 vorr $dat1,$ivec,$ivec
712 vld1.8 {$in0},[$inp],#16
714 vld1.8 {$in1},[$inp],#16
725 veor $in0,$in0,$rndlast
729 veor $in1,$in1,$rndlast
736 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
741 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
744 vmov.32 ${dat0}[3], $tctr
746 vmov.32 ${dat1}[3], $tctr1
754 vst1.8 {$in0},[$out],#16
755 vst1.8 {$in1},[$out],#16
763 vld1.32 {$tmp0-$tmp1},[$key_]
769 vld1.8 {$in0},[$inp],#16
771 vld1.8 {$in1},[$inp],#16
809 veor $in0,$in0,$rndlast
811 veor $in1,$in1,$rndlast
815 vorr $dat0,$ivec,$ivec
817 vorr $dat1,$ivec,$ivec
818 vst1.8 {$in0},[$out],#16
819 vmov.32 ${dat0}[3], $tctr
820 vst1.8 {$in1},[$out],#16
821 vmov.32 ${dat1}[3], $tctr1
822 b.hs .Loop2x_ctr32_128
829 vld1.32 {q8},[$key_],#16
833 vld1.32 {q9},[$key_],#16
852 veor $in0,$in0,$rndlast
860 $code.=<<___ if ($flavour !~ /64/);
862 ldmia sp!,{r4-r10,pc}
864 $code.=<<___ if ($flavour =~ /64/);
869 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
875 ########################################
876 if ($flavour =~ /64/) { ######## 64-bit code
878 "aesd" => 0x4e285800, "aese" => 0x4e284800,
879 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
882 my ($mnemonic,$arg)=@_;
884 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
885 sprintf ".long\t0x%08x\t//%s %s",
886 $opcode{$mnemonic}|$1|($2<<5),
890 foreach(split("\n",$code)) {
891 s/\`([^\`]*)\`/eval($1)/geo;
893 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
894 s/@\s/\/\//o; # old->new style commentary
896 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
897 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
898 s/vmov\.i8/movi/o or # fix up legacy mnemonics
900 s/vrev32\.8/rev32/o or
903 s/^(\s+)v/$1/o or # strip off v prefix
906 	# fix up remaining legacy suffixes
908 m/\],#8/o and s/\.16b/\.8b/go;
909 s/\.[ui]?32//o and s/\.16b/\.4s/go;
910 s/\.[ui]?64//o and s/\.16b/\.2d/go;
911 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
915 } else { ######## 32-bit code
917 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
918 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
921 my ($mnemonic,$arg)=@_;
923 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
924 sprintf ".long\t0x%08x\t@ %s %s",
925 $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
926 |(($2&7)<<1) |(($2&8)<<2),
933 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
934 sprintf "vtbl.8 d%d,{q%d},d%d\n\tvtbl.8 d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
940 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
941 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
947 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
948 sprintf "vmov.32 d%d[%d],%s",2*$1+$2>>1,$2&1,$3;
951 foreach(split("\n",$code)) {
952 s/\`([^\`]*)\`/eval($1)/geo;
954 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
955 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
956 s/\/\/\s?/@ /o; # new->old style commentary
958 	# fix up remaining new-style suffixes
961 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
962 s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
963 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
964 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
965 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
967 s/^(\s+)ret/$1bx\tlr/o;