2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Locate the perlasm translator arm-xlate.pl: first next to this script,
# then in the shared ../../perlasm directory; fail hard if neither exists.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67 die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through arm-xlate.pl, passing $flavour
# (target assembler dialect) and $output along on its command line.
69 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Integer argument/scratch registers for the key-setup code (AArch64 names;
# they are rewritten to r-registers for the 32-bit flavour in post-processing).
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON scratch registers: q0-q6 in 64-bit mode, q0-q3/q8-q10 in 32-bit mode
# (different sets so each flavour's register constraints are respected).
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
214 vst1.32 {$in1},[$out],#8
219 vext.8 $tmp,$zero,$tmp,#12
221 vext.8 $tmp,$zero,$tmp,#12
224 vdup.32 $tmp,${in0}[3]
227 vext.8 $in1,$zero,$in1,#12
228 vshl.u8 $rcon,$rcon,#1
232 vst1.32 {$in0},[$out],#16
244 vst1.32 {$in0},[$out],#16
247 vtbl.8 $key,{$in1},$mask
248 vext.8 $tmp,$zero,$in0,#12
249 vst1.32 {$in1},[$out],#16
254 vext.8 $tmp,$zero,$tmp,#12
256 vext.8 $tmp,$zero,$tmp,#12
259 vshl.u8 $rcon,$rcon,#1
261 vst1.32 {$in0},[$out],#16
264 vdup.32 $key,${in0}[3] // just splat
265 vext.8 $tmp,$zero,$in1,#12
269 vext.8 $tmp,$zero,$tmp,#12
271 vext.8 $tmp,$zero,$tmp,#12
282 mov x0,$ptr // return value
283 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
285 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
287 .globl ${prefix}_set_decrypt_key
288 .type ${prefix}_set_decrypt_key,%function
290 ${prefix}_set_decrypt_key:
292 $code.=<<___ if ($flavour =~ /64/);
293 .inst 0xd503233f // paciasp
294 stp x29,x30,[sp,#-16]!
297 $code.=<<___ if ($flavour !~ /64/);
306 sub $out,$out,#240 // restore original $out
308 add $inp,$out,x12,lsl#4 // end of key schedule
310 vld1.32 {v0.16b},[$out]
311 vld1.32 {v1.16b},[$inp]
312 vst1.32 {v0.16b},[$inp],x4
313 vst1.32 {v1.16b},[$out],#16
316 vld1.32 {v0.16b},[$out]
317 vld1.32 {v1.16b},[$inp]
320 vst1.32 {v0.16b},[$inp],x4
321 vst1.32 {v1.16b},[$out],#16
325 vld1.32 {v0.16b},[$out]
327 vst1.32 {v0.16b},[$inp]
329 eor x0,x0,x0 // return value
332 $code.=<<___ if ($flavour !~ /64/);
335 $code.=<<___ if ($flavour =~ /64/);
337 .inst 0xd50323bf // autiasp
341 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Per-direction instruction suffixes and registers for the one-shot
# en/decrypt routine: "aese"/"aesmc" when encrypting, "aesd"/"aesimc" when
# decrypting.  NOTE(review): $dir is presumably set by an enclosing loop
# over "en"/"de" that is not visible in this chunk -- confirm in full file.
347 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
348 my ($inp,$out,$key)=map("x$_",(0..2));
350 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
353 .globl ${prefix}_${dir}crypt
354 .type ${prefix}_${dir}crypt,%function
356 ${prefix}_${dir}crypt:
357 ldr $rounds,[$key,#240]
358 vld1.32 {$rndkey0},[$key],#16
359 vld1.8 {$inout},[$inp]
360 sub $rounds,$rounds,#2
361 vld1.32 {$rndkey1},[$key],#16
364 aes$e $inout,$rndkey0
366 vld1.32 {$rndkey0},[$key],#16
367 subs $rounds,$rounds,#2
368 aes$e $inout,$rndkey1
370 vld1.32 {$rndkey1},[$key],#16
373 aes$e $inout,$rndkey0
375 vld1.32 {$rndkey0},[$key]
376 aes$e $inout,$rndkey1
377 veor $inout,$inout,$rndkey0
379 vst1.8 {$inout},[$out]
381 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
388 # Performance in cycles per byte.
389 # Processed with AES-ECB different key size.
390 # It shows the value before and after optimization as below:
393 # AES-128-ECB AES-192-ECB AES-256-ECB
394 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
395 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
397 # Optimization is implemented by loop unrolling and interleaving.
398 # Commonly, we choose the unrolling factor as 5, if the input
399 # data size smaller than 5 blocks, but not smaller than 3 blocks,
400 # choose 3 as the unrolling factor.
401 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
402 # as one iteration, every loop the left size lsize -= 5*16.
403 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
404 # every loop lsize -=3*16.
405 # If lsize < 3*16 bytes, treat them as the tail, interleave the
406 # two blocks AES instructions.
407 # There is one special case, if the original input data size dsize
408 # = 16 bytes, we will treat it separately to improve the
409 # performance: one independent code block without LR, FP load and
410 # store, just looks like what the original ECB implementation does.
# Register aliases for the ECB code path: integer arguments/counters first,
# then the NEON data/temporary registers used by the unrolled round loops.
413 my ($inp,$out,$len,$key)=map("x$_",(0..3));
414 my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
415 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Convenience aliases reusing the registers above for the single-block path.
417 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
419 ### q7 last round key
420 ### q10-q15 q7 Last 7 round keys
421 ### q8-q9 preloaded round keys except last 7 keys for big size
422 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
425 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
427 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
428 my ($dat4,$in4,$tmp4);
429 if ($flavour =~ /64/) {
430 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
434 .globl ${prefix}_ecb_encrypt
435 .type ${prefix}_ecb_encrypt,%function
437 ${prefix}_ecb_encrypt:
439 $code.=<<___ if ($flavour =~ /64/);
441 // Original input data size bigger than 16, jump to big size processing.
443 vld1.8 {$dat0},[$inp]
444 cmp $enc,#0 // en- or decrypting?
445 ldr $rounds,[$key,#240]
446 vld1.32 {q5-q6},[$key],#32 // load key schedule...
451 vld1.32 {q8-q9},[$key],#32 // load key schedule...
454 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
459 vld1.32 {q8},[$key],#16 // load key schedule...
462 vld1.32 {q9},[$key],#16 // load key schedule...
463 subs $rounds,$rounds,#2 // bias
464 b.gt .Lecb_round_loop
466 vld1.32 {q10-q11},[$key],#32 // load key schedule...
471 vld1.32 {q12-q13},[$key],#32 // load key schedule...
476 vld1.32 {q14-q15},[$key],#32 // load key schedule...
481 vld1.32 {$rndlast},[$key]
485 veor $dat0,$dat0,$rndlast
486 vst1.8 {$dat0},[$out]
491 vld1.32 {q8-q9},[$key],#32 // load key schedule...
494 subs $rounds,$rounds,#10 // bias
496 .Lecb_dec_round_loop:
499 vld1.32 {q8},[$key],#16 // load key schedule...
502 vld1.32 {q9},[$key],#16 // load key schedule...
503 subs $rounds,$rounds,#2 // bias
504 b.gt .Lecb_dec_round_loop
506 vld1.32 {q10-q11},[$key],#32 // load key schedule...
511 vld1.32 {q12-q13},[$key],#32 // load key schedule...
516 vld1.32 {q14-q15},[$key],#32 // load key schedule...
521 vld1.32 {$rndlast},[$key]
525 veor $dat0,$dat0,$rndlast
526 vst1.8 {$dat0},[$out]
530 $code.=<<___ if ($flavour =~ /64/);
531 stp x29,x30,[sp,#-16]!
534 $code.=<<___ if ($flavour !~ /64/);
537 vstmdb sp!,{d8-d15} @ ABI specification says so
538 ldmia ip,{r4-r5} @ load remaining args
546 cmp $enc,#0 // en- or decrypting?
547 ldr $rounds,[$key,#240]
549 vld1.8 {$dat},[$inp],$step
551 vld1.32 {q8-q9},[$key] // load key schedule...
552 sub $rounds,$rounds,#6
553 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
554 sub $rounds,$rounds,#2
555 vld1.32 {q10-q11},[$key_],#32
556 vld1.32 {q12-q13},[$key_],#32
557 vld1.32 {q14-q15},[$key_],#32
558 vld1.32 {$rndlast},[$key_]
564 vld1.8 {$dat1},[$inp],#16
565 subs $len,$len,#32 // bias
567 vorr $in1,$dat1,$dat1
568 vorr $dat2,$dat1,$dat1
573 vld1.8 {$dat2},[$inp],#16
575 $code.=<<___ if ($flavour =~ /64/);
579 vld1.8 {$dat3},[$inp],#16
580 vld1.8 {$dat4},[$inp],#16
581 sub $len,$len,#32 // bias
595 vld1.32 {q8},[$key_],#16
607 vld1.32 {q9},[$key_],#16
620 cmp $len,#0x40 // because .Lecb_enc_tail4x
633 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
646 add $inp,$inp,x6 // $inp is adjusted in such way that
647 // at exit from the loop $dat1-$dat4
648 // are loaded with last "words"
649 add x6,$len,#0x60 // because .Lecb_enc_tail4x
696 vld1.8 {$in0},[$inp],#16
698 vld1.8 {$in1},[$inp],#16
700 vld1.8 {$in2},[$inp],#16
702 vld1.8 {$in3},[$inp],#16
704 vld1.8 {$in4},[$inp],#16
705 cbz x6,.Lecb_enc_tail4x
706 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
707 veor $tmp0,$rndlast,$dat0
709 veor $tmp1,$rndlast,$dat1
711 veor $tmp2,$rndlast,$dat2
713 veor $tmp3,$rndlast,$dat3
715 veor $tmp4,$rndlast,$dat4
716 vst1.8 {$tmp0},[$out],#16
718 vst1.8 {$tmp1},[$out],#16
720 vst1.8 {$tmp2},[$out],#16
721 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
722 vst1.8 {$tmp3},[$out],#16
723 vst1.8 {$tmp4},[$out],#16
740 veor $tmp1,$rndlast,$dat1
741 veor $tmp2,$rndlast,$dat2
742 veor $tmp3,$rndlast,$dat3
743 veor $tmp4,$rndlast,$dat4
744 vst1.8 {$tmp1},[$out],#16
745 vst1.8 {$tmp2},[$out],#16
746 vst1.8 {$tmp3},[$out],#16
747 vst1.8 {$tmp4},[$out],#16
760 vld1.32 {q8},[$key_],#16
768 vld1.32 {q9},[$key_],#16
778 mov.lo x6,$len // x6, $cnt, is zero at this point
785 add $inp,$inp,x6 // $inp is adjusted in such way that
786 // at exit from the loop $dat1-$dat2
787 // are loaded with last "words"
795 vld1.8 {$in0},[$inp],#16
802 vld1.8 {$in1},[$inp],#16
809 vld1.8 {$in2},[$inp],#16
813 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
815 veor $tmp0,$rndlast,$dat0
816 veor $tmp1,$rndlast,$dat1
817 veor $dat2,$dat2,$rndlast
818 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
819 vst1.8 {$tmp0},[$out],#16
821 vst1.8 {$tmp1},[$out],#16
823 vst1.8 {$dat2},[$out],#16
836 vld1.32 {q8},[$key_],#16
842 vld1.32 {q9},[$key_],#16
869 veor $tmp1,$rndlast,$dat1
870 veor $tmp2,$rndlast,$dat2
871 vst1.8 {$tmp1},[$out],#16
872 vst1.8 {$tmp2},[$out],#16
876 veor $tmp1,$rndlast,$dat2
877 vst1.8 {$tmp1},[$out],#16
884 vld1.8 {$dat1},[$inp],#16
885 subs $len,$len,#32 // bias
887 vorr $in1,$dat1,$dat1
888 vorr $dat2,$dat1,$dat1
893 vld1.8 {$dat2},[$inp],#16
895 $code.=<<___ if ($flavour =~ /64/);
899 vld1.8 {$dat3},[$inp],#16
900 vld1.8 {$dat4},[$inp],#16
901 sub $len,$len,#32 // bias
915 vld1.32 {q8},[$key_],#16
927 vld1.32 {q9},[$key_],#16
940 cmp $len,#0x40 // because .Lecb_tail4x
953 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
966 add $inp,$inp,x6 // $inp is adjusted in such way that
967 // at exit from the loop $dat1-$dat4
968 // are loaded with last "words"
969 add x6,$len,#0x60 // because .Lecb_tail4x
1016 vld1.8 {$in0},[$inp],#16
1018 vld1.8 {$in1},[$inp],#16
1020 vld1.8 {$in2},[$inp],#16
1022 vld1.8 {$in3},[$inp],#16
1024 vld1.8 {$in4},[$inp],#16
1026 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1027 veor $tmp0,$rndlast,$dat0
1028 vorr $dat0,$in0,$in0
1029 veor $tmp1,$rndlast,$dat1
1030 vorr $dat1,$in1,$in1
1031 veor $tmp2,$rndlast,$dat2
1032 vorr $dat2,$in2,$in2
1033 veor $tmp3,$rndlast,$dat3
1034 vorr $dat3,$in3,$in3
1035 veor $tmp4,$rndlast,$dat4
1036 vst1.8 {$tmp0},[$out],#16
1037 vorr $dat4,$in4,$in4
1038 vst1.8 {$tmp1},[$out],#16
1040 vst1.8 {$tmp2},[$out],#16
1041 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1042 vst1.8 {$tmp3},[$out],#16
1043 vst1.8 {$tmp4},[$out],#16
1044 b.hs .Loop5x_ecb_dec
1050 subs $len,$len,#0x30
1051 vorr $dat0,$in2,$in2
1052 vorr $dat1,$in3,$in3
1053 vorr $dat2,$in4,$in4
1060 veor $tmp1,$rndlast,$dat1
1061 veor $tmp2,$rndlast,$dat2
1062 veor $tmp3,$rndlast,$dat3
1063 veor $tmp4,$rndlast,$dat4
1064 vst1.8 {$tmp1},[$out],#16
1065 vst1.8 {$tmp2},[$out],#16
1066 vst1.8 {$tmp3},[$out],#16
1067 vst1.8 {$tmp4},[$out],#16
1080 vld1.32 {q8},[$key_],#16
1088 vld1.32 {q9},[$key_],#16
1089 b.gt .Loop3x_ecb_dec
1097 subs $len,$len,#0x30
1098 mov.lo x6,$len // x6, $cnt, is zero at this point
1105 add $inp,$inp,x6 // $inp is adjusted in such way that
1106 // at exit from the loop $dat1-$dat2
1107 // are loaded with last "words"
1115 vld1.8 {$in0},[$inp],#16
1122 vld1.8 {$in1},[$inp],#16
1129 vld1.8 {$in2},[$inp],#16
1133 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1135 veor $tmp0,$rndlast,$dat0
1136 veor $tmp1,$rndlast,$dat1
1137 veor $dat2,$dat2,$rndlast
1138 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1139 vst1.8 {$tmp0},[$out],#16
1140 vorr $dat0,$in0,$in0
1141 vst1.8 {$tmp1},[$out],#16
1142 vorr $dat1,$in1,$in1
1143 vst1.8 {$dat2},[$out],#16
1144 vorr $dat2,$in2,$in2
1145 b.hs .Loop3x_ecb_dec
1156 vld1.32 {q8},[$key_],#16
1162 vld1.32 {q9},[$key_],#16
1189 veor $tmp1,$rndlast,$dat1
1190 veor $tmp2,$rndlast,$dat2
1191 vst1.8 {$tmp1},[$out],#16
1192 vst1.8 {$tmp2},[$out],#16
1196 veor $tmp1,$rndlast,$dat2
1197 vst1.8 {$tmp1},[$out],#16
1202 $code.=<<___ if ($flavour !~ /64/);
1204 ldmia sp!,{r4-r8,pc}
1206 $code.=<<___ if ($flavour =~ /64/);
1209 $code.=<<___ if ($flavour =~ /64/);
1214 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register aliases for the CBC code path.  Note $rounds shares w5 with $enc
# and $step1 shares x12 with $key5 -- their live ranges must not overlap.
1218 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1219 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1220 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Convenience aliases reusing the registers above for the single-block path.
1222 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Integer pointers to round keys 4..7 ($key7 aliases the $key argument).
1223 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1225 ### q8-q15 preloaded key schedule
1228 .globl ${prefix}_cbc_encrypt
1229 .type ${prefix}_cbc_encrypt,%function
1231 ${prefix}_cbc_encrypt:
1233 $code.=<<___ if ($flavour =~ /64/);
1234 stp x29,x30,[sp,#-16]!
1237 $code.=<<___ if ($flavour !~ /64/);
1239 stmdb sp!,{r4-r8,lr}
1240 vstmdb sp!,{d8-d15} @ ABI specification says so
1241 ldmia ip,{r4-r5} @ load remaining args
1249 cmp $enc,#0 // en- or decrypting?
1250 ldr $rounds,[$key,#240]
1252 vld1.8 {$ivec},[$ivp]
1253 vld1.8 {$dat},[$inp],$step
1255 vld1.32 {q8-q9},[$key] // load key schedule...
1256 sub $rounds,$rounds,#6
1257 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1258 sub $rounds,$rounds,#2
1259 vld1.32 {q10-q11},[$key_],#32
1260 vld1.32 {q12-q13},[$key_],#32
1261 vld1.32 {q14-q15},[$key_],#32
1262 vld1.32 {$rndlast},[$key_]
1269 veor $dat,$dat,$ivec
1270 veor $rndzero_n_last,q8,$rndlast
1273 vld1.32 {$in0-$in1},[$key_]
1275 add $key4,$key,#16*4
1276 add $key5,$key,#16*5
1279 add $key6,$key,#16*6
1280 add $key7,$key,#16*7
1287 vst1.8 {$ivec},[$out],#16
1293 vld1.32 {q8},[$key4]
1297 vld1.32 {q9},[$key5]
1302 vld1.32 {q8},[$key6]
1305 vld1.32 {q9},[$key7]
1319 vld1.8 {q8},[$inp],$step
1322 veor q8,q8,$rndzero_n_last
1325 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1329 veor $ivec,$dat,$rndlast
1332 vst1.8 {$ivec},[$out],#16
1337 vld1.32 {$in0-$in1},[$key_]
1340 b .Lenter_cbc_enc128
1344 vst1.8 {$ivec},[$out],#16
1358 vld1.8 {q8},[$inp],$step
1365 veor q8,q8,$rndzero_n_last
1367 veor $ivec,$dat,$rndlast
1368 b.hs .Loop_cbc_enc128
1370 vst1.8 {$ivec},[$out],#16
1374 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1376 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1377 my ($dat4,$in4,$tmp4);
1378 if ($flavour =~ /64/) {
1379 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1385 vld1.8 {$dat2},[$inp],#16
1386 subs $len,$len,#32 // bias
1389 vorr $dat1,$dat,$dat
1390 vorr $in2,$dat2,$dat2
1393 vorr $dat1,$dat2,$dat2
1394 vld1.8 {$dat2},[$inp],#16
1396 vorr $in1,$dat1,$dat1
1397 vorr $in2,$dat2,$dat2
1399 $code.=<<___ if ($flavour =~ /64/);
1401 b.lo .Loop3x_cbc_dec
1403 vld1.8 {$dat3},[$inp],#16
1404 vld1.8 {$dat4},[$inp],#16
1405 sub $len,$len,#32 // bias
1407 vorr $in3,$dat3,$dat3
1408 vorr $in4,$dat4,$dat4
1421 vld1.32 {q8},[$key_],#16
1433 vld1.32 {q9},[$key_],#16
1434 b.gt .Loop5x_cbc_dec
1446 cmp $len,#0x40 // because .Lcbc_tail4x
1459 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1472 add $inp,$inp,x6 // $inp is adjusted in such way that
1473 // at exit from the loop $dat1-$dat4
1474 // are loaded with last "words"
1475 add x6,$len,#0x60 // because .Lcbc_tail4x
1521 veor $tmp0,$ivec,$rndlast
1523 veor $tmp1,$in0,$rndlast
1524 vld1.8 {$in0},[$inp],#16
1526 veor $tmp2,$in1,$rndlast
1527 vld1.8 {$in1},[$inp],#16
1529 veor $tmp3,$in2,$rndlast
1530 vld1.8 {$in2},[$inp],#16
1532 veor $tmp4,$in3,$rndlast
1533 vld1.8 {$in3},[$inp],#16
1535 vorr $ivec,$in4,$in4
1536 vld1.8 {$in4},[$inp],#16
1538 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1539 veor $tmp0,$tmp0,$dat0
1540 vorr $dat0,$in0,$in0
1541 veor $tmp1,$tmp1,$dat1
1542 vorr $dat1,$in1,$in1
1543 veor $tmp2,$tmp2,$dat2
1544 vorr $dat2,$in2,$in2
1545 veor $tmp3,$tmp3,$dat3
1546 vorr $dat3,$in3,$in3
1547 veor $tmp4,$tmp4,$dat4
1548 vst1.8 {$tmp0},[$out],#16
1549 vorr $dat4,$in4,$in4
1550 vst1.8 {$tmp1},[$out],#16
1552 vst1.8 {$tmp2},[$out],#16
1553 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1554 vst1.8 {$tmp3},[$out],#16
1555 vst1.8 {$tmp4},[$out],#16
1556 b.hs .Loop5x_cbc_dec
1562 subs $len,$len,#0x30
1563 vorr $dat0,$in2,$in2
1565 vorr $dat1,$in3,$in3
1567 vorr $dat2,$in4,$in4
1575 veor $tmp1,$tmp0,$dat1
1576 veor $tmp2,$tmp2,$dat2
1577 veor $tmp3,$tmp3,$dat3
1578 veor $tmp4,$tmp4,$dat4
1579 vst1.8 {$tmp1},[$out],#16
1580 vst1.8 {$tmp2},[$out],#16
1581 vst1.8 {$tmp3},[$out],#16
1582 vst1.8 {$tmp4},[$out],#16
1595 vld1.32 {q8},[$key_],#16
1603 vld1.32 {q9},[$key_],#16
1604 b.gt .Loop3x_cbc_dec
1612 veor $tmp0,$ivec,$rndlast
1613 subs $len,$len,#0x30
1614 veor $tmp1,$in0,$rndlast
1615 mov.lo x6,$len // x6, $cnt, is zero at this point
1622 veor $tmp2,$in1,$rndlast
1623 add $inp,$inp,x6 // $inp is adjusted in such way that
1624 // at exit from the loop $dat1-$dat2
1625 // are loaded with last "words"
1626 vorr $ivec,$in2,$in2
1634 vld1.8 {$in0},[$inp],#16
1641 vld1.8 {$in1},[$inp],#16
1648 vld1.8 {$in2},[$inp],#16
1652 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1654 veor $tmp0,$tmp0,$dat0
1655 veor $tmp1,$tmp1,$dat1
1656 veor $dat2,$dat2,$tmp2
1657 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1658 vst1.8 {$tmp0},[$out],#16
1659 vorr $dat0,$in0,$in0
1660 vst1.8 {$tmp1},[$out],#16
1661 vorr $dat1,$in1,$in1
1662 vst1.8 {$dat2},[$out],#16
1663 vorr $dat2,$in2,$in2
1664 b.hs .Loop3x_cbc_dec
1675 vld1.32 {q8},[$key_],#16
1681 vld1.32 {q9},[$key_],#16
1701 veor $tmp1,$ivec,$rndlast
1706 veor $tmp2,$in1,$rndlast
1710 veor $tmp1,$tmp1,$dat1
1711 veor $tmp2,$tmp2,$dat2
1712 vorr $ivec,$in2,$in2
1713 vst1.8 {$tmp1},[$out],#16
1714 vst1.8 {$tmp2},[$out],#16
1718 veor $tmp1,$tmp1,$dat2
1719 vorr $ivec,$in2,$in2
1720 vst1.8 {$tmp1},[$out],#16
1723 vst1.8 {$ivec},[$ivp]
1727 $code.=<<___ if ($flavour !~ /64/);
1729 ldmia sp!,{r4-r8,pc}
1731 $code.=<<___ if ($flavour =~ /64/);
1736 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register aliases for the CTR32 code path: integer arguments, counter
# scratch registers, then the NEON data registers.
1740 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1741 my ($rounds,$cnt,$key_)=("w5","w6","x7");
1742 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
# $step shares x12 with $tctr2 -- live ranges must not overlap.
1743 my $step="x12"; # aliases with $tctr2
1745 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1746 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1748 # used only in 64-bit mode...
1749 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
# Convenience aliases reusing the registers above.
1751 my ($dat,$tmp)=($dat0,$tmp0);
1753 ### q8-q15 preloaded key schedule
1756 .globl ${prefix}_ctr32_encrypt_blocks
1757 .type ${prefix}_ctr32_encrypt_blocks,%function
1759 ${prefix}_ctr32_encrypt_blocks:
1761 $code.=<<___ if ($flavour =~ /64/);
1762 stp x29,x30,[sp,#-16]!
1765 $code.=<<___ if ($flavour !~ /64/);
1767 stmdb sp!,{r4-r10,lr}
1768 vstmdb sp!,{d8-d15} @ ABI specification says so
1769 ldr r4, [ip] @ load remaining arg
1772 ldr $rounds,[$key,#240]
1774 ldr $ctr, [$ivp, #12]
1775 vld1.32 {$dat0},[$ivp]
1777 vld1.32 {q8-q9},[$key] // load key schedule...
1778 sub $rounds,$rounds,#4
1781 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1782 sub $rounds,$rounds,#2
1783 vld1.32 {q12-q13},[$key_],#32
1784 vld1.32 {q14-q15},[$key_],#32
1785 vld1.32 {$rndlast},[$key_]
1792 vorr $dat1,$dat0,$dat0
1793 add $tctr1, $ctr, #1
1794 vorr $dat2,$dat0,$dat0
1796 vorr $ivec,$dat0,$dat0
1798 vmov.32 ${dat1}[3],$tctr1
1801 sub $len,$len,#3 // bias
1802 vmov.32 ${dat2}[3],$tctr2
1804 $code.=<<___ if ($flavour =~ /64/);
1810 vorr $dat3,$dat0,$dat0
1812 vorr $dat4,$dat0,$dat0
1814 vmov.32 ${dat3}[3],w13
1815 sub $len,$len,#2 // bias
1816 vmov.32 ${dat4}[3],w14
1832 vld1.32 {q8},[$key_],#16
1844 vld1.32 {q9},[$key_],#16
1858 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1870 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1906 vld1.8 {$in0},[$inp],#16
1909 vld1.8 {$in1},[$inp],#16
1912 vld1.8 {$in2},[$inp],#16
1915 vld1.8 {$in3},[$inp],#16
1918 vld1.8 {$in4},[$inp],#16
1921 veor $in0,$in0,$rndlast
1923 veor $in1,$in1,$rndlast
1925 veor $in2,$in2,$rndlast
1927 veor $in3,$in3,$rndlast
1929 veor $in4,$in4,$rndlast
1931 veor $in0,$in0,$dat0
1932 vorr $dat0,$ivec,$ivec
1933 veor $in1,$in1,$dat1
1934 vorr $dat1,$ivec,$ivec
1935 veor $in2,$in2,$dat2
1936 vorr $dat2,$ivec,$ivec
1937 veor $in3,$in3,$dat3
1938 vorr $dat3,$ivec,$ivec
1939 veor $in4,$in4,$dat4
1940 vorr $dat4,$ivec,$ivec
1942 vst1.8 {$in0},[$out],#16
1943 vmov.32 ${dat0}[3],$tctr0
1944 vst1.8 {$in1},[$out],#16
1945 vmov.32 ${dat1}[3],$tctr1
1946 vst1.8 {$in2},[$out],#16
1947 vmov.32 ${dat2}[3],$tctr2
1948 vst1.8 {$in3},[$out],#16
1949 vmov.32 ${dat3}[3],w13
1950 vst1.8 {$in4},[$out],#16
1951 vmov.32 ${dat4}[3],w14
1954 cbz $len,.Lctr32_done
1968 sub $len,$len,#3 // bias
1982 vld1.32 {q8},[$key_],#16
1990 vld1.32 {q9},[$key_],#16
1997 vld1.8 {$in0},[$inp],#16
1998 vorr $dat0,$ivec,$ivec
2001 vld1.8 {$in1},[$inp],#16
2002 vorr $dat1,$ivec,$ivec
2007 vld1.8 {$in2},[$inp],#16
2011 vorr $dat2,$ivec,$ivec
2017 veor $in0,$in0,$rndlast
2021 veor $in1,$in1,$rndlast
2027 veor $in2,$in2,$rndlast
2031 vmov.32 ${dat0}[3], $tctr0
2037 vmov.32 ${dat1}[3], $tctr1
2041 vmov.32 ${dat2}[3], $tctr2
2047 veor $in0,$in0,$tmp0
2048 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2049 vst1.8 {$in0},[$out],#16
2050 veor $in1,$in1,$tmp1
2052 vst1.8 {$in1},[$out],#16
2053 veor $in2,$in2,$tmp2
2054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2055 vst1.8 {$in2},[$out],#16
2069 vld1.32 {q8},[$key_],#16
2075 vld1.32 {q9},[$key_],#16
2086 vld1.8 {$in0},[$inp],$step
2091 vld1.8 {$in1},[$inp]
2096 veor $in0,$in0,$rndlast
2101 veor $in1,$in1,$rndlast
2106 veor $in0,$in0,$dat0
2107 veor $in1,$in1,$dat1
2108 vst1.8 {$in0},[$out],#16
2110 vst1.8 {$in1},[$out]
2114 $code.=<<___ if ($flavour !~ /64/);
2116 ldmia sp!,{r4-r10,pc}
2118 $code.=<<___ if ($flavour =~ /64/);
2123 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2129 ########################################
2130 if ($flavour =~ /64/) { ######## 64-bit code
2132 "aesd" => 0x4e285800, "aese" => 0x4e284800,
2133 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
2135 local *unaes = sub {
2136 my ($mnemonic,$arg)=@_;
2138 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
2139 sprintf ".inst\t0x%08x\t//%s %s",
2140 $opcode{$mnemonic}|$1|($2<<5),
2144 foreach(split("\n",$code)) {
2145 s/\`([^\`]*)\`/eval($1)/geo;
2147 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
2148 s/@\s/\/\//o; # old->new style commentary
2150 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2151 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
2152 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
2153 s/vmov\.i8/movi/o or # fix up legacy mnemonics
2155 s/vrev32\.8/rev32/o or
2156 s/vtst\.8/cmtst/o or
2158 s/^(\s+)v/$1/o or # strip off v prefix
2159 s/\bbx\s+lr\b/ret/o;
2161 # fix up remaining legacy suffixes
2163 m/\],#8/o and s/\.16b/\.8b/go;
2164 s/\.[ui]?32//o and s/\.16b/\.4s/go;
2165 s/\.[ui]?64//o and s/\.16b/\.2d/go;
2166 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
2170 } else { ######## 32-bit code
2172 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
2173 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
2175 local *unaes = sub {
2176 my ($mnemonic,$arg)=@_;
2178 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
2179 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
2180 |(($2&7)<<1) |(($2&8)<<2);
2181 # since ARMv7 instructions are always encoded little-endian.
2182 # correct solution is to use .inst directive, but older
2183 # assemblers don't implement it:-(
2184 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
2185 $word&0xff,($word>>8)&0xff,
2186 ($word>>16)&0xff,($word>>24)&0xff,
2194 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
2195 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
2196 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
2202 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
2203 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
2209 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
2210 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
2213 foreach(split("\n",$code)) {
2214 s/\`([^\`]*)\`/eval($1)/geo;
2216 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
2217 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
2218 s/\/\/\s?/@ /o; # new->old style commentary
2220 # fix up remaining new-style suffixes
2221 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
2224 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2225 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
2226 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
2227 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
2228 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
2229 s/^(\s+)b\./$1b/o or
2230 s/^(\s+)ret/$1bx\tlr/o;
2232 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {