2 # Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. As does it support both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
# Pick off the conventional CRYPTOGAMS driver arguments: the output file
# is the last argument when it looks like a filename (has an extension),
# and the flavour string is the first argument when it has no dot in it.
$output = (@ARGV && $ARGV[-1] =~ m|\.\w+$|) ? pop(@ARGV) : undef;
$flavour = (@ARGV && $ARGV[0] !~ m|\.|) ? shift(@ARGV) : undef;
# Locate the arm-xlate.pl transliteration helper relative to this
# script's own path ($0): first in the same directory as the script,
# then two levels up under perlasm/.  Dies if neither file exists.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
# Pipe everything we print through arm-xlate.pl, which transliterates the
# mixed 32-/64-bit mnemonics below into the requested $flavour's syntax
# and writes the result to $output.
open OUT,"| \"$^X\" $xlate $flavour \"$output\""
or die "can't call $xlate: $!";
75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
# Register map for the key-setup code: integer arguments and the round
# counter live in x/w registers...
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# ...and the NEON working set is seven q-registers.  64-bit mode uses
# q0-q6; 32-bit mode uses q0-q3,q8-q10 instead — presumably to leave
# d8-d15 untouched (saved elsewhere "as ABI specification says so").
my @qnums = $flavour =~ /64/ ? (0..6) : (0..3, 8..10);
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key) = map { "q$_" } @qnums;
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
215 vst1.32 {$in1},[$out],#16
218 vst1.32 {$in1},[$out],#8
224 vext.8 $tmp,$zero,$tmp,#12
226 vext.8 $tmp,$zero,$tmp,#12
229 vdup.32 $tmp,${in0}[3]
232 vext.8 $in1,$zero,$in1,#12
233 vshl.u8 $rcon,$rcon,#1
237 vst1.32 {$in0},[$out],#16
249 vst1.32 {$in0},[$out],#16
252 vtbl.8 $key,{$in1},$mask
253 vext.8 $tmp,$zero,$in0,#12
254 vst1.32 {$in1},[$out],#16
259 vext.8 $tmp,$zero,$tmp,#12
261 vext.8 $tmp,$zero,$tmp,#12
264 vshl.u8 $rcon,$rcon,#1
266 vst1.32 {$in0},[$out],#16
269 vdup.32 $key,${in0}[3] // just splat
270 vext.8 $tmp,$zero,$in1,#12
274 vext.8 $tmp,$zero,$tmp,#12
276 vext.8 $tmp,$zero,$tmp,#12
287 mov x0,$ptr // return value
288 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
290 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
292 .globl ${prefix}_set_decrypt_key
293 .type ${prefix}_set_decrypt_key,%function
295 ${prefix}_set_decrypt_key:
297 $code.=<<___ if ($flavour =~ /64/);
298 .inst 0xd503233f // paciasp
299 stp x29,x30,[sp,#-16]!
302 $code.=<<___ if ($flavour !~ /64/);
311 sub $out,$out,#240 // restore original $out
313 add $inp,$out,x12,lsl#4 // end of key schedule
315 vld1.32 {v0.16b},[$out]
316 vld1.32 {v1.16b},[$inp]
317 vst1.32 {v0.16b},[$inp],x4
318 vst1.32 {v1.16b},[$out],#16
321 vld1.32 {v0.16b},[$out]
322 vld1.32 {v1.16b},[$inp]
325 vst1.32 {v0.16b},[$inp],x4
326 vst1.32 {v1.16b},[$out],#16
330 vld1.32 {v0.16b},[$out]
332 vst1.32 {v0.16b},[$inp]
334 eor x0,x0,x0 // return value
337 $code.=<<___ if ($flavour !~ /64/);
340 $code.=<<___ if ($flavour =~ /64/);
342 .inst 0xd50323bf // autiasp
346 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Select mnemonic suffixes for this direction.  NOTE(review): $dir is
# supplied by the enclosing generator (outside this view) and is tested
# against "en" here — "e"/"mc" for encrypt (aese/aesmc), "d"/"imc" for
# decrypt (aesd/aesimc).
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# x0-x2: input block pointer, output block pointer, key schedule pointer.
my ($inp,$out,$key)=map("x$_",(0..2));
# NOTE(review): the map yields q0-q3 but only three names are bound;
# the trailing q3 is deliberately dropped.
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
358 .globl ${prefix}_${dir}crypt
359 .type ${prefix}_${dir}crypt,%function
361 ${prefix}_${dir}crypt:
362 ldr $rounds,[$key,#240]
363 vld1.32 {$rndkey0},[$key],#16
364 vld1.8 {$inout},[$inp]
365 sub $rounds,$rounds,#2
366 vld1.32 {$rndkey1},[$key],#16
369 aes$e $inout,$rndkey0
371 vld1.32 {$rndkey0},[$key],#16
372 subs $rounds,$rounds,#2
373 aes$e $inout,$rndkey1
375 vld1.32 {$rndkey1},[$key],#16
378 aes$e $inout,$rndkey0
380 vld1.32 {$rndkey0},[$key]
381 aes$e $inout,$rndkey1
382 veor $inout,$inout,$rndkey0
384 vst1.8 {$inout},[$out]
386 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
393 # Performance in cycles per byte.
394 # Processed with AES-ECB different key size.
395 # It shows the value before and after optimization as below:
398 # AES-128-ECB AES-192-ECB AES-256-ECB
399 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
400 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
402 # Optimization is implemented by loop unrolling and interleaving.
403 # Commonly, we choose the unrolling factor as 5, if the input
404 # data size smaller than 5 blocks, but not smaller than 3 blocks,
405 # choose 3 as the unrolling factor.
406 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
407 # as one iteration, every loop the left size lsize -= 5*16.
408 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
409 # every loop lsize -=3*16.
410 # If lsize < 3*16 bytes, treat them as the tail, interleave the
411 # two blocks AES instructions.
412 # There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
414 # performance: one independent code block without LR, FP load and
415 # store, just looks like what the original ECB implementation does.
# Register map for the ECB code path.
# x0-x3: input, output, length, key schedule pointer.
my ($inp,$out,$len,$key) = map { "x$_" } 0..3;
# w4-w6: enc/dec flag, round counter, block counter; x7: pointer to the
# last round keys; x8: input step.
my ($enc,$rounds,$cnt,$key_,$step) = qw(w4 w5 w6 x7 x8);
# q0-q7: data lanes, staged inputs, scratch, and the last round key.
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast) = map { "q$_" } 0..7;
# Convenience aliases for the single-lane tail code.
my ($dat,$tmp,$rndzero_n_last) = ($dat0,$tmp0,$tmp1);
424 ### q7 last round key
425 ### q10-q15 q7 Last 7 round keys
426 ### q8-q9 preloaded round keys except last 7 keys for big size
427 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Third-lane registers for the 3x-interleaved path.  Note: $tmp2 is
# rebound here to q9, shadowing the earlier q6 binding.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# Fourth and fifth lanes are declared empty here and assigned q16-q23
# just below, in 64-bit mode only (q16+ don't exist in the 32-bit ISA).
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
434 if ($flavour =~ /64/) {
435 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
439 .globl ${prefix}_ecb_encrypt
440 .type ${prefix}_ecb_encrypt,%function
442 ${prefix}_ecb_encrypt:
444 $code.=<<___ if ($flavour =~ /64/);
446 // Original input data size bigger than 16, jump to big size processing.
448 vld1.8 {$dat0},[$inp]
449 cmp $enc,#0 // en- or decrypting?
450 ldr $rounds,[$key,#240]
451 vld1.32 {q5-q6},[$key],#32 // load key schedule...
456 vld1.32 {q8-q9},[$key],#32 // load key schedule...
459 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
464 vld1.32 {q8},[$key],#16 // load key schedule...
467 vld1.32 {q9},[$key],#16 // load key schedule...
468 subs $rounds,$rounds,#2 // bias
469 b.gt .Lecb_round_loop
471 vld1.32 {q10-q11},[$key],#32 // load key schedule...
476 vld1.32 {q12-q13},[$key],#32 // load key schedule...
481 vld1.32 {q14-q15},[$key],#32 // load key schedule...
486 vld1.32 {$rndlast},[$key]
490 veor $dat0,$dat0,$rndlast
491 vst1.8 {$dat0},[$out]
496 vld1.32 {q8-q9},[$key],#32 // load key schedule...
499 subs $rounds,$rounds,#10 // bias
501 .Lecb_dec_round_loop:
504 vld1.32 {q8},[$key],#16 // load key schedule...
507 vld1.32 {q9},[$key],#16 // load key schedule...
508 subs $rounds,$rounds,#2 // bias
509 b.gt .Lecb_dec_round_loop
511 vld1.32 {q10-q11},[$key],#32 // load key schedule...
516 vld1.32 {q12-q13},[$key],#32 // load key schedule...
521 vld1.32 {q14-q15},[$key],#32 // load key schedule...
526 vld1.32 {$rndlast},[$key]
530 veor $dat0,$dat0,$rndlast
531 vst1.8 {$dat0},[$out]
535 $code.=<<___ if ($flavour =~ /64/);
536 stp x29,x30,[sp,#-16]!
539 $code.=<<___ if ($flavour !~ /64/);
542 vstmdb sp!,{d8-d15} @ ABI specification says so
543 ldmia ip,{r4-r5} @ load remaining args
551 cmp $enc,#0 // en- or decrypting?
552 ldr $rounds,[$key,#240]
554 vld1.8 {$dat},[$inp],$step
556 vld1.32 {q8-q9},[$key] // load key schedule...
557 sub $rounds,$rounds,#6
558 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
559 sub $rounds,$rounds,#2
560 vld1.32 {q10-q11},[$key_],#32
561 vld1.32 {q12-q13},[$key_],#32
562 vld1.32 {q14-q15},[$key_],#32
563 vld1.32 {$rndlast},[$key_]
569 vld1.8 {$dat1},[$inp],#16
570 subs $len,$len,#32 // bias
572 vorr $in1,$dat1,$dat1
573 vorr $dat2,$dat1,$dat1
578 vld1.8 {$dat2},[$inp],#16
580 $code.=<<___ if ($flavour =~ /64/);
584 vld1.8 {$dat3},[$inp],#16
585 vld1.8 {$dat4},[$inp],#16
586 sub $len,$len,#32 // bias
600 vld1.32 {q8},[$key_],#16
612 vld1.32 {q9},[$key_],#16
625 cmp $len,#0x40 // because .Lecb_enc_tail4x
638 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
651 add $inp,$inp,x6 // $inp is adjusted in such way that
652 // at exit from the loop $dat1-$dat4
653 // are loaded with last "words"
654 add x6,$len,#0x60 // because .Lecb_enc_tail4x
701 vld1.8 {$in0},[$inp],#16
703 vld1.8 {$in1},[$inp],#16
705 vld1.8 {$in2},[$inp],#16
707 vld1.8 {$in3},[$inp],#16
709 vld1.8 {$in4},[$inp],#16
710 cbz x6,.Lecb_enc_tail4x
711 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
712 veor $tmp0,$rndlast,$dat0
714 veor $tmp1,$rndlast,$dat1
716 veor $tmp2,$rndlast,$dat2
718 veor $tmp3,$rndlast,$dat3
720 veor $tmp4,$rndlast,$dat4
721 vst1.8 {$tmp0},[$out],#16
723 vst1.8 {$tmp1},[$out],#16
725 vst1.8 {$tmp2},[$out],#16
726 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
727 vst1.8 {$tmp3},[$out],#16
728 vst1.8 {$tmp4},[$out],#16
745 veor $tmp1,$rndlast,$dat1
746 veor $tmp2,$rndlast,$dat2
747 veor $tmp3,$rndlast,$dat3
748 veor $tmp4,$rndlast,$dat4
749 vst1.8 {$tmp1},[$out],#16
750 vst1.8 {$tmp2},[$out],#16
751 vst1.8 {$tmp3},[$out],#16
752 vst1.8 {$tmp4},[$out],#16
765 vld1.32 {q8},[$key_],#16
773 vld1.32 {q9},[$key_],#16
783 mov.lo x6,$len // x6, $cnt, is zero at this point
790 add $inp,$inp,x6 // $inp is adjusted in such way that
791 // at exit from the loop $dat1-$dat2
792 // are loaded with last "words"
800 vld1.8 {$in0},[$inp],#16
807 vld1.8 {$in1},[$inp],#16
814 vld1.8 {$in2},[$inp],#16
818 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
820 veor $tmp0,$rndlast,$dat0
821 veor $tmp1,$rndlast,$dat1
822 veor $dat2,$dat2,$rndlast
823 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
824 vst1.8 {$tmp0},[$out],#16
826 vst1.8 {$tmp1},[$out],#16
828 vst1.8 {$dat2},[$out],#16
841 vld1.32 {q8},[$key_],#16
847 vld1.32 {q9},[$key_],#16
874 veor $tmp1,$rndlast,$dat1
875 veor $tmp2,$rndlast,$dat2
876 vst1.8 {$tmp1},[$out],#16
877 vst1.8 {$tmp2},[$out],#16
881 veor $tmp1,$rndlast,$dat2
882 vst1.8 {$tmp1},[$out],#16
889 vld1.8 {$dat1},[$inp],#16
890 subs $len,$len,#32 // bias
892 vorr $in1,$dat1,$dat1
893 vorr $dat2,$dat1,$dat1
898 vld1.8 {$dat2},[$inp],#16
900 $code.=<<___ if ($flavour =~ /64/);
904 vld1.8 {$dat3},[$inp],#16
905 vld1.8 {$dat4},[$inp],#16
906 sub $len,$len,#32 // bias
920 vld1.32 {q8},[$key_],#16
932 vld1.32 {q9},[$key_],#16
945 cmp $len,#0x40 // because .Lecb_tail4x
958 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
971 add $inp,$inp,x6 // $inp is adjusted in such way that
972 // at exit from the loop $dat1-$dat4
973 // are loaded with last "words"
974 add x6,$len,#0x60 // because .Lecb_tail4x
1021 vld1.8 {$in0},[$inp],#16
1023 vld1.8 {$in1},[$inp],#16
1025 vld1.8 {$in2},[$inp],#16
1027 vld1.8 {$in3},[$inp],#16
1029 vld1.8 {$in4},[$inp],#16
1031 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1032 veor $tmp0,$rndlast,$dat0
1033 vorr $dat0,$in0,$in0
1034 veor $tmp1,$rndlast,$dat1
1035 vorr $dat1,$in1,$in1
1036 veor $tmp2,$rndlast,$dat2
1037 vorr $dat2,$in2,$in2
1038 veor $tmp3,$rndlast,$dat3
1039 vorr $dat3,$in3,$in3
1040 veor $tmp4,$rndlast,$dat4
1041 vst1.8 {$tmp0},[$out],#16
1042 vorr $dat4,$in4,$in4
1043 vst1.8 {$tmp1},[$out],#16
1045 vst1.8 {$tmp2},[$out],#16
1046 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1047 vst1.8 {$tmp3},[$out],#16
1048 vst1.8 {$tmp4},[$out],#16
1049 b.hs .Loop5x_ecb_dec
1055 subs $len,$len,#0x30
1056 vorr $dat0,$in2,$in2
1057 vorr $dat1,$in3,$in3
1058 vorr $dat2,$in4,$in4
1065 veor $tmp1,$rndlast,$dat1
1066 veor $tmp2,$rndlast,$dat2
1067 veor $tmp3,$rndlast,$dat3
1068 veor $tmp4,$rndlast,$dat4
1069 vst1.8 {$tmp1},[$out],#16
1070 vst1.8 {$tmp2},[$out],#16
1071 vst1.8 {$tmp3},[$out],#16
1072 vst1.8 {$tmp4},[$out],#16
1085 vld1.32 {q8},[$key_],#16
1093 vld1.32 {q9},[$key_],#16
1094 b.gt .Loop3x_ecb_dec
1102 subs $len,$len,#0x30
1103 mov.lo x6,$len // x6, $cnt, is zero at this point
1110 add $inp,$inp,x6 // $inp is adjusted in such way that
1111 // at exit from the loop $dat1-$dat2
1112 // are loaded with last "words"
1120 vld1.8 {$in0},[$inp],#16
1127 vld1.8 {$in1},[$inp],#16
1134 vld1.8 {$in2},[$inp],#16
1138 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1140 veor $tmp0,$rndlast,$dat0
1141 veor $tmp1,$rndlast,$dat1
1142 veor $dat2,$dat2,$rndlast
1143 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1144 vst1.8 {$tmp0},[$out],#16
1145 vorr $dat0,$in0,$in0
1146 vst1.8 {$tmp1},[$out],#16
1147 vorr $dat1,$in1,$in1
1148 vst1.8 {$dat2},[$out],#16
1149 vorr $dat2,$in2,$in2
1150 b.hs .Loop3x_ecb_dec
1161 vld1.32 {q8},[$key_],#16
1167 vld1.32 {q9},[$key_],#16
1194 veor $tmp1,$rndlast,$dat1
1195 veor $tmp2,$rndlast,$dat2
1196 vst1.8 {$tmp1},[$out],#16
1197 vst1.8 {$tmp2},[$out],#16
1201 veor $tmp1,$rndlast,$dat2
1202 vst1.8 {$tmp1},[$out],#16
1207 $code.=<<___ if ($flavour !~ /64/);
1209 ldmia sp!,{r4-r8,pc}
1211 $code.=<<___ if ($flavour =~ /64/);
1214 $code.=<<___ if ($flavour =~ /64/);
1219 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register map for the CBC code path.
# x0-x4: input, output, length, key schedule, IV pointer; w5: enc flag.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
# Note: $rounds deliberately aliases $enc (both "w5") — the flag is
# consumed before the round count is needed.  $step1 ("x12") likewise
# aliases $key5 below.
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Convenience aliases for the single-lane code.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Pointers to round keys 4-7 for the dedicated 128-bit path; $key7
# reuses the $key register itself.
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1230 ### q8-q15 preloaded key schedule
1233 .globl ${prefix}_cbc_encrypt
1234 .type ${prefix}_cbc_encrypt,%function
1236 ${prefix}_cbc_encrypt:
1238 $code.=<<___ if ($flavour =~ /64/);
1239 stp x29,x30,[sp,#-16]!
1242 $code.=<<___ if ($flavour !~ /64/);
1244 stmdb sp!,{r4-r8,lr}
1245 vstmdb sp!,{d8-d15} @ ABI specification says so
1246 ldmia ip,{r4-r5} @ load remaining args
1254 cmp $enc,#0 // en- or decrypting?
1255 ldr $rounds,[$key,#240]
1257 vld1.8 {$ivec},[$ivp]
1258 vld1.8 {$dat},[$inp],$step
1260 vld1.32 {q8-q9},[$key] // load key schedule...
1261 sub $rounds,$rounds,#6
1262 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1263 sub $rounds,$rounds,#2
1264 vld1.32 {q10-q11},[$key_],#32
1265 vld1.32 {q12-q13},[$key_],#32
1266 vld1.32 {q14-q15},[$key_],#32
1267 vld1.32 {$rndlast},[$key_]
1274 veor $dat,$dat,$ivec
1275 veor $rndzero_n_last,q8,$rndlast
1278 vld1.32 {$in0-$in1},[$key_]
1280 add $key4,$key,#16*4
1281 add $key5,$key,#16*5
1284 add $key6,$key,#16*6
1285 add $key7,$key,#16*7
1292 vst1.8 {$ivec},[$out],#16
1298 vld1.32 {q8},[$key4]
1302 vld1.32 {q9},[$key5]
1307 vld1.32 {q8},[$key6]
1310 vld1.32 {q9},[$key7]
1324 vld1.8 {q8},[$inp],$step
1327 veor q8,q8,$rndzero_n_last
1330 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1334 veor $ivec,$dat,$rndlast
1337 vst1.8 {$ivec},[$out],#16
1342 vld1.32 {$in0-$in1},[$key_]
1345 b .Lenter_cbc_enc128
1349 vst1.8 {$ivec},[$out],#16
1363 vld1.8 {q8},[$inp],$step
1370 veor q8,q8,$rndzero_n_last
1372 veor $ivec,$dat,$rndlast
1373 b.hs .Loop_cbc_enc128
1375 vst1.8 {$ivec},[$out],#16
# Third-lane registers for the 3x-interleaved CBC-decrypt path ($tmp2
# rebound to q9, shadowing any earlier binding).
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# Fourth/fifth lanes: declared empty and assigned q16-q23 just below in
# 64-bit mode only.
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
1383 if ($flavour =~ /64/) {
1384 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1390 vld1.8 {$dat2},[$inp],#16
1391 subs $len,$len,#32 // bias
1394 vorr $dat1,$dat,$dat
1395 vorr $in2,$dat2,$dat2
1398 vorr $dat1,$dat2,$dat2
1399 vld1.8 {$dat2},[$inp],#16
1401 vorr $in1,$dat1,$dat1
1402 vorr $in2,$dat2,$dat2
1404 $code.=<<___ if ($flavour =~ /64/);
1406 b.lo .Loop3x_cbc_dec
1408 vld1.8 {$dat3},[$inp],#16
1409 vld1.8 {$dat4},[$inp],#16
1410 sub $len,$len,#32 // bias
1412 vorr $in3,$dat3,$dat3
1413 vorr $in4,$dat4,$dat4
1426 vld1.32 {q8},[$key_],#16
1438 vld1.32 {q9},[$key_],#16
1439 b.gt .Loop5x_cbc_dec
1451 cmp $len,#0x40 // because .Lcbc_tail4x
1464 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1477 add $inp,$inp,x6 // $inp is adjusted in such way that
1478 // at exit from the loop $dat1-$dat4
1479 // are loaded with last "words"
1480 add x6,$len,#0x60 // because .Lcbc_tail4x
1526 veor $tmp0,$ivec,$rndlast
1528 veor $tmp1,$in0,$rndlast
1529 vld1.8 {$in0},[$inp],#16
1531 veor $tmp2,$in1,$rndlast
1532 vld1.8 {$in1},[$inp],#16
1534 veor $tmp3,$in2,$rndlast
1535 vld1.8 {$in2},[$inp],#16
1537 veor $tmp4,$in3,$rndlast
1538 vld1.8 {$in3},[$inp],#16
1540 vorr $ivec,$in4,$in4
1541 vld1.8 {$in4},[$inp],#16
1543 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1544 veor $tmp0,$tmp0,$dat0
1545 vorr $dat0,$in0,$in0
1546 veor $tmp1,$tmp1,$dat1
1547 vorr $dat1,$in1,$in1
1548 veor $tmp2,$tmp2,$dat2
1549 vorr $dat2,$in2,$in2
1550 veor $tmp3,$tmp3,$dat3
1551 vorr $dat3,$in3,$in3
1552 veor $tmp4,$tmp4,$dat4
1553 vst1.8 {$tmp0},[$out],#16
1554 vorr $dat4,$in4,$in4
1555 vst1.8 {$tmp1},[$out],#16
1557 vst1.8 {$tmp2},[$out],#16
1558 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1559 vst1.8 {$tmp3},[$out],#16
1560 vst1.8 {$tmp4},[$out],#16
1561 b.hs .Loop5x_cbc_dec
1567 subs $len,$len,#0x30
1568 vorr $dat0,$in2,$in2
1570 vorr $dat1,$in3,$in3
1572 vorr $dat2,$in4,$in4
1580 veor $tmp1,$tmp0,$dat1
1581 veor $tmp2,$tmp2,$dat2
1582 veor $tmp3,$tmp3,$dat3
1583 veor $tmp4,$tmp4,$dat4
1584 vst1.8 {$tmp1},[$out],#16
1585 vst1.8 {$tmp2},[$out],#16
1586 vst1.8 {$tmp3},[$out],#16
1587 vst1.8 {$tmp4},[$out],#16
1600 vld1.32 {q8},[$key_],#16
1608 vld1.32 {q9},[$key_],#16
1609 b.gt .Loop3x_cbc_dec
1617 veor $tmp0,$ivec,$rndlast
1618 subs $len,$len,#0x30
1619 veor $tmp1,$in0,$rndlast
1620 mov.lo x6,$len // x6, $cnt, is zero at this point
1627 veor $tmp2,$in1,$rndlast
1628 add $inp,$inp,x6 // $inp is adjusted in such way that
1629 // at exit from the loop $dat1-$dat2
1630 // are loaded with last "words"
1631 vorr $ivec,$in2,$in2
1639 vld1.8 {$in0},[$inp],#16
1646 vld1.8 {$in1},[$inp],#16
1653 vld1.8 {$in2},[$inp],#16
1657 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1659 veor $tmp0,$tmp0,$dat0
1660 veor $tmp1,$tmp1,$dat1
1661 veor $dat2,$dat2,$tmp2
1662 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1663 vst1.8 {$tmp0},[$out],#16
1664 vorr $dat0,$in0,$in0
1665 vst1.8 {$tmp1},[$out],#16
1666 vorr $dat1,$in1,$in1
1667 vst1.8 {$dat2},[$out],#16
1668 vorr $dat2,$in2,$in2
1669 b.hs .Loop3x_cbc_dec
1680 vld1.32 {q8},[$key_],#16
1686 vld1.32 {q9},[$key_],#16
1706 veor $tmp1,$ivec,$rndlast
1711 veor $tmp2,$in1,$rndlast
1715 veor $tmp1,$tmp1,$dat1
1716 veor $tmp2,$tmp2,$dat2
1717 vorr $ivec,$in2,$in2
1718 vst1.8 {$tmp1},[$out],#16
1719 vst1.8 {$tmp2},[$out],#16
1723 veor $tmp1,$tmp1,$dat2
1724 vorr $ivec,$in2,$in2
1725 vst1.8 {$tmp1},[$out],#16
1728 vst1.8 {$ivec},[$ivp]
1732 $code.=<<___ if ($flavour !~ /64/);
1734 ldmia sp!,{r4-r8,pc}
1736 $code.=<<___ if ($flavour =~ /64/);
1741 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register map for the CTR32 code path.
# x0-x4: input, output, block count, key schedule, IV pointer.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
# Counter and its per-lane staging copies (w8-w10, w12).
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# used only in 64-bit mode...
# NOTE(review): the map yields q16-q23 but only four names are bound;
# q20-q23 from the list are dropped.
my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
# Convenience aliases for the single-lane tail code.
my ($dat,$tmp)=($dat0,$tmp0);
1758 ### q8-q15 preloaded key schedule
1761 .globl ${prefix}_ctr32_encrypt_blocks
1762 .type ${prefix}_ctr32_encrypt_blocks,%function
1764 ${prefix}_ctr32_encrypt_blocks:
1766 $code.=<<___ if ($flavour =~ /64/);
1767 stp x29,x30,[sp,#-16]!
1770 $code.=<<___ if ($flavour !~ /64/);
1772 stmdb sp!,{r4-r10,lr}
1773 vstmdb sp!,{d8-d15} @ ABI specification says so
1774 ldr r4, [ip] @ load remaining arg
1777 ldr $rounds,[$key,#240]
1779 ldr $ctr, [$ivp, #12]
1781 vld1.8 {$dat0},[$ivp]
1783 vld1.32 {$dat0},[$ivp]
1785 vld1.32 {q8-q9},[$key] // load key schedule...
1786 sub $rounds,$rounds,#4
1789 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1790 sub $rounds,$rounds,#2
1791 vld1.32 {q12-q13},[$key_],#32
1792 vld1.32 {q14-q15},[$key_],#32
1793 vld1.32 {$rndlast},[$key_]
1800 vorr $dat1,$dat0,$dat0
1801 add $tctr1, $ctr, #1
1802 vorr $dat2,$dat0,$dat0
1804 vorr $ivec,$dat0,$dat0
1806 vmov.32 ${dat1}[3],$tctr1
1809 sub $len,$len,#3 // bias
1810 vmov.32 ${dat2}[3],$tctr2
1812 $code.=<<___ if ($flavour =~ /64/);
1818 vorr $dat3,$dat0,$dat0
1820 vorr $dat4,$dat0,$dat0
1822 vmov.32 ${dat3}[3],w13
1823 sub $len,$len,#2 // bias
1824 vmov.32 ${dat4}[3],w14
1840 vld1.32 {q8},[$key_],#16
1852 vld1.32 {q9},[$key_],#16
1866 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1878 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1914 vld1.8 {$in0},[$inp],#16
1917 vld1.8 {$in1},[$inp],#16
1920 vld1.8 {$in2},[$inp],#16
1923 vld1.8 {$in3},[$inp],#16
1926 vld1.8 {$in4},[$inp],#16
1929 veor $in0,$in0,$rndlast
1931 veor $in1,$in1,$rndlast
1933 veor $in2,$in2,$rndlast
1935 veor $in3,$in3,$rndlast
1937 veor $in4,$in4,$rndlast
1939 veor $in0,$in0,$dat0
1940 vorr $dat0,$ivec,$ivec
1941 veor $in1,$in1,$dat1
1942 vorr $dat1,$ivec,$ivec
1943 veor $in2,$in2,$dat2
1944 vorr $dat2,$ivec,$ivec
1945 veor $in3,$in3,$dat3
1946 vorr $dat3,$ivec,$ivec
1947 veor $in4,$in4,$dat4
1948 vorr $dat4,$ivec,$ivec
1950 vst1.8 {$in0},[$out],#16
1951 vmov.32 ${dat0}[3],$tctr0
1952 vst1.8 {$in1},[$out],#16
1953 vmov.32 ${dat1}[3],$tctr1
1954 vst1.8 {$in2},[$out],#16
1955 vmov.32 ${dat2}[3],$tctr2
1956 vst1.8 {$in3},[$out],#16
1957 vmov.32 ${dat3}[3],w13
1958 vst1.8 {$in4},[$out],#16
1959 vmov.32 ${dat4}[3],w14
1962 cbz $len,.Lctr32_done
1976 sub $len,$len,#3 // bias
1990 vld1.32 {q8},[$key_],#16
1998 vld1.32 {q9},[$key_],#16
2005 vld1.8 {$in0},[$inp],#16
2006 vorr $dat0,$ivec,$ivec
2009 vld1.8 {$in1},[$inp],#16
2010 vorr $dat1,$ivec,$ivec
2015 vld1.8 {$in2},[$inp],#16
2019 vorr $dat2,$ivec,$ivec
2025 veor $in0,$in0,$rndlast
2029 veor $in1,$in1,$rndlast
2035 veor $in2,$in2,$rndlast
2039 vmov.32 ${dat0}[3], $tctr0
2045 vmov.32 ${dat1}[3], $tctr1
2049 vmov.32 ${dat2}[3], $tctr2
2055 veor $in0,$in0,$tmp0
2056 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2057 vst1.8 {$in0},[$out],#16
2058 veor $in1,$in1,$tmp1
2060 vst1.8 {$in1},[$out],#16
2061 veor $in2,$in2,$tmp2
2062 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2063 vst1.8 {$in2},[$out],#16
2077 vld1.32 {q8},[$key_],#16
2083 vld1.32 {q9},[$key_],#16
2094 vld1.8 {$in0},[$inp],$step
2099 vld1.8 {$in1},[$inp]
2104 veor $in0,$in0,$rndlast
2109 veor $in1,$in1,$rndlast
2114 veor $in0,$in0,$dat0
2115 veor $in1,$in1,$dat1
2116 vst1.8 {$in0},[$out],#16
2118 vst1.8 {$in1},[$out]
2122 $code.=<<___ if ($flavour !~ /64/);
2124 ldmia sp!,{r4-r10,pc}
2126 $code.=<<___ if ($flavour =~ /64/);
2131 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2134 # Performance in cycles per byte.
2135 # Processed with AES-XTS different key size.
2136 # It shows the value before and after optimization as below:
2139 # AES-128-XTS AES-256-XTS
2140 # Cortex-A57 3.36/1.09 4.02/1.37
2141 # Cortex-A72 3.03/1.02 3.28/1.33
2143 # Optimization is implemented by loop unrolling and interleaving.
2144 # Commonly, we choose the unrolling factor as 5, if the input
2145 # data size smaller than 5 blocks, but not smaller than 3 blocks,
2146 # choose 3 as the unrolling factor.
2147 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
2148 # as one iteration, every loop the left size lsize -= 5*16.
2149 # If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes
2150 # will be processed specially, which be integrated into the 5*16 bytes
2151 # loop to improve the efficiency.
2152 # There is one special case, if the original input data size dsize
# = 16 bytes, we will treat it separately to improve the
2154 # performance: one independent code block without LR, FP load and
2156 # Encryption will process the (length -tailcnt) bytes as mentioned
2157 # previously, then encrypt the composite block as last second
2159 # Decryption will process the (length -tailcnt -1) bytes as mentioned
2160 # previously, then decrypt the last second cipher block to get the
2161 # last plain block(tail), decrypt the composite block as last second
# Register map for the XTS code path.
# x0-x5: input, output, length, data key (key1), tweak key (key2), IV.
my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
# $tailcnt holds length%16 for ciphertext stealing; $constnum holds the
# GF(2^128) reduction constant used when stepping the tweak.
my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Five in-flight tweak values for the 5x-interleaved loop.
my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
# d-register / lane views of the tweaks, used for the 64-bit halves when
# stepping the tweak in general-purpose registers.
my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
my ($tmpin)=("v26.16b");
# Convenience aliases for the single-lane code.
my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# q10-q15, q7 Last 7 round keys
# q8-q9 preloaded round keys except last 7 keys for big size
# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
# Third-lane registers ($tmp2 rebound to q9, shadowing the q6 binding).
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
# Fourth/fifth lanes: assigned q16-q23 just below, 64-bit mode only.
my ($dat3,$in3,$tmp3); # used only in 64-bit mode
my ($dat4,$in4,$tmp4);
2188 if ($flavour =~ /64/) {
2189 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2192 $code.=<<___ if ($flavour =~ /64/);
2193 .globl ${prefix}_xts_encrypt
2194 .type ${prefix}_xts_encrypt,%function
2196 ${prefix}_xts_encrypt:
2198 $code.=<<___ if ($flavour =~ /64/);
2200 // Original input data size bigger than 16, jump to big size processing.
2201 b.ne .Lxts_enc_big_size
2202 // Encrypt the iv with key2, as the first XEX iv.
2203 ldr $rounds,[$key2,#240]
2204 vld1.8 {$dat},[$key2],#16
2205 vld1.8 {$iv0},[$ivp]
2206 sub $rounds,$rounds,#2
2207 vld1.8 {$dat1},[$key2],#16
2212 vld1.32 {$dat},[$key2],#16
2213 subs $rounds,$rounds,#2
2216 vld1.32 {$dat1},[$key2],#16
2217 b.gt .Loop_enc_iv_enc
2221 vld1.32 {$dat},[$key2]
2225 vld1.8 {$dat0},[$inp]
2226 veor $dat0,$iv0,$dat0
2228 ldr $rounds,[$key1,#240]
2229 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2233 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2236 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
2238 .Lxts_enc_round_loop:
2241 vld1.32 {q8},[$key1],#16 // load key schedule...
2244 vld1.32 {q9},[$key1],#16 // load key schedule...
2245 subs $rounds,$rounds,#2 // bias
2246 b.gt .Lxts_enc_round_loop
2248 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2253 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2258 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2263 vld1.32 {$rndlast},[$key1]
2267 veor $dat0,$dat0,$rndlast
2268 veor $dat0,$dat0,$iv0
2269 vst1.8 {$dat0},[$out]
2270 b .Lxts_enc_final_abort
2275 $code.=<<___ if ($flavour =~ /64/);
2276 stp $constnumx,$tmpinp,[sp,#-64]!
2277 stp $tailcnt,$midnumx,[sp,#48]
2278 stp $ivd10,$ivd20,[sp,#32]
2279 stp $ivd30,$ivd40,[sp,#16]
2281 // tailcnt store the tail value of length%16.
2282 and $tailcnt,$len,#0xf
2287 csel $step,xzr,$step,eq
2289 // Firstly, encrypt the iv with key2, as the first iv of XEX.
2290 ldr $rounds,[$key2,#240]
2291 vld1.32 {$dat},[$key2],#16
2292 vld1.8 {$iv0},[$ivp]
2293 sub $rounds,$rounds,#2
2294 vld1.32 {$dat1},[$key2],#16
2299 vld1.32 {$dat},[$key2],#16
2300 subs $rounds,$rounds,#2
2303 vld1.32 {$dat1},[$key2],#16
2308 vld1.32 {$dat},[$key2]
2312 // The iv for second block
2313 // $ivl- iv(low), $ivh - iv(high)
2314 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2318 extr $midnumx,$ivh,$ivh,#32
2319 extr $ivh,$ivh,$ivl,#63
2320 and $tmpmw,$constnum,$midnum,asr#31
2321 eor $ivl,$tmpmx,$ivl,lsl#1
2325 ldr $rounds0,[$key1,#240] // next starting point
2326 vld1.8 {$dat},[$inp],$step
2328 vld1.32 {q8-q9},[$key1] // load key schedule...
2329 sub $rounds0,$rounds0,#6
2330 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
2331 sub $rounds0,$rounds0,#2
2332 vld1.32 {q10-q11},[$key_],#32
2333 vld1.32 {q12-q13},[$key_],#32
2334 vld1.32 {q14-q15},[$key_],#32
2335 vld1.32 {$rndlast},[$key_]
2338 mov $rounds,$rounds0
2342 vld1.8 {$dat2},[$inp],#16
2343 subs $len,$len,#32 // bias
2344 add $rounds,$rounds0,#2
2346 vorr $dat1,$dat,$dat
2348 vorr $in2,$dat2,$dat2
2349 vorr $in4,$dat2,$dat2
2350 b.lo .Lxts_inner_enc_tail
2351 veor $dat,$dat,$iv0 // before encryption, xor with iv
2352 veor $dat2,$dat2,$iv1
2354 // The iv for third block
2355 extr $midnumx,$ivh,$ivh,#32
2356 extr $ivh,$ivh,$ivl,#63
2357 and $tmpmw,$constnum,$midnum,asr#31
2358 eor $ivl,$tmpmx,$ivl,lsl#1
2363 vorr $dat1,$dat2,$dat2
2364 vld1.8 {$dat2},[$inp],#16
2366 vorr $in1,$dat1,$dat1
2367 veor $in2,$dat2,$iv2 // the third block
2368 veor $dat2,$dat2,$iv2
2370 b.lo .Lxts_outer_enc_tail
2372 // The iv for fourth block
2373 extr $midnumx,$ivh,$ivh,#32
2374 extr $ivh,$ivh,$ivl,#63
2375 and $tmpmw,$constnum,$midnum,asr#31
2376 eor $ivl,$tmpmx,$ivl,lsl#1
2380 vld1.8 {$dat3},[$inp],#16
2381 // The iv for fifth block
2382 extr $midnumx,$ivh,$ivh,#32
2383 extr $ivh,$ivh,$ivl,#63
2384 and $tmpmw,$constnum,$midnum,asr#31
2385 eor $ivl,$tmpmx,$ivl,lsl#1
2389 vld1.8 {$dat4},[$inp],#16
2390 veor $dat3,$dat3,$iv3 // the fourth block
2391 veor $dat4,$dat4,$iv4
2392 sub $len,$len,#32 // bias
2393 mov $rounds,$rounds0
2408 vld1.32 {q8},[$key_],#16
2409 subs $rounds,$rounds,#2
2420 vld1.32 {q9},[$key_],#16
2421 b.gt .Loop5x_xts_enc
2433 subs $len,$len,#0x50 // because .Lxts_enc_tail4x
2445 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
2458 add $inp,$inp,$xoffset // x0 is adjusted in such way that
2459 // at exit from the loop v1.16b-v26.16b
2460 // are loaded with last "words"
2461 add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
2507 veor $tmp0,$rndlast,$iv0
2509 // The iv for first block of one iteration
2510 extr $midnumx,$ivh,$ivh,#32
2511 extr $ivh,$ivh,$ivl,#63
2512 and $tmpmw,$constnum,$midnum,asr#31
2513 eor $ivl,$tmpmx,$ivl,lsl#1
2516 veor $tmp1,$rndlast,$iv1
2517 vld1.8 {$in0},[$inp],#16
2519 // The iv for second block
2520 extr $midnumx,$ivh,$ivh,#32
2521 extr $ivh,$ivh,$ivl,#63
2522 and $tmpmw,$constnum,$midnum,asr#31
2523 eor $ivl,$tmpmx,$ivl,lsl#1
2526 veor $tmp2,$rndlast,$iv2
2527 vld1.8 {$in1},[$inp],#16
2529 // The iv for third block
2530 extr $midnumx,$ivh,$ivh,#32
2531 extr $ivh,$ivh,$ivl,#63
2532 and $tmpmw,$constnum,$midnum,asr#31
2533 eor $ivl,$tmpmx,$ivl,lsl#1
2536 veor $tmp3,$rndlast,$iv3
2537 vld1.8 {$in2},[$inp],#16
2539 // The iv for fourth block
2540 extr $midnumx,$ivh,$ivh,#32
2541 extr $ivh,$ivh,$ivl,#63
2542 and $tmpmw,$constnum,$midnum,asr#31
2543 eor $ivl,$tmpmx,$ivl,lsl#1
2546 veor $tmp4,$rndlast,$iv4
2547 vld1.8 {$in3},[$inp],#16
2550 // The iv for fifth block
2551 extr $midnumx,$ivh,$ivh,#32
2552 extr $ivh,$ivh,$ivl,#63
2553 and $tmpmw,$constnum,$midnum,asr #31
2554 eor $ivl,$tmpmx,$ivl,lsl #1
2558 vld1.8 {$in4},[$inp],#16
2559 cbz $xoffset,.Lxts_enc_tail4x
2560 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2561 veor $tmp0,$tmp0,$dat0
2562 veor $dat0,$in0,$iv0
2563 veor $tmp1,$tmp1,$dat1
2564 veor $dat1,$in1,$iv1
2565 veor $tmp2,$tmp2,$dat2
2566 veor $dat2,$in2,$iv2
2567 veor $tmp3,$tmp3,$dat3
2568 veor $dat3,$in3,$iv3
2569 veor $tmp4,$tmp4,$dat4
2570 vst1.8 {$tmp0},[$out],#16
2571 veor $dat4,$in4,$iv4
2572 vst1.8 {$tmp1},[$out],#16
2573 mov $rounds,$rounds0
2574 vst1.8 {$tmp2},[$out],#16
2575 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2576 vst1.8 {$tmp3},[$out],#16
2577 vst1.8 {$tmp4},[$out],#16
2578 b.hs .Loop5x_xts_enc
2581 // If left 4 blocks, borrow the five block's processing.
2583 b.ne .Loop5x_enc_after
2590 veor $dat0,$iv0,$in0
2591 veor $dat1,$iv1,$in1
2592 veor $dat2,$in2,$iv2
2593 veor $dat3,$in3,$iv3
2594 veor $dat4,$in4,$iv4
2595 b.eq .Loop5x_xts_enc
2599 cbz $len,.Lxts_enc_done
2601 add $rounds,$rounds0,#2
2602 subs $len,$len,#0x30
2603 b.lo .Lxts_inner_enc_tail
2605 veor $dat0,$iv0,$in2
2606 veor $dat1,$iv1,$in3
2607 veor $dat2,$in4,$iv2
2608 b .Lxts_outer_enc_tail
2613 veor $tmp1,$dat1,$tmp1
2614 vst1.8 {$tmp1},[$out],#16
2615 veor $tmp2,$dat2,$tmp2
2616 vst1.8 {$tmp2},[$out],#16
2617 veor $tmp3,$dat3,$tmp3
2618 veor $tmp4,$dat4,$tmp4
2619 vst1.8 {$tmp3-$tmp4},[$out],#32
2623 .Lxts_outer_enc_tail:
2630 vld1.32 {q8},[$key_],#16
2631 subs $rounds,$rounds,#2
2638 vld1.32 {q9},[$key_],#16
2639 b.gt .Lxts_outer_enc_tail
2647 veor $tmp0,$iv0,$rndlast
2648 subs $len,$len,#0x30
2649 // The iv for first block
2652 //mov $constnum,#0x87
2653 extr $midnumx,$ivh,$ivh,#32
2654 extr $ivh,$ivh,$ivl,#63
2655 and $tmpmw,$constnum,$midnum,asr#31
2656 eor $ivl,$tmpmx,$ivl,lsl#1
2659 veor $tmp1,$iv1,$rndlast
2660 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
2667 veor $tmp2,$iv2,$rndlast
2669 add $xoffset,$xoffset,#0x20
2670 add $inp,$inp,$xoffset
2694 vld1.8 {$in2},[$inp],#16
2695 add $rounds,$rounds0,#2
2696 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2697 veor $tmp0,$tmp0,$dat0
2698 veor $tmp1,$tmp1,$dat1
2699 veor $dat2,$dat2,$tmp2
2700 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2701 vst1.8 {$tmp0},[$out],#16
2702 vst1.8 {$tmp1},[$out],#16
2703 vst1.8 {$dat2},[$out],#16
2711 .Lxts_inner_enc_tail:
2713 veor $dat1,$in3,$iv0
2714 veor $dat2,$in4,$iv1
2715 b.eq .Lxts_enc_tail_loop
2716 veor $dat2,$in4,$iv0
2717 .Lxts_enc_tail_loop:
2722 vld1.32 {q8},[$key_],#16
2723 subs $rounds,$rounds,#2
2728 vld1.32 {q9},[$key_],#16
2729 b.gt .Lxts_enc_tail_loop
2748 veor $tmp1,$iv0,$rndlast
2753 veor $tmp2,$iv1,$rndlast
2757 veor $tmp1,$tmp1,$dat1
2758 vst1.8 {$tmp1},[$out],#16
2759 veor $tmp2,$tmp2,$dat2
2761 vst1.8 {$tmp2},[$out],#16
2765 extr $midnumx,$ivh,$ivh,#32
2766 extr $ivh,$ivh,$ivl,#63
2767 and $tmpmw,$constnum,$midnum,asr #31
2768 eor $ivl,$tmpmx,$ivl,lsl #1
2774 veor $tmp1,$tmp1,$dat2
2776 vst1.8 {$tmp1},[$out],#16
2780 extr $midnumx,$ivh,$ivh,#32
2781 extr $ivh,$ivh,$ivl,#63
2782 and $tmpmw,$constnum,$midnum,asr #31
2783 eor $ivl,$tmpmx,$ivl,lsl #1
2789 // Process the tail block with cipher stealing.
2796 .composite_enc_loop:
2797 subs $tailcnt,$tailcnt,#1
2798 ldrb $l2outp,[$out,$tailcnt]
2799 ldrb $loutp,[$tmpinp,$tailcnt]
2800 strb $l2outp,[$tmpoutp,$tailcnt]
2801 strb $loutp,[$out,$tailcnt]
2802 b.gt .composite_enc_loop
2803 .Lxts_enc_load_done:
2804 vld1.8 {$tmpin},[$out]
2805 veor $tmpin,$tmpin,$iv0
2807 // Encrypt the composite block to get the second-to-last encrypted text block
2808 ldr $rounds,[$key1,#240] // load key schedule...
2809 vld1.8 {$dat},[$key1],#16
2810 sub $rounds,$rounds,#2
2811 vld1.8 {$dat1},[$key1],#16 // load key schedule...
2815 vld1.32 {$dat0},[$key1],#16
2816 subs $rounds,$rounds,#2
2819 vld1.32 {$dat1},[$key1],#16
2820 b.gt .Loop_final_enc
2824 vld1.32 {$dat0},[$key1]
2826 veor $tmpin,$tmpin,$dat0
2827 veor $tmpin,$tmpin,$iv0
2828 vst1.8 {$tmpin},[$out]
2831 ldp $tailcnt,$midnumx,[sp,#48]
2832 ldp $ivd10,$ivd20,[sp,#32]
2833 ldp $ivd30,$ivd40,[sp,#16]
2834 ldp $constnumx,$tmpinp,[sp],#64
2835 .Lxts_enc_final_abort:
2837 .size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
2842 my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
2843 my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
2844 my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
2845 my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
2846 my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
2847 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
2848 my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
2849 my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
2850 my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
2852 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
2855 # q10-q15, q7 Last 7 round keys
2856 # q8-q9 preloaded round keys except last 7 keys for big size
2857 # q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
2860 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
2862 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
2863 my ($dat4,$in4,$tmp4);
2864 if ($flavour =~ /64/) {
2865 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
2868 $code.=<<___ if ($flavour =~ /64/);
2869 .globl ${prefix}_xts_decrypt
2870 .type ${prefix}_xts_decrypt,%function
2872 ${prefix}_xts_decrypt:
2874 $code.=<<___ if ($flavour =~ /64/);
2876 // Original input data size bigger than 16, jump to big size processing.
2877 b.ne .Lxts_dec_big_size
2878 // Encrypt the iv with key2, as the first XEX iv.
2879 ldr $rounds,[$key2,#240]
2880 vld1.8 {$dat},[$key2],#16
2881 vld1.8 {$iv0},[$ivp]
2882 sub $rounds,$rounds,#2
2883 vld1.8 {$dat1},[$key2],#16
2885 .Loop_dec_small_iv_enc:
2888 vld1.32 {$dat},[$key2],#16
2889 subs $rounds,$rounds,#2
2892 vld1.32 {$dat1},[$key2],#16
2893 b.gt .Loop_dec_small_iv_enc
2897 vld1.32 {$dat},[$key2]
2901 vld1.8 {$dat0},[$inp]
2902 veor $dat0,$iv0,$dat0
2904 ldr $rounds,[$key1,#240]
2905 vld1.32 {q20-q21},[$key1],#32 // load key schedule...
2909 vld1.32 {q8-q9},[$key1],#32 // load key schedule...
2912 subs $rounds,$rounds,#10 // bias
2914 .Lxts_dec_round_loop:
2917 vld1.32 {q8},[$key1],#16 // load key schedule...
2920 vld1.32 {q9},[$key1],#16 // load key schedule...
2921 subs $rounds,$rounds,#2 // bias
2922 b.gt .Lxts_dec_round_loop
2924 vld1.32 {q10-q11},[$key1],#32 // load key schedule...
2929 vld1.32 {q12-q13},[$key1],#32 // load key schedule...
2934 vld1.32 {q14-q15},[$key1],#32 // load key schedule...
2939 vld1.32 {$rndlast},[$key1]
2943 veor $dat0,$dat0,$rndlast
2944 veor $dat0,$iv0,$dat0
2945 vst1.8 {$dat0},[$out]
2946 b .Lxts_dec_final_abort
2949 $code.=<<___ if ($flavour =~ /64/);
2950 stp $constnumx,$tmpinp,[sp,#-64]!
2951 stp $tailcnt,$midnumx,[sp,#48]
2952 stp $ivd10,$ivd20,[sp,#32]
2953 stp $ivd30,$ivd40,[sp,#16]
2955 and $tailcnt,$len,#0xf
2959 b.lo .Lxts_dec_abort
2961 // Encrypt the iv with key2, as the first XEX iv
2962 ldr $rounds,[$key2,#240]
2963 vld1.8 {$dat},[$key2],#16
2964 vld1.8 {$iv0},[$ivp]
2965 sub $rounds,$rounds,#2
2966 vld1.8 {$dat1},[$key2],#16
2971 vld1.32 {$dat},[$key2],#16
2972 subs $rounds,$rounds,#2
2975 vld1.32 {$dat1},[$key2],#16
2976 b.gt .Loop_dec_iv_enc
2980 vld1.32 {$dat},[$key2]
2984 // The iv for second block
2985 // $ivl- iv(low), $ivh - iv(high)
2986 // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4
2990 extr $midnumx,$ivh,$ivh,#32
2991 extr $ivh,$ivh,$ivl,#63
2992 and $tmpmw,$constnum,$midnum,asr #31
2993 eor $ivl,$tmpmx,$ivl,lsl #1
2997 ldr $rounds0,[$key1,#240] // load rounds number
2999 // The iv for third block
3000 extr $midnumx,$ivh,$ivh,#32
3001 extr $ivh,$ivh,$ivl,#63
3002 and $tmpmw,$constnum,$midnum,asr #31
3003 eor $ivl,$tmpmx,$ivl,lsl #1
3007 vld1.32 {q8-q9},[$key1] // load key schedule...
3008 sub $rounds0,$rounds0,#6
3009 add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
3010 sub $rounds0,$rounds0,#2
3011 vld1.32 {q10-q11},[$key_],#32 // load key schedule...
3012 vld1.32 {q12-q13},[$key_],#32
3013 vld1.32 {q14-q15},[$key_],#32
3014 vld1.32 {$rndlast},[$key_]
3016 // The iv for fourth block
3017 extr $midnumx,$ivh,$ivh,#32
3018 extr $ivh,$ivh,$ivl,#63
3019 and $tmpmw,$constnum,$midnum,asr #31
3020 eor $ivl,$tmpmx,$ivl,lsl #1
3025 mov $rounds,$rounds0
3032 b.eq .Lxts_dec_begin
3034 csel $step,xzr,$step,eq
3035 vld1.8 {$dat},[$inp],#16
3039 vld1.8 {$dat},[$inp],$step
3040 subs $len,$len,#32 // bias
3041 add $rounds,$rounds0,#2
3043 vorr $dat1,$dat,$dat
3045 vld1.8 {$dat2},[$inp],#16
3046 vorr $in2,$dat2,$dat2
3047 vorr $in4,$dat2,$dat2
3048 b.lo .Lxts_inner_dec_tail
3049 veor $dat,$dat,$iv0 // before decrypt, xor with iv
3050 veor $dat2,$dat2,$iv1
3052 vorr $dat1,$dat2,$dat2
3053 vld1.8 {$dat2},[$inp],#16
3055 vorr $in1,$dat1,$dat1
3056 veor $in2,$dat2,$iv2 // third block xor with third iv
3057 veor $dat2,$dat2,$iv2
3059 b.lo .Lxts_outer_dec_tail
3061 vld1.8 {$dat3},[$inp],#16
3063 // The iv for fifth block
3064 extr $midnumx,$ivh,$ivh,#32
3065 extr $ivh,$ivh,$ivl,#63
3066 and $tmpmw,$constnum,$midnum,asr #31
3067 eor $ivl,$tmpmx,$ivl,lsl #1
3071 vld1.8 {$dat4},[$inp],#16
3072 veor $dat3,$dat3,$iv3 // the fourth block
3073 veor $dat4,$dat4,$iv4
3074 sub $len,$len,#32 // bias
3075 mov $rounds,$rounds0
3090 vld1.32 {q8},[$key_],#16 // load key schedule...
3091 subs $rounds,$rounds,#2
3102 vld1.32 {q9},[$key_],#16 // load key schedule...
3103 b.gt .Loop5x_xts_dec
3115 subs $len,$len,#0x50 // because .Lxts_dec_tail4x
3127 csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo
3140 add $inp,$inp,$xoffset // x0 is adjusted in such way that
3141 // at exit from the loop v1.16b-v26.16b
3142 // are loaded with last "words"
3143 add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
3189 veor $tmp0,$rndlast,$iv0
3191 // The iv for first block of next iteration.
3192 extr $midnumx,$ivh,$ivh,#32
3193 extr $ivh,$ivh,$ivl,#63
3194 and $tmpmw,$constnum,$midnum,asr #31
3195 eor $ivl,$tmpmx,$ivl,lsl #1
3198 veor $tmp1,$rndlast,$iv1
3199 vld1.8 {$in0},[$inp],#16
3201 // The iv for second block
3202 extr $midnumx,$ivh,$ivh,#32
3203 extr $ivh,$ivh,$ivl,#63
3204 and $tmpmw,$constnum,$midnum,asr #31
3205 eor $ivl,$tmpmx,$ivl,lsl #1
3208 veor $tmp2,$rndlast,$iv2
3209 vld1.8 {$in1},[$inp],#16
3211 // The iv for third block
3212 extr $midnumx,$ivh,$ivh,#32
3213 extr $ivh,$ivh,$ivl,#63
3214 and $tmpmw,$constnum,$midnum,asr #31
3215 eor $ivl,$tmpmx,$ivl,lsl #1
3218 veor $tmp3,$rndlast,$iv3
3219 vld1.8 {$in2},[$inp],#16
3221 // The iv for fourth block
3222 extr $midnumx,$ivh,$ivh,#32
3223 extr $ivh,$ivh,$ivl,#63
3224 and $tmpmw,$constnum,$midnum,asr #31
3225 eor $ivl,$tmpmx,$ivl,lsl #1
3228 veor $tmp4,$rndlast,$iv4
3229 vld1.8 {$in3},[$inp],#16
3232 // The iv for fifth block
3233 extr $midnumx,$ivh,$ivh,#32
3234 extr $ivh,$ivh,$ivl,#63
3235 and $tmpmw,$constnum,$midnum,asr #31
3236 eor $ivl,$tmpmx,$ivl,lsl #1
3240 vld1.8 {$in4},[$inp],#16
3241 cbz $xoffset,.Lxts_dec_tail4x
3242 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3243 veor $tmp0,$tmp0,$dat0
3244 veor $dat0,$in0,$iv0
3245 veor $tmp1,$tmp1,$dat1
3246 veor $dat1,$in1,$iv1
3247 veor $tmp2,$tmp2,$dat2
3248 veor $dat2,$in2,$iv2
3249 veor $tmp3,$tmp3,$dat3
3250 veor $dat3,$in3,$iv3
3251 veor $tmp4,$tmp4,$dat4
3252 vst1.8 {$tmp0},[$out],#16
3253 veor $dat4,$in4,$iv4
3254 vst1.8 {$tmp1},[$out],#16
3255 mov $rounds,$rounds0
3256 vst1.8 {$tmp2},[$out],#16
3257 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3258 vst1.8 {$tmp3},[$out],#16
3259 vst1.8 {$tmp4},[$out],#16
3260 b.hs .Loop5x_xts_dec
3263 b.ne .Loop5x_dec_after
3264 // If x2($len) equal to -0x10, the left blocks is 4.
3265 // After specially processing, utilize the five blocks processing again.
3266 // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
3273 veor $dat0,$iv0,$in0
3274 veor $dat1,$iv1,$in1
3275 veor $dat2,$in2,$iv2
3276 veor $dat3,$in3,$iv3
3277 veor $dat4,$in4,$iv4
3278 b.eq .Loop5x_xts_dec
3284 add $rounds,$rounds0,#2
3285 subs $len,$len,#0x30
3286 b.lo .Lxts_inner_dec_tail
3288 veor $dat0,$iv0,$in2
3289 veor $dat1,$iv1,$in3
3290 veor $dat2,$in4,$iv2
3291 b .Lxts_outer_dec_tail
3296 vld1.32 {$dat0},[$inp],#16
3297 veor $tmp1,$dat1,$tmp0
3298 vst1.8 {$tmp1},[$out],#16
3299 veor $tmp2,$dat2,$tmp2
3300 vst1.8 {$tmp2},[$out],#16
3301 veor $tmp3,$dat3,$tmp3
3302 veor $tmp4,$dat4,$tmp4
3303 vst1.8 {$tmp3-$tmp4},[$out],#32
3307 .Lxts_outer_dec_tail:
3314 vld1.32 {q8},[$key_],#16
3315 subs $rounds,$rounds,#2
3322 vld1.32 {q9},[$key_],#16
3323 b.gt .Lxts_outer_dec_tail
3331 veor $tmp0,$iv0,$rndlast
3332 subs $len,$len,#0x30
3333 // The iv for first block
3337 extr $midnumx,$ivh,$ivh,#32
3338 extr $ivh,$ivh,$ivl,#63
3339 and $tmpmw,$constnum,$midnum,asr #31
3340 eor $ivl,$tmpmx,$ivl,lsl #1
3343 veor $tmp1,$iv1,$rndlast
3344 csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
3351 veor $tmp2,$iv2,$rndlast
3352 // The iv for second block
3353 extr $midnumx,$ivh,$ivh,#32
3354 extr $ivh,$ivh,$ivl,#63
3355 and $tmpmw,$constnum,$midnum,asr #31
3356 eor $ivl,$tmpmx,$ivl,lsl #1
3360 add $xoffset,$xoffset,#0x20
3361 add $inp,$inp,$xoffset // $inp is adjusted to the last data
3365 // The iv for third block
3366 extr $midnumx,$ivh,$ivh,#32
3367 extr $ivh,$ivh,$ivl,#63
3368 and $tmpmw,$constnum,$midnum,asr #31
3369 eor $ivl,$tmpmx,$ivl,lsl #1
3391 vld1.8 {$in2},[$inp],#16
3395 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
3396 add $rounds,$rounds0,#2
3397 veor $tmp0,$tmp0,$dat0
3398 veor $tmp1,$tmp1,$dat1
3399 veor $dat2,$dat2,$tmp2
3400 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
3401 vst1.8 {$tmp0},[$out],#16
3402 vst1.8 {$tmp1},[$out],#16
3403 vst1.8 {$dat2},[$out],#16
3413 .Lxts_inner_dec_tail:
3414 // $len == -0x10 means two blocks left.
3416 veor $dat1,$in3,$iv0
3417 veor $dat2,$in4,$iv1
3418 b.eq .Lxts_dec_tail_loop
3419 veor $dat2,$in4,$iv0
3420 .Lxts_dec_tail_loop:
3425 vld1.32 {q8},[$key_],#16
3426 subs $rounds,$rounds,#2
3431 vld1.32 {q9},[$key_],#16
3432 b.gt .Lxts_dec_tail_loop
3451 veor $tmp1,$iv0,$rndlast
3456 veor $tmp2,$iv1,$rndlast
3460 veor $tmp1,$tmp1,$dat1
3461 veor $tmp2,$tmp2,$dat2
3464 vst1.8 {$tmp1},[$out],#16
3465 vst1.8 {$tmp2},[$out],#16
3470 veor $tmp1,$tmp1,$dat2
3473 vst1.8 {$tmp1},[$out],#16
3478 b.eq .Lxts_dec_abort
3479 // Processing the last two blocks with cipher stealing.
3481 cbnz x2,.Lxts_dec_1st_done
3482 vld1.32 {$dat0},[$inp],#16
3484 // Decrypt the second-to-last block to get the last plain text block
3486 eor $tmpin,$dat0,$iv1
3487 ldr $rounds,[$key1,#240]
3488 vld1.32 {$dat0},[$key1],#16
3489 sub $rounds,$rounds,#2
3490 vld1.32 {$dat1},[$key1],#16
3491 .Loop_final_2nd_dec:
3493 aesimc $tmpin,$tmpin
3494 vld1.32 {$dat0},[$key1],#16 // load key schedule...
3495 subs $rounds,$rounds,#2
3497 aesimc $tmpin,$tmpin
3498 vld1.32 {$dat1},[$key1],#16 // load key schedule...
3499 b.gt .Loop_final_2nd_dec
3502 aesimc $tmpin,$tmpin
3503 vld1.32 {$dat0},[$key1]
3505 veor $tmpin,$tmpin,$dat0
3506 veor $tmpin,$tmpin,$iv1
3507 vst1.8 {$tmpin},[$out]
3510 add $tmpoutp,$out,#16
3512 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks
3513 // to get the last encrypted block.
3514 .composite_dec_loop:
3515 subs $tailcnt,$tailcnt,#1
3516 ldrb $l2outp,[$out,$tailcnt]
3517 ldrb $loutp,[$tmpinp,$tailcnt]
3518 strb $l2outp,[$tmpoutp,$tailcnt]
3519 strb $loutp,[$out,$tailcnt]
3520 b.gt .composite_dec_loop
3521 .Lxts_dec_load_done:
3522 vld1.8 {$tmpin},[$out]
3523 veor $tmpin,$tmpin,$iv0
3525 // Decrypt the composite block to get the second-to-last plain text block
3526 ldr $rounds,[$key_,#240]
3527 vld1.8 {$dat},[$key_],#16
3528 sub $rounds,$rounds,#2
3529 vld1.8 {$dat1},[$key_],#16
3532 aesimc $tmpin,$tmpin
3533 vld1.32 {$dat0},[$key_],#16 // load key schedule...
3534 subs $rounds,$rounds,#2
3536 aesimc $tmpin,$tmpin
3537 vld1.32 {$dat1},[$key_],#16 // load key schedule...
3538 b.gt .Loop_final_dec
3541 aesimc $tmpin,$tmpin
3542 vld1.32 {$dat0},[$key_]
3544 veor $tmpin,$tmpin,$dat0
3545 veor $tmpin,$tmpin,$iv0
3546 vst1.8 {$tmpin},[$out]
3549 ldp $tailcnt,$midnumx,[sp,#48]
3550 ldp $ivd10,$ivd20,[sp,#32]
3551 ldp $ivd30,$ivd40,[sp,#16]
3552 ldp $constnumx,$tmpinp,[sp],#64
3554 .Lxts_dec_final_abort:
3556 .size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
3563 ########################################
3564 if ($flavour =~ /64/) { ######## 64-bit code
3566 "aesd" => 0x4e285800, "aese" => 0x4e284800,
3567 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
3569 local *unaes = sub {
3570 my ($mnemonic,$arg)=@_;
3572 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
3573 sprintf ".inst\t0x%08x\t//%s %s",
3574 $opcode{$mnemonic}|$1|($2<<5),
3578 foreach(split("\n",$code)) {
3579 s/\`([^\`]*)\`/eval($1)/geo;
3581 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
3582 s/@\s/\/\//o; # old->new style commentary
3584 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3585 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
3586 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
3587 s/vmov\.i8/movi/o or # fix up legacy mnemonics
3589 s/vrev32\.8/rev32/o or
3590 s/vtst\.8/cmtst/o or
3592 s/^(\s+)v/$1/o or # strip off v prefix
3593 s/\bbx\s+lr\b/ret/o;
3595 # fix up remaining legacy suffixes
3597 m/\],#8/o and s/\.16b/\.8b/go;
3598 s/\.[ui]?32//o and s/\.16b/\.4s/go;
3599 s/\.[ui]?64//o and s/\.16b/\.2d/go;
3600 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
3604 } else { ######## 32-bit code
3606 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
3607 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
3609 local *unaes = sub {
3610 my ($mnemonic,$arg)=@_;
3612 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
3613 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
3614 |(($2&7)<<1) |(($2&8)<<2);
3615 # since ARMv7 instructions are always encoded little-endian.
3616 # correct solution is to use .inst directive, but older
3617 # assemblers don't implement it:-(
3618 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
3619 $word&0xff,($word>>8)&0xff,
3620 ($word>>16)&0xff,($word>>24)&0xff,
3628 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
3629 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
3630 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
3636 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
3637 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
3643 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
3644 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
3647 foreach(split("\n",$code)) {
3648 s/\`([^\`]*)\`/eval($1)/geo;
3650 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
3651 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
3652 s/\/\/\s?/@ /o; # new->old style commentary
3654 # fix up remaining new-style suffixes
3655 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
3658 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
3659 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
3660 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
3661 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
3662 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
3663 s/^(\s+)b\./$1b/o or
3664 s/^(\s+)ret/$1bx\tlr/o;
3666 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
3674 close STDOUT or die "error closing STDOUT: $!";