2 # Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # This module implements SM4 with ASIMD and AESE on AARCH64
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26 or die "can't call $xlate: $!";
# NEON register allocation for the generated code.
# @vtmp/@qtmp are the v-/q- views of the same four scratch registers.
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
# NOTE(review): @vtmpx (v12-v15) aliases $rk0/$rk1/$rka/$rkb above —
# presumably never live at the same time; verify at each use site.
36 my @vtmpx=map("v$_",(12..15));
37 my ($vtmp4,$vtmp5)=("v24","v25");
# Lookup-table/constant registers used by mul_matrix and the AESE-based
# sbox (shuffle mask, nibble matrices, nibble AND-mask); q-views follow.
38 my ($MaskV,$TAHMatV,$TALMatV,$ATAHMatV,$ATALMatV,$ANDMaskV)=("v26","v27","v28","v29","v30","v31");
39 my ($MaskQ,$TAHMatQ,$TALMatQ,$ATAHMatQ,$ATALMatQ,$ANDMaskQ)=("q26","q27","q28","q29","q30","q31");
# Scalar (general-purpose) register allocation: call arguments first.
41 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
# $tmpw/$tmp are the w-/x- views of the same register (x6).
42 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
# NOTE(review): $xtmp1/$xtmp2 (x8/x9) overlap $wtmp1/$wtmp2 (w8/w9).
43 my ($xtmp1,$xtmp2)=("x8","x9");
44 my ($ptr,$counter)=("x10","w11");
45 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
51 if ($src and ("$src" ne "$dst")) {
54 rev32 $dst.16b,$src.16b
62 rev32 $dst.16b,$dst.16b
72 if ($src and ("$src" ne "$dst")) {
75 rev32 $dst.16b,$src.16b
83 rev32 $dst.16b,$dst.16b
94 if ($src and ("$src" ne "$dst")) {
97 rbit $dst.16b,$src.16b
101 mov $dst.16b,$src.16b
107 rbit $dst.16b,$src.16b
# transpose: emit a 4x4 transpose of 32-bit words across $dat0..$dat3,
# using $vt0..$vt3 as scratch; results land back in $dat0..$dat3.
114 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
117 zip1 $vt0.4s,$dat0.4s,$dat1.4s
118 zip2 $vt1.4s,$dat0.4s,$dat1.4s
119 zip1 $vt2.4s,$dat2.4s,$dat3.4s
120 zip2 $vt3.4s,$dat2.4s,$dat3.4s
// second stage: interleave 64-bit halves to complete the transpose
121 zip1 $dat0.2d,$vt0.2d,$vt2.2d
122 zip2 $dat1.2d,$vt0.2d,$vt2.2d
123 zip1 $dat2.2d,$vt1.2d,$vt3.2d
124 zip2 $dat3.2d,$vt1.2d,$vt3.2d
128 # matrix multiplication Mat*x = (lowerMat*x) ^ (higherMat*x)
131 my $higherMat = shift;
132 my $lowerMat = shift;
135 ushr $tmp.16b, $x.16b, 4
// split each byte into nibbles: $tmp = high nibble, $x = low nibble
// ($ANDMaskV masks to the low 4 bits)
136 and $x.16b, $x.16b, $ANDMaskV.16b
// 16-entry tbl lookups realise the two 4-bit matrix products
137 tbl $x.16b, {$lowerMat.16b}, $x.16b
138 tbl $tmp.16b, {$higherMat.16b}, $tmp.16b
// combine the two halves into the final product in $x
139 eor $x.16b, $x.16b, $tmp.16b
143 # sbox operations for 4-lane of words
144 # sbox operation for 4 lanes of words
149 // optimize sbox using AESE instruction
150 tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
152 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
154 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
155 aese @vtmp[0].16b,@vtmp[1].16b
157 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, $vtmp4);
159 mov $dat.16b,@vtmp[0].16b
161 // linear transformation
162 ushr @vtmp[0].4s,$dat.4s,32-2
163 ushr @vtmp[1].4s,$dat.4s,32-10
164 ushr @vtmp[2].4s,$dat.4s,32-18
165 ushr @vtmp[3].4s,$dat.4s,32-24
166 sli @vtmp[0].4s,$dat.4s,2
167 sli @vtmp[1].4s,$dat.4s,10
168 sli @vtmp[2].4s,$dat.4s,18
169 sli @vtmp[3].4s,$dat.4s,24
170 eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
171 eor $vtmp4.16b,$vtmp4.16b,$vtmp[1].16b
172 eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
173 eor $dat.16b,$dat.16b,$vtmp4.16b
177 # sbox operation for 8 lanes of words
183 // optimize sbox using AESE instruction
184 tbl @vtmp[0].16b, {$dat.16b}, $MaskV.16b
185 tbl @vtmp[1].16b, {$datx.16b}, $MaskV.16b
187 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, $vtmp4);
188 &mul_matrix(@vtmp[1], $TAHMatV, $TALMatV, $vtmp4);
190 eor $vtmp5.16b, $vtmp5.16b, $vtmp5.16b
191 aese @vtmp[0].16b,$vtmp5.16b
192 aese @vtmp[1].16b,$vtmp5.16b
194 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV,$vtmp4);
195 &mul_matrix(@vtmp[1], $ATAHMatV, $ATALMatV,$vtmp4);
197 mov $dat.16b,@vtmp[0].16b
198 mov $datx.16b,@vtmp[1].16b
200 // linear transformation
// SM4 L(B) = B ^ rol32(B,2) ^ rol32(B,10) ^ rol32(B,18) ^ rol32(B,24);
// each rotate is synthesised as ushr #(32-n) followed by sli #n.
201 ushr @vtmp[0].4s,$dat.4s,32-2
202 ushr $vtmp5.4s,$datx.4s,32-2
203 ushr @vtmp[1].4s,$dat.4s,32-10
204 ushr @vtmp[2].4s,$dat.4s,32-18
205 ushr @vtmp[3].4s,$dat.4s,32-24
206 sli @vtmp[0].4s,$dat.4s,2
// rol2 of the second lane set is computed early into $vtmp5 so that
// @vtmp[0..3] can be reused for $datx below
207 sli $vtmp5.4s,$datx.4s,2
208 sli @vtmp[1].4s,$dat.4s,10
209 sli @vtmp[2].4s,$dat.4s,18
210 sli @vtmp[3].4s,$dat.4s,24
211 eor $vtmp4.16b,@vtmp[0].16b,$dat.16b
212 eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
213 eor $dat.16b,@vtmp[2].16b,@vtmp[3].16b
// $dat now holds L(dat); repeat the remaining rotations for $datx
214 eor $dat.16b,$dat.16b,$vtmp4.16b
215 ushr @vtmp[1].4s,$datx.4s,32-10
216 ushr @vtmp[2].4s,$datx.4s,32-18
217 ushr @vtmp[3].4s,$datx.4s,32-24
218 sli @vtmp[1].4s,$datx.4s,10
219 sli @vtmp[2].4s,$datx.4s,18
220 sli @vtmp[3].4s,$datx.4s,24
221 eor $vtmp4.16b,$vtmp5.16b,$datx.16b
222 eor $vtmp4.16b,$vtmp4.16b,@vtmp[1].16b
223 eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
224 eor $datx.16b,$datx.16b,$vtmp4.16b
228 # sbox operation for a single word (held in a w-register)
233 mov @vtmp[3].s[0],$word
234 // optimize sbox using AESE instruction
// pre-shuffle input through $MaskV, then map into the AES field
235 tbl @vtmp[0].16b, {@vtmp[3].16b}, $MaskV.16b
237 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
// zero @vtmp[1] so that AESE's AddRoundKey step is a no-op
239 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
240 aese @vtmp[0].16b,@vtmp[1].16b
242 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
245 mov $wtmp0,@vtmp[0].s[0]
// scalar linear transform: ror #(32-n) == rol #n, so this computes
// B ^ rol(B,2) ^ rol(B,10) ^ rol(B,18) ^ rol(B,24)
246 eor $word,$wtmp0,$wtmp0,ror #32-2
247 eor $word,$word,$wtmp0,ror #32-10
248 eor $word,$word,$wtmp0,ror #32-18
249 eor $word,$word,$wtmp0,ror #32-24
253 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
# Emits four SM4 rounds; each ldp below loads the round keys for two
# consecutive rounds from $kptr (post-incremented by 8).
258 ldp $wtmp0,$wtmp1,[$kptr],8
// $wtmp0 = RK0, $wtmp1 = RK1
259 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
260 eor $tmpw,$word2,$word3
261 eor $wtmp2,$wtmp0,$word1
262 eor $tmpw,$tmpw,$wtmp2
266 eor $word0,$word0,$tmpw
267 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
268 eor $tmpw,$word2,$word3
269 eor $wtmp2,$word0,$wtmp1
270 eor $tmpw,$tmpw,$wtmp2
// load the next round-key pair: $wtmp0 = RK2, $wtmp1 = RK3
274 ldp $wtmp0,$wtmp1,[$kptr],8
275 eor $word1,$word1,$tmpw
276 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
277 eor $tmpw,$word0,$word1
278 eor $wtmp2,$wtmp0,$word3
279 eor $tmpw,$tmpw,$wtmp2
283 eor $word2,$word2,$tmpw
284 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
285 eor $tmpw,$word0,$word1
286 eor $wtmp2,$word2,$wtmp1
287 eor $tmpw,$tmpw,$wtmp2
291 eor $word3,$word3,$tmpw
295 # sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
300 ldp $wtmp0,$wtmp1,[$kptr],8
304 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
305 eor $rka.16b,@data[2].16b,@data[3].16b
306 eor $rk0.16b,@data[1].16b,$rk0.16b
307 eor $rk0.16b,$rka.16b,$rk0.16b
311 eor @data[0].16b,@data[0].16b,$rk0.16b
313 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
314 eor $rka.16b,$rka.16b,@data[0].16b
315 eor $rk1.16b,$rka.16b,$rk1.16b
319 ldp $wtmp0,$wtmp1,[$kptr],8
320 eor @data[1].16b,@data[1].16b,$rk1.16b
325 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
326 eor $rka.16b,@data[0].16b,@data[1].16b
327 eor $rk0.16b,@data[3].16b,$rk0.16b
328 eor $rk0.16b,$rka.16b,$rk0.16b
332 eor @data[2].16b,@data[2].16b,$rk0.16b
334 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
335 eor $rka.16b,$rka.16b,@data[2].16b
336 eor $rk1.16b,$rka.16b,$rk1.16b
340 eor @data[3].16b,@data[3].16b,$rk1.16b
344 # sm4 for 8 lanes of data, in neon registers
345 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
350 ldp $wtmp0,$wtmp1,[$kptr],8
351 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
353 eor $rka.16b,@data[2].16b,@data[3].16b
354 eor $rkb.16b,@datax[2].16b,@datax[3].16b
355 eor @vtmp[0].16b,@data[1].16b,$rk0.16b
356 eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
357 eor $rk0.16b,$rka.16b,@vtmp[0].16b
358 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
360 &sbox_double($rk0,$rk1);
362 eor @data[0].16b,@data[0].16b,$rk0.16b
363 eor @datax[0].16b,@datax[0].16b,$rk1.16b
365 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
367 eor $rka.16b,$rka.16b,@data[0].16b
368 eor $rkb.16b,$rkb.16b,@datax[0].16b
369 eor $rk0.16b,$rka.16b,$rk1.16b
370 eor $rk1.16b,$rkb.16b,$rk1.16b
372 &sbox_double($rk0,$rk1);
374 ldp $wtmp0,$wtmp1,[$kptr],8
375 eor @data[1].16b,@data[1].16b,$rk0.16b
376 eor @datax[1].16b,@datax[1].16b,$rk1.16b
378 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
380 eor $rka.16b,@data[0].16b,@data[1].16b
381 eor $rkb.16b,@datax[0].16b,@datax[1].16b
382 eor @vtmp[0].16b,@data[3].16b,$rk0.16b
383 eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
384 eor $rk0.16b,$rka.16b,@vtmp[0].16b
385 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
387 &sbox_double($rk0,$rk1);
389 eor @data[2].16b,@data[2].16b,$rk0.16b
390 eor @datax[2].16b,@datax[2].16b,$rk1.16b
392 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
394 eor $rka.16b,$rka.16b,@data[2].16b
395 eor $rkb.16b,$rkb.16b,@datax[2].16b
396 eor $rk0.16b,$rka.16b,$rk1.16b
397 eor $rk1.16b,$rkb.16b,$rk1.16b
399 &sbox_double($rk0,$rk1);
401 eor @data[3].16b,@data[3].16b,$rk0.16b
402 eor @datax[3].16b,@datax[3].16b,$rk1.16b
406 sub encrypt_1blk_norev() {
420 subs $counter,$counter,#1
432 &encrypt_1blk_norev($dat);
436 sub encrypt_4blks() {
444 subs $counter,$counter,#1
447 &rev32(@vtmp[3],@data[0]);
448 &rev32(@vtmp[2],@data[1]);
449 &rev32(@vtmp[1],@data[2]);
450 &rev32(@vtmp[0],@data[3]);
453 sub encrypt_8blks() {
461 subs $counter,$counter,#1
464 &rev32(@vtmp[3],@data[0]);
465 &rev32(@vtmp[2],@data[1]);
466 &rev32(@vtmp[1],@data[2]);
467 &rev32(@vtmp[0],@data[3]);
468 &rev32(@data[3],@datax[0]);
469 &rev32(@data[2],@datax[1]);
470 &rev32(@data[1],@datax[2]);
471 &rev32(@data[0],@datax[3]);
478 ldr $MaskQ, .Lsbox_magic
479 ldr $TAHMatQ, .Lsbox_magic+16
480 ldr $TALMatQ, .Lsbox_magic+32
481 ldr $ATAHMatQ, .Lsbox_magic+48
482 ldr $ATALMatQ, .Lsbox_magic+64
483 ldr $ANDMaskQ, .Lsbox_magic+80
487 sub mov_reg_to_vec() {
495 &rev32_armeb($desv,$desv);
498 sub mov_vec_to_reg() {
508 sub compute_tweak() {
515 extr $xtmp2,$src1,$src1,#32
516 extr $des1,$src1,$src0,#63
517 and $wtmp1,$wtmp0,$wtmp2,asr#31
518 eor $des0,$xtmp1,$src0,lsl#1
# compute_tweak_vec: derive the next XTS tweak from $src into $des,
# entirely in NEON registers ($std selects bit order via rbit).
522 sub compute_tweak_vec() {
526 &rbit(@vtmp[2],$src,$std);
528 ldr @qtmp[0], .Lxts_magic
// multiply tweak by x in GF(2^128): shift every byte left by 1, then
// propagate each byte's lost MSB into its neighbour. The .Lxts_magic
// constant is 0x01 in every byte except the low byte (0x87), so the
// mul applies the plain carry for inner bytes and the field reduction
// for the wrap-around bit.
529 shl $des.16b, @vtmp[2].16b, #1
530 ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
531 ushr @vtmp[1].16b, @vtmp[1].16b, #7
532 mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
533 eor $des.16b, $des.16b, @vtmp[1].16b
535 &rbit($des,$des,$std);
539 #include "arm_arch.h"
543 .type _${prefix}_consts,%object
547 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
548 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
549 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
550 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
551 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
552 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
553 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
554 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
556 .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
558 .quad 0x0B0A090807060504,0x030201000F0E0D0C
560 .quad 0x0101010101010187,0x0101010101010101
562 .quad 0x0b0e0104070a0d00,0x0306090c0f020508
563 .quad 0x62185a2042387a00,0x22581a6002783a40
564 .quad 0x15df62a89e54e923,0xc10bb67c4a803df7
565 .quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead
566 .quad 0x6404462679195b3b,0xe383c1a1fe9edcbc
567 .quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f
569 .size _${prefix}_consts,.-_${prefix}_consts
573 my ($key,$keys,$enc)=("x0","x1","w2");
574 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
575 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
577 .type _${prefix}_set_key,%function
580 AARCH64_VALID_CALL_TARGET
581 ld1 {$vkey.4s},[$key]
586 adr $pointer,.Lshuffles
587 ld1 {$vmap.2d},[$pointer]
589 ld1 {$vfk.2d},[$pointer]
590 eor $vkey.16b,$vkey.16b,$vfk.16b
593 movi @vtmp[0].16b,#64
598 ldr $roundkey,[$pointer],#4
599 eor $roundkey,$roundkey,$wtmp
601 eor $roundkey,$roundkey,$wtmp
603 eor $roundkey,$roundkey,$wtmp
604 // optimize sbox using AESE instruction
605 mov @data[0].s[0],$roundkey
606 tbl @vtmp[0].16b, {@data[0].16b}, $MaskV.16b
608 &mul_matrix(@vtmp[0], $TAHMatV, $TALMatV, @vtmp[2]);
610 eor @vtmp[1].16b, @vtmp[1].16b, @vtmp[1].16b
611 aese @vtmp[0].16b,@vtmp[1].16b
613 &mul_matrix(@vtmp[0], $ATAHMatV, $ATALMatV, @vtmp[2]);
615 mov $wtmp,@vtmp[0].s[0]
616 eor $roundkey,$wtmp,$wtmp,ror #19
617 eor $roundkey,$roundkey,$wtmp,ror #9
619 eor $roundkey,$roundkey,$wtmp
620 mov $vkey.s[0],$roundkey
622 str $roundkey,[$keys],#4
625 str $roundkey,[$keys],#-4
627 tbl $vkey.16b,{$vkey.16b},$vmap.16b
628 subs $schedules,$schedules,#1
631 .size _${prefix}_set_key,.-_${prefix}_set_key
638 .type _${prefix}_enc_4blks,%function
640 _${prefix}_enc_4blks:
641 AARCH64_VALID_CALL_TARGET
646 .size _${prefix}_enc_4blks,.-_${prefix}_enc_4blks
652 .type _${prefix}_enc_8blks,%function
654 _${prefix}_enc_8blks:
655 AARCH64_VALID_CALL_TARGET
660 .size _${prefix}_enc_8blks,.-_${prefix}_enc_8blks
666 my ($key,$keys)=("x0","x1");
668 .globl ${prefix}_set_encrypt_key
669 .type ${prefix}_set_encrypt_key,%function
671 ${prefix}_set_encrypt_key:
672 AARCH64_SIGN_LINK_REGISTER
673 stp x29,x30,[sp,#-16]!
675 bl _${prefix}_set_key
677 AARCH64_VALIDATE_LINK_REGISTER
679 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
684 my ($key,$keys)=("x0","x1");
686 .globl ${prefix}_set_decrypt_key
687 .type ${prefix}_set_decrypt_key,%function
689 ${prefix}_set_decrypt_key:
690 AARCH64_SIGN_LINK_REGISTER
691 stp x29,x30,[sp,#-16]!
693 bl _${prefix}_set_key
695 AARCH64_VALIDATE_LINK_REGISTER
697 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
704 my ($inp,$outp,$rk)=map("x$_",(0..2));
707 .globl ${prefix}_${dir}crypt
708 .type ${prefix}_${dir}crypt,%function
710 ${prefix}_${dir}crypt:
711 AARCH64_VALID_CALL_TARGET
712 ld1 {@data[0].4s},[$inp]
715 &rev32(@data[0],@data[0]);
719 &encrypt_1blk(@data[0]);
721 st1 {@data[0].4s},[$outp]
723 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
732 .globl ${prefix}_ecb_encrypt
733 .type ${prefix}_ecb_encrypt,%function
735 ${prefix}_ecb_encrypt:
736 AARCH64_SIGN_LINK_REGISTER
737 // convert length into blocks
747 .Lecb_8_blocks_process:
749 b.lt .Lecb_4_blocks_process
750 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
751 ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
753 &rev32(@data[0],@data[0]);
754 &rev32(@data[1],@data[1]);
755 &rev32(@data[2],@data[2]);
756 &rev32(@data[3],@data[3]);
757 &rev32(@datax[0],@datax[0]);
758 &rev32(@datax[1],@datax[1]);
759 &rev32(@datax[2],@datax[2]);
760 &rev32(@datax[3],@datax[3]);
762 bl _${prefix}_enc_8blks
763 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
764 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
765 subs $blocks,$blocks,#8
766 b.gt .Lecb_8_blocks_process
768 .Lecb_4_blocks_process:
771 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
773 &rev32(@data[0],@data[0]);
774 &rev32(@data[1],@data[1]);
775 &rev32(@data[2],@data[2]);
776 &rev32(@data[3],@data[3]);
778 bl _${prefix}_enc_4blks
779 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
780 sub $blocks,$blocks,#4
782 // process last block
786 ld1 {@data[0].4s},[$inp]
788 &rev32(@data[0],@data[0]);
789 &encrypt_1blk(@data[0]);
791 st1 {@data[0].4s},[$outp]
793 1: // process last 2 blocks
794 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
795 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
799 &rev32(@data[0],@data[0]);
800 &rev32(@data[1],@data[1]);
801 &rev32(@data[2],@data[2]);
802 &rev32(@data[3],@data[3]);
804 bl _${prefix}_enc_4blks
805 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
806 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
808 1: // process last 3 blocks
809 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
811 &rev32(@data[0],@data[0]);
812 &rev32(@data[1],@data[1]);
813 &rev32(@data[2],@data[2]);
814 &rev32(@data[3],@data[3]);
816 bl _${prefix}_enc_4blks
817 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
818 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
819 st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
826 AARCH64_VALIDATE_LINK_REGISTER
828 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
833 my ($len,$ivp,$enc)=("x2","x4","w5");
838 .globl ${prefix}_cbc_encrypt
839 .type ${prefix}_cbc_encrypt,%function
841 ${prefix}_cbc_encrypt:
842 AARCH64_VALID_CALL_TARGET
848 ld1 {$ivec0.4s},[$ivp]
852 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
853 eor @data[0].16b,@data[0].16b,$ivec0.16b
855 &rev32(@data[1],@data[1]);
856 &rev32(@data[0],@data[0]);
857 &rev32(@data[2],@data[2]);
858 &rev32(@data[3],@data[3]);
859 &encrypt_1blk_norev(@data[0]);
861 eor @data[1].16b,@data[1].16b,@data[0].16b
863 &encrypt_1blk_norev(@data[1]);
864 &rev32(@data[0],@data[0]);
867 eor @data[2].16b,@data[2].16b,@data[1].16b
869 &encrypt_1blk_norev(@data[2]);
870 &rev32(@data[1],@data[1]);
872 eor @data[3].16b,@data[3].16b,@data[2].16b
874 &encrypt_1blk_norev(@data[3]);
875 &rev32(@data[2],@data[2]);
876 &rev32(@data[3],@data[3]);
878 orr $ivec0.16b,@data[3].16b,@data[3].16b
879 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
880 subs $blocks,$blocks,#4
881 b.ne .Lcbc_4_blocks_enc
884 subs $blocks,$blocks,#1
886 ld1 {@data[0].4s},[$inp],#16
887 eor $ivec0.16b,$ivec0.16b,@data[0].16b
889 &rev32($ivec0,$ivec0);
890 &encrypt_1blk($ivec0);
892 st1 {$ivec0.4s},[$outp],#16
896 st1 {$ivec0.4s},[$ivp]
900 // decryption mode starts
901 AARCH64_SIGN_LINK_REGISTER
910 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
912 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
914 &rev32(@data[0],@data[0]);
915 &rev32(@data[1],@data[1]);
916 &rev32(@data[2],@data[2]);
917 &rev32(@data[3],$data[3]);
918 &rev32(@datax[0],@datax[0]);
919 &rev32(@datax[1],@datax[1]);
920 &rev32(@datax[2],@datax[2]);
921 &rev32(@datax[3],$datax[3]);
923 bl _${prefix}_enc_8blks
925 &transpose(@vtmp,@datax);
926 &transpose(@data,@datax);
928 ld1 {$ivec1.4s},[$ivp]
929 ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
930 // note ivec1 and vtmpx[3] are reusing the same register
931 // care needs to be taken to avoid conflict
932 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
933 ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
934 eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
935 eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
936 eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
938 st1 {$vtmpx[3].4s}, [$ivp]
939 eor @data[0].16b,@data[0].16b,$datax[3].16b
940 eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
941 eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
942 eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
943 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
944 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
945 subs $blocks,$blocks,#8
946 b.gt .Lcbc_8_blocks_dec
949 ld1 {$ivec1.4s},[$ivp]
953 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
955 &rev32(@data[0],@data[0]);
956 &rev32(@data[1],@data[1]);
957 &rev32(@data[2],@data[2]);
958 &rev32(@data[3],$data[3]);
960 bl _${prefix}_enc_4blks
961 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
963 &transpose(@vtmp,@datax);
965 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
966 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
967 orr $ivec1.16b,@data[3].16b,@data[3].16b
968 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
969 eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
970 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
971 subs $blocks,$blocks,#4
972 b.gt .Lcbc_4_blocks_dec
974 st1 {@data[3].4s}, [$ivp]
977 subs $blocks,$blocks,#1
980 ld1 {@data[0].4s},[$inp],#16
982 st1 {$data[0].4s}, [$ivp]
984 &rev32(@datax[0],@data[0]);
985 &encrypt_1blk(@datax[0]);
987 eor @datax[0].16b,@datax[0].16b,$ivec1.16b
988 st1 {@datax[0].4s},[$outp],#16
990 1: // last two blocks
991 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
993 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
994 subs $blocks,$blocks,1
997 &rev32(@data[0],@data[0]);
998 &rev32(@data[1],@data[1]);
999 &rev32(@data[2],@data[2]);
1000 &rev32(@data[3],@data[3]);
1002 bl _${prefix}_enc_4blks
1003 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1005 &transpose(@vtmp,@datax);
1007 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1008 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1009 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1011 st1 {@data[1].4s}, [$ivp]
1014 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1016 &rev32(@data[0],@data[0]);
1017 &rev32(@data[1],@data[1]);
1018 &rev32(@data[2],@data[2]);
1019 &rev32(@data[3],@data[3]);
1021 bl _${prefix}_enc_4blks
1022 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1024 &transpose(@vtmp,@datax);
1026 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1027 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1028 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1029 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1031 st1 {@data[2].4s}, [$ivp]
1033 ldp d10,d11,[sp,#16]
1034 ldp d12,d13,[sp,#32]
1035 ldp d14,d15,[sp,#48]
1036 ldp x29,x30,[sp,#64]
1038 AARCH64_VALIDATE_LINK_REGISTER
1040 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1050 .globl ${prefix}_ctr32_encrypt_blocks
1051 .type ${prefix}_ctr32_encrypt_blocks,%function
1053 ${prefix}_ctr32_encrypt_blocks:
1054 AARCH64_VALID_CALL_TARGET
1055 ld1 {$ivec.4s},[$ivp]
1057 &rev32($ivec,$ivec);
1062 // fast processing for one single block without
1063 // context saving overhead
1065 &encrypt_1blk($ivec);
1067 ld1 {@data[0].4s},[$inp]
1068 eor @data[0].16b,@data[0].16b,$ivec.16b
1069 st1 {@data[0].4s},[$outp]
1072 AARCH64_SIGN_LINK_REGISTER
1073 stp d8,d9,[sp,#-80]!
1074 stp d10,d11,[sp,#16]
1075 stp d12,d13,[sp,#32]
1076 stp d14,d15,[sp,#48]
1077 stp x29,x30,[sp,#64]
1078 mov $word0,$ivec.s[0]
1079 mov $word1,$ivec.s[1]
1080 mov $word2,$ivec.s[2]
1082 .Lctr32_4_blocks_process:
1085 dup @data[0].4s,$word0
1086 dup @data[1].4s,$word1
1087 dup @data[2].4s,$word2
1088 mov @data[3].s[0],$ctr
1090 mov $data[3].s[1],$ctr
1092 mov @data[3].s[2],$ctr
1094 mov @data[3].s[3],$ctr
1097 b.ge .Lctr32_8_blocks_process
1098 bl _${prefix}_enc_4blks
1099 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1100 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1101 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1102 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1103 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1104 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1105 subs $blocks,$blocks,#4
1106 b.ne .Lctr32_4_blocks_process
1108 .Lctr32_8_blocks_process:
1109 dup @datax[0].4s,$word0
1110 dup @datax[1].4s,$word1
1111 dup @datax[2].4s,$word2
1112 mov @datax[3].s[0],$ctr
1114 mov $datax[3].s[1],$ctr
1116 mov @datax[3].s[2],$ctr
1118 mov @datax[3].s[3],$ctr
1120 bl _${prefix}_enc_8blks
1121 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1122 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1123 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1124 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1125 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1126 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1127 eor @data[0].16b,@data[0].16b,@datax[0].16b
1128 eor @data[1].16b,@data[1].16b,@datax[1].16b
1129 eor @data[2].16b,@data[2].16b,@datax[2].16b
1130 eor @data[3].16b,@data[3].16b,@datax[3].16b
1131 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1132 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1133 subs $blocks,$blocks,#8
1134 b.ne .Lctr32_4_blocks_process
1136 1: // last block processing
1137 subs $blocks,$blocks,#1
1140 mov $ivec.s[0],$word0
1141 mov $ivec.s[1],$word1
1142 mov $ivec.s[2],$word2
1145 &encrypt_1blk($ivec);
1147 ld1 {@data[0].4s},[$inp]
1148 eor @data[0].16b,@data[0].16b,$ivec.16b
1149 st1 {@data[0].4s},[$outp]
1151 1: // last 2 blocks processing
1152 dup @data[0].4s,$word0
1153 dup @data[1].4s,$word1
1154 dup @data[2].4s,$word2
1155 mov @data[3].s[0],$ctr
1157 mov @data[3].s[1],$ctr
1158 subs $blocks,$blocks,#1
1160 bl _${prefix}_enc_4blks
1161 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1162 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1163 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1164 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1165 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1166 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1167 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1168 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1170 1: // last 3 blocks processing
1172 mov @data[3].s[2],$ctr
1173 bl _${prefix}_enc_4blks
1174 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1175 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1176 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1177 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1178 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1179 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1180 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1181 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1182 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1183 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1185 ldp d10,d11,[sp,#16]
1186 ldp d12,d13,[sp,#32]
1187 ldp d14,d15,[sp,#48]
1188 ldp x29,x30,[sp,#64]
1190 AARCH64_VALIDATE_LINK_REGISTER
1192 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1198 my ($blocks,$len)=("x2","x2");
1200 my @twx=map("x$_",(12..27));
1201 my ($rks1,$rks2)=("x26","x27");
1202 my $lastBlk=("x26");
1206 my @tweak=map("v$_",(16..23));
1207 my $lastTweak=("v25");
1209 sub gen_xts_cipher() {
1212 .globl ${prefix}_xts_encrypt${std}
1213 .type ${prefix}_xts_encrypt${std},%function
1215 ${prefix}_xts_encrypt${std}:
1216 AARCH64_SIGN_LINK_REGISTER
// save the general-purpose registers clobbered below (x15-x28 hold the
// scalar tweak pipeline @twx, x29/x30 frame/link)
1217 stp x15, x16, [sp, #-0x10]!
1218 stp x17, x18, [sp, #-0x10]!
1219 stp x19, x20, [sp, #-0x10]!
1220 stp x21, x22, [sp, #-0x10]!
1221 stp x23, x24, [sp, #-0x10]!
1222 stp x25, x26, [sp, #-0x10]!
1223 stp x27, x28, [sp, #-0x10]!
1224 stp x29, x30, [sp, #-0x10]!
// save the callee-saved low halves of v8-v15 (AAPCS64)
1225 stp d8, d9, [sp, #-0x10]!
1226 stp d10, d11, [sp, #-0x10]!
1227 stp d12, d13, [sp, #-0x10]!
1228 stp d14, d15, [sp, #-0x10]!
1232 ld1 {@tweak[0].4s}, [$ivp]
1236 &rev32(@tweak[0],@tweak[0]);
1237 &encrypt_1blk(@tweak[0]);
1240 and $remain,$len,#0x0F
1241 // convert length into blocks
1247 // If the encryption/decryption length is a multiple of 16,
1248 // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1249 b.eq .xts_encrypt_blocks${std}
1251 // If the encryption/decryption length is not a multiple of 16,
1252 // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
1253 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1254 subs $blocks,$blocks,#1
1255 b.eq .only_2blks_tweak${std}
1256 .xts_encrypt_blocks${std}:
# Bring the initial tweak into plain scalar form (bit order depends on
# $std — presumably undoing the rbit/endian transforms; confirm against
# rbit/rev32_armeb), then precompute the tweaks for the next 7 blocks
# into the scalar register pairs @twx[2..15].
1258 &rbit(@tweak[0],@tweak[0],$std);
1259 &rev32_armeb(@tweak[0],@tweak[0]);
1260 &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1261 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1262 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1263 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1264 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1265 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1266 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1267 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1269 .Lxts_8_blocks_process${std}:
# Move the 8 precomputed tweaks from scalar pairs @twx into NEON regs
# @tweak[0..7], interleaved with computing the following 8 tweaks back
# into @twx — pipelining tweak generation with consumption.
1272 &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1273 &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1274 &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1275 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1276 &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1277 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1278 &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1279 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1280 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[4]);
1281 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1282 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[5]);
1283 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1284 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[6]);
1285 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1286 &mov_reg_to_vec(@twx[14],@twx[15],@tweak[7]);
1287 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1289 b.lt .Lxts_4_blocks_process${std}
1290 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1292 &rbit(@tweak[0],@tweak[0],$std);
1293 &rbit(@tweak[1],@tweak[1],$std);
1294 &rbit(@tweak[2],@tweak[2],$std);
1295 &rbit(@tweak[3],@tweak[3],$std);
1297 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1298 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1299 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1300 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1301 ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1303 &rbit(@tweak[4],@tweak[4],$std);
1304 &rbit(@tweak[5],@tweak[5],$std);
1305 &rbit(@tweak[6],@tweak[6],$std);
1306 &rbit(@tweak[7],@tweak[7],$std);
1308 eor @datax[0].16b, @datax[0].16b, @tweak[4].16b
1309 eor @datax[1].16b, @datax[1].16b, @tweak[5].16b
1310 eor @datax[2].16b, @datax[2].16b, @tweak[6].16b
1311 eor @datax[3].16b, @datax[3].16b, @tweak[7].16b
1313 &rev32(@data[0],@data[0]);
1314 &rev32(@data[1],@data[1]);
1315 &rev32(@data[2],@data[2]);
1316 &rev32(@data[3],@data[3]);
1317 &rev32(@datax[0],@datax[0]);
1318 &rev32(@datax[1],@datax[1]);
1319 &rev32(@datax[2],@datax[2]);
1320 &rev32(@datax[3],@datax[3]);
1321 &transpose(@data,@vtmp);
1322 &transpose(@datax,@vtmp);
1324 bl _${prefix}_enc_8blks
1326 &transpose(@vtmp,@datax);
1327 &transpose(@data,@datax);
1329 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1330 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1331 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1332 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1333 eor @data[0].16b, @data[0].16b, @tweak[4].16b
1334 eor @data[1].16b, @data[1].16b, @tweak[5].16b
1335 eor @data[2].16b, @data[2].16b, @tweak[6].16b
1336 eor @data[3].16b, @data[3].16b, @tweak[7].16b
1338 // save the last tweak
1339 mov $lastTweak.16b,@tweak[7].16b
1340 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1341 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1342 subs $blocks,$blocks,#8
1343 b.gt .Lxts_8_blocks_process${std}
1345 .Lxts_4_blocks_process${std}:
1348 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1350 &rbit(@tweak[0],@tweak[0],$std);
1351 &rbit(@tweak[1],@tweak[1],$std);
1352 &rbit(@tweak[2],@tweak[2],$std);
1353 &rbit(@tweak[3],@tweak[3],$std);
1355 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1356 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1357 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1358 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1360 &rev32(@data[0],@data[0]);
1361 &rev32(@data[1],@data[1]);
1362 &rev32(@data[2],@data[2]);
1363 &rev32(@data[3],@data[3]);
1364 &transpose(@data,@vtmp);
1366 bl _${prefix}_enc_4blks
1368 &transpose(@vtmp,@data);
1370 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1371 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1372 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1373 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1374 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1375 sub $blocks,$blocks,#4
1376 mov @tweak[0].16b,@tweak[4].16b
1377 mov @tweak[1].16b,@tweak[5].16b
1378 mov @tweak[2].16b,@tweak[6].16b
1379 // save the last tweak
1380 mov $lastTweak.16b,@tweak[3].16b
1382 // process last block
1386 ld1 {@data[0].4s},[$inp],#16
1388 &rbit(@tweak[0],@tweak[0],$std);
1390 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1392 &rev32(@data[0],@data[0]);
1393 &encrypt_1blk(@data[0]);
1395 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1396 st1 {@data[0].4s},[$outp],#16
1397 // save the last tweak
1398 mov $lastTweak.16b,@tweak[0].16b
1400 1: // process last 2 blocks
1403 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1405 &rbit(@tweak[0],@tweak[0],$std);
1406 &rbit(@tweak[1],@tweak[1],$std);
1408 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1409 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1411 &rev32(@data[0],@data[0]);
1412 &rev32(@data[1],@data[1]);
1413 &transpose(@data,@vtmp);
1415 bl _${prefix}_enc_4blks
1417 &transpose(@vtmp,@data);
1419 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1420 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1421 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1422 // save the last tweak
1423 mov $lastTweak.16b,@tweak[1].16b
1425 1: // process last 3 blocks
1426 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1428 &rbit(@tweak[0],@tweak[0],$std);
1429 &rbit(@tweak[1],@tweak[1],$std);
1430 &rbit(@tweak[2],@tweak[2],$std);
1432 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1433 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1434 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1436 &rev32(@data[0],@data[0]);
1437 &rev32(@data[1],@data[1]);
1438 &rev32(@data[2],@data[2]);
1439 &transpose(@data,@vtmp);
1441 bl _${prefix}_enc_4blks
1443 &transpose(@vtmp,@data);
1445 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1446 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1447 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1448 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1449 // save the last tweak
1450 mov $lastTweak.16b,@tweak[2].16b
1455 // This branch calculates the last two tweaks,
1456 // when the encryption/decryption length is larger than 32
1457 .last_2blks_tweak${std}:
1459 &rev32_armeb($lastTweak,$lastTweak);
1460 &compute_tweak_vec($lastTweak,@tweak[1],$std);
1461 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1466 // This branch calculates the last two tweaks,
1467 // when the encryption/decryption length is exactly 32, which needs only two tweaks
1468 .only_2blks_tweak${std}:
1469 mov @tweak[1].16b,@tweak[0].16b
1471 &rev32_armeb(@tweak[1],@tweak[1]);
1472 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1477 // Determine whether encryption or decryption is required.
1478 // The last two tweaks need to be swapped for decryption.
1480 // encryption:1 decryption:0
1482 b.eq .process_last_2blks${std}
1483 mov @vtmp[0].16B,@tweak[1].16b
1484 mov @tweak[1].16B,@tweak[2].16b
1485 mov @tweak[2].16B,@vtmp[0].16b
1487 .process_last_2blks${std}:
1489 &rev32_armeb(@tweak[1],@tweak[1]);
1490 &rev32_armeb(@tweak[2],@tweak[2]);
1492 ld1 {@data[0].4s},[$inp],#16
1493 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1495 &rev32(@data[0],@data[0]);
1496 &encrypt_1blk(@data[0]);
1498 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1499 st1 {@data[0].4s},[$outp],#16
1501 sub $lastBlk,$outp,16
1503 subs $remain,$remain,1
1504 ldrb $wtmp0,[$lastBlk,$remain]
1505 ldrb $wtmp1,[$inp,$remain]
1506 strb $wtmp1,[$lastBlk,$remain]
1507 strb $wtmp0,[$outp,$remain]
1509 ld1 {@data[0].4s}, [$lastBlk]
1510 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1512 &rev32(@data[0],@data[0]);
1513 &encrypt_1blk(@data[0]);
1515 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1516 st1 {@data[0].4s}, [$lastBlk]
// restore registers in the reverse order of the prologue pushes
1518 ldp d14, d15, [sp], #0x10
1519 ldp d12, d13, [sp], #0x10
1520 ldp d10, d11, [sp], #0x10
1521 ldp d8, d9, [sp], #0x10
1522 ldp x29, x30, [sp], #0x10
1523 ldp x27, x28, [sp], #0x10
1524 ldp x25, x26, [sp], #0x10
1525 ldp x23, x24, [sp], #0x10
1526 ldp x21, x22, [sp], #0x10
1527 ldp x19, x20, [sp], #0x10
1528 ldp x17, x18, [sp], #0x10
1529 ldp x15, x16, [sp], #0x10
1530 AARCH64_VALIDATE_LINK_REGISTER
1532 .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1534 } # end of gen_xts_cipher
1535 &gen_xts_cipher("_gb");
1536 &gen_xts_cipher("");
1539 ########################################
1543 last if (!s/^#/\/\// and !/^$/);
1548 foreach(split("\n",$code)) {
1549 s/\`([^\`]*)\`/eval($1)/ge;
1553 close STDOUT or die "error closing STDOUT: $!";