2 # Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # This module implements SM4 with ASIMD on aarch64
# Standard OpenSSL perlasm driver preamble: decode the two optional
# command-line arguments, then route all output through arm-xlate.pl.
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Locate the arm-xlate.pl translator: first next to this script, then in
# the shared perlasm directory two levels up; abort if neither exists.
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through the translator so this one source
# can emit assembly for whichever $flavour (ABI/assembler) was requested.
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26 or die "can't call $xlate: $!";
# Register allocation shared by all routines in this module.
# Scratch vectors; @qtmp is the q-register view of the same v0-v3.
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
# Two groups of four data blocks each; the 8-lane paths use both groups.
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
# Round-key vectors and XOR accumulators.
# NOTE(review): @vtmpx aliases the very same v12-v15 registers as
# $rk0/$rk1/$rka/$rkb, so the two alias sets must never be live at the
# same time — confirm at each use site.
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
36 my @vtmpx=map("v$_",(12..15));
# v16-v31 hold the 256-byte S-box table, filled by four 64-byte ld1
# loads from the constants area (see the load sequence further down).
37 my @sbox=map("v$_",(16..31));
# Scalar arguments (in, out, block count, round-key pointer) and temps.
# $tmpw/$tmp are the w- and x-views of the same register (x6).
38 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40 my ($xtmp1,$xtmp2)=("x8","x9");
41 my ($ptr,$counter)=("x10","w11");
42 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
48 if ($src and ("$src" ne "$dst")) {
51 rev32 $dst.16b,$src.16b
59 rev32 $dst.16b,$dst.16b
69 if ($src and ("$src" ne "$dst")) {
72 rev32 $dst.16b,$src.16b
80 rev32 $dst.16b,$dst.16b
91 if ($src and ("$src" ne "$dst")) {
94 rbit $dst.16b,$src.16b
104 rbit $dst.16b,$src.16b
111 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
114 zip1 $vt0.4s,$dat0.4s,$dat1.4s
115 zip2 $vt1.4s,$dat0.4s,$dat1.4s
116 zip1 $vt2.4s,$dat2.4s,$dat3.4s
117 zip2 $vt3.4s,$dat2.4s,$dat3.4s
118 zip1 $dat0.2d,$vt0.2d,$vt2.2d
119 zip2 $dat1.2d,$vt0.2d,$vt2.2d
120 zip1 $dat2.2d,$vt1.2d,$vt3.2d
121 zip2 $dat3.2d,$vt1.2d,$vt3.2d
125 # sbox operations for 4 lanes of words
130 movi @vtmp[0].16b,#64
131 movi @vtmp[1].16b,#128
132 movi @vtmp[2].16b,#192
133 sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
134 sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
135 sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
136 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140 add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141 add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
142 add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
144 ushr @vtmp[0].4s,$dat.4s,32-2
145 sli @vtmp[0].4s,$dat.4s,2
146 ushr @vtmp[2].4s,$dat.4s,32-10
147 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
148 sli @vtmp[2].4s,$dat.4s,10
149 eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
150 ushr @vtmp[0].4s,$dat.4s,32-18
151 sli @vtmp[0].4s,$dat.4s,18
152 ushr @vtmp[2].4s,$dat.4s,32-24
153 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
154 sli @vtmp[2].4s,$dat.4s,24
155 eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
159 # sbox operation for 8 lanes of words
165 movi @vtmp[3].16b,#64
166 sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
167 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174 add $dat.2d,@vtmp[2].2d,$dat.2d
175 add $dat.2d,@vtmp[1].2d,$dat.2d
177 sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
178 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180 tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185 add $datx.2d,@vtmp[2].2d,$datx.2d
186 add $datx.2d,@vtmp[1].2d,$datx.2d
188 ushr @vtmp[0].4s,$dat.4s,32-2
189 sli @vtmp[0].4s,$dat.4s,2
190 ushr @vtmp[2].4s,$datx.4s,32-2
191 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
192 sli @vtmp[2].4s,$datx.4s,2
194 ushr @vtmp[0].4s,$dat.4s,32-10
195 eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
196 sli @vtmp[0].4s,$dat.4s,10
197 ushr @vtmp[2].4s,$datx.4s,32-10
198 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
199 sli @vtmp[2].4s,$datx.4s,10
201 ushr @vtmp[0].4s,$dat.4s,32-18
202 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
203 sli @vtmp[0].4s,$dat.4s,18
204 ushr @vtmp[2].4s,$datx.4s,32-18
205 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
206 sli @vtmp[2].4s,$datx.4s,18
208 ushr @vtmp[0].4s,$dat.4s,32-24
209 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
210 sli @vtmp[0].4s,$dat.4s,24
211 ushr @vtmp[2].4s,$datx.4s,32-24
212 eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
213 sli @vtmp[2].4s,$datx.4s,24
214 eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
218 # sbox operation for one single word
223 movi @vtmp[1].16b,#64
224 movi @vtmp[2].16b,#128
225 movi @vtmp[3].16b,#192
226 mov @vtmp[0].s[0],$word
228 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229 sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230 sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
232 tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233 tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234 tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235 tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
237 mov $word,@vtmp[0].s[0]
238 mov $wtmp0,@vtmp[1].s[0]
239 mov $wtmp2,@vtmp[2].s[0]
240 add $wtmp0,$word,$wtmp0
241 mov $word,@vtmp[3].s[0]
242 add $wtmp0,$wtmp0,$wtmp2
243 add $wtmp0,$wtmp0,$word
245 eor $word,$wtmp0,$wtmp0,ror #32-2
246 eor $word,$word,$wtmp0,ror #32-10
247 eor $word,$word,$wtmp0,ror #32-18
248 eor $word,$word,$wtmp0,ror #32-24
252 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
257 ldp $wtmp0,$wtmp1,[$kptr],8
258 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259 eor $tmpw,$word2,$word3
260 eor $wtmp2,$wtmp0,$word1
261 eor $tmpw,$tmpw,$wtmp2
265 eor $word0,$word0,$tmpw
266 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267 eor $tmpw,$word2,$word3
268 eor $wtmp2,$word0,$wtmp1
269 eor $tmpw,$tmpw,$wtmp2
273 ldp $wtmp0,$wtmp1,[$kptr],8
274 eor $word1,$word1,$tmpw
275 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276 eor $tmpw,$word0,$word1
277 eor $wtmp2,$wtmp0,$word3
278 eor $tmpw,$tmpw,$wtmp2
282 eor $word2,$word2,$tmpw
283 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284 eor $tmpw,$word0,$word1
285 eor $wtmp2,$word2,$wtmp1
286 eor $tmpw,$tmpw,$wtmp2
290 eor $word3,$word3,$tmpw
294 # sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
299 ldp $wtmp0,$wtmp1,[$kptr],8
303 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304 eor $rka.16b,@data[2].16b,@data[3].16b
305 eor $rk0.16b,@data[1].16b,$rk0.16b
306 eor $rk0.16b,$rka.16b,$rk0.16b
310 eor @data[0].16b,@data[0].16b,$rk0.16b
312 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313 eor $rka.16b,$rka.16b,@data[0].16b
314 eor $rk1.16b,$rka.16b,$rk1.16b
318 ldp $wtmp0,$wtmp1,[$kptr],8
319 eor @data[1].16b,@data[1].16b,$rk1.16b
324 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325 eor $rka.16b,@data[0].16b,@data[1].16b
326 eor $rk0.16b,@data[3].16b,$rk0.16b
327 eor $rk0.16b,$rka.16b,$rk0.16b
331 eor @data[2].16b,@data[2].16b,$rk0.16b
333 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334 eor $rka.16b,$rka.16b,@data[2].16b
335 eor $rk1.16b,$rka.16b,$rk1.16b
339 eor @data[3].16b,@data[3].16b,$rk1.16b
343 # sm4 for 8 lanes of data, in neon registers
344 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
349 ldp $wtmp0,$wtmp1,[$kptr],8
350 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
352 eor $rka.16b,@data[2].16b,@data[3].16b
353 eor $rkb.16b,@datax[2].16b,@datax[3].16b
354 eor @vtmp[0].16b,@data[1].16b,$rk0.16b
355 eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
356 eor $rk0.16b,$rka.16b,@vtmp[0].16b
357 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
359 &sbox_double($rk0,$rk1);
361 eor @data[0].16b,@data[0].16b,$rk0.16b
362 eor @datax[0].16b,@datax[0].16b,$rk1.16b
364 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
366 eor $rka.16b,$rka.16b,@data[0].16b
367 eor $rkb.16b,$rkb.16b,@datax[0].16b
368 eor $rk0.16b,$rka.16b,$rk1.16b
369 eor $rk1.16b,$rkb.16b,$rk1.16b
371 &sbox_double($rk0,$rk1);
373 ldp $wtmp0,$wtmp1,[$kptr],8
374 eor @data[1].16b,@data[1].16b,$rk0.16b
375 eor @datax[1].16b,@datax[1].16b,$rk1.16b
377 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
379 eor $rka.16b,@data[0].16b,@data[1].16b
380 eor $rkb.16b,@datax[0].16b,@datax[1].16b
381 eor @vtmp[0].16b,@data[3].16b,$rk0.16b
382 eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
383 eor $rk0.16b,$rka.16b,@vtmp[0].16b
384 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
386 &sbox_double($rk0,$rk1);
388 eor @data[2].16b,@data[2].16b,$rk0.16b
389 eor @datax[2].16b,@datax[2].16b,$rk1.16b
391 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
393 eor $rka.16b,$rka.16b,@data[2].16b
394 eor $rkb.16b,$rkb.16b,@datax[2].16b
395 eor $rk0.16b,$rka.16b,$rk1.16b
396 eor $rk1.16b,$rkb.16b,$rk1.16b
398 &sbox_double($rk0,$rk1);
400 eor @data[3].16b,@data[3].16b,$rk0.16b
401 eor @datax[3].16b,@datax[3].16b,$rk1.16b
405 sub encrypt_1blk_norev() {
419 subs $counter,$counter,#1
431 &encrypt_1blk_norev($dat);
435 sub encrypt_4blks() {
443 subs $counter,$counter,#1
446 &rev32(@vtmp[3],@data[0]);
447 &rev32(@vtmp[2],@data[1]);
448 &rev32(@vtmp[1],@data[2]);
449 &rev32(@vtmp[0],@data[3]);
452 sub encrypt_8blks() {
460 subs $counter,$counter,#1
463 &rev32(@vtmp[3],@data[0]);
464 &rev32(@vtmp[2],@data[1]);
465 &rev32(@vtmp[1],@data[2]);
466 &rev32(@vtmp[0],@data[3]);
467 &rev32(@data[3],@datax[0]);
468 &rev32(@data[2],@datax[1]);
469 &rev32(@data[1],@datax[2]);
470 &rev32(@data[0],@datax[3]);
478 ld1 {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
479 ld1 {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
480 ld1 {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
481 ld1 {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
486 sub mov_reg_to_vec() {
494 &rev32_armeb($desv,$desv);
497 sub mov_vec_to_reg() {
507 sub compute_tweak() {
514 extr $xtmp2,$src1,$src1,#32
515 extr $des1,$src1,$src0,#63
516 and $wtmp1,$wtmp0,$wtmp2,asr#31
517 eor $des0,$xtmp1,$src0,lsl#1
521 sub compute_tweak_vec() {
525 &rbit(@vtmp[2],$src,$std);
527 ldr @qtmp[0], =0x01010101010101010101010101010187
528 shl $des.16b, @vtmp[2].16b, #1
529 ext @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
530 ushr @vtmp[1].16b, @vtmp[1].16b, #7
531 mul @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
532 eor $des.16b, $des.16b, @vtmp[1].16b
534 &rbit($des,$des,$std);
538 #include "arm_arch.h"
542 .type _vpsm4_consts,%object
546 .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547 .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548 .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549 .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550 .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551 .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552 .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553 .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554 .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555 .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556 .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557 .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558 .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559 .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560 .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561 .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
563 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
572 .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
574 .dword 0x0B0A090807060504,0x030201000F0E0D0C
576 .size _vpsm4_consts,.-_vpsm4_consts
580 my ($key,$keys,$enc)=("x0","x1","w2");
581 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
582 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
584 .type _vpsm4_set_key,%function
587 AARCH64_VALID_CALL_TARGET
588 ld1 {$vkey.4s},[$key]
593 adr $pointer,.Lshuffles
594 ld1 {$vmap.2d},[$pointer]
596 ld1 {$vfk.2d},[$pointer]
597 eor $vkey.16b,$vkey.16b,$vfk.16b
600 movi @vtmp[0].16b,#64
605 ldr $roundkey,[$pointer],#4
606 eor $roundkey,$roundkey,$wtmp
608 eor $roundkey,$roundkey,$wtmp
610 eor $roundkey,$roundkey,$wtmp
612 mov @data[0].s[0],$roundkey
613 tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
614 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
615 tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
616 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
617 tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
618 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
619 tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
620 mov $wtmp,@vtmp[1].s[0]
621 eor $roundkey,$wtmp,$wtmp,ror #19
622 eor $roundkey,$roundkey,$wtmp,ror #9
624 eor $roundkey,$roundkey,$wtmp
625 mov $vkey.s[0],$roundkey
627 str $roundkey,[$keys],#4
630 str $roundkey,[$keys],#-4
632 tbl $vkey.16b,{$vkey.16b},$vmap.16b
633 subs $schedules,$schedules,#1
636 .size _vpsm4_set_key,.-_vpsm4_set_key
643 .type _vpsm4_enc_4blks,%function
646 AARCH64_VALID_CALL_TARGET
651 .size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
657 .type _vpsm4_enc_8blks,%function
660 AARCH64_VALID_CALL_TARGET
665 .size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
671 my ($key,$keys)=("x0","x1");
673 .globl ${prefix}_set_encrypt_key
674 .type ${prefix}_set_encrypt_key,%function
676 ${prefix}_set_encrypt_key:
677 AARCH64_SIGN_LINK_REGISTER
678 stp x29,x30,[sp,#-16]!
682 AARCH64_VALIDATE_LINK_REGISTER
684 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
689 my ($key,$keys)=("x0","x1");
691 .globl ${prefix}_set_decrypt_key
692 .type ${prefix}_set_decrypt_key,%function
694 ${prefix}_set_decrypt_key:
695 AARCH64_SIGN_LINK_REGISTER
696 stp x29,x30,[sp,#-16]!
700 AARCH64_VALIDATE_LINK_REGISTER
702 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
709 my ($inp,$outp,$rk)=map("x$_",(0..2));
712 .globl ${prefix}_${dir}crypt
713 .type ${prefix}_${dir}crypt,%function
715 ${prefix}_${dir}crypt:
716 AARCH64_VALID_CALL_TARGET
717 ld1 {@data[0].4s},[$inp]
720 &rev32(@data[0],@data[0]);
724 &encrypt_1blk(@data[0]);
726 st1 {@data[0].4s},[$outp]
728 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
737 my @dat=map("v$_",(16..23));
740 .globl ${prefix}_ecb_encrypt
741 .type ${prefix}_ecb_encrypt,%function
743 ${prefix}_ecb_encrypt:
744 AARCH64_SIGN_LINK_REGISTER
745 // convert length into blocks
755 .Lecb_8_blocks_process:
757 b.lt .Lecb_4_blocks_process
758 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
759 ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
761 &rev32(@data[0],@data[0]);
762 &rev32(@data[1],@data[1]);
763 &rev32(@data[2],@data[2]);
764 &rev32(@data[3],@data[3]);
765 &rev32(@datax[0],@datax[0]);
766 &rev32(@datax[1],@datax[1]);
767 &rev32(@datax[2],@datax[2]);
768 &rev32(@datax[3],@datax[3]);
771 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
772 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
773 subs $blocks,$blocks,#8
774 b.gt .Lecb_8_blocks_process
776 .Lecb_4_blocks_process:
779 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
781 &rev32(@data[0],@data[0]);
782 &rev32(@data[1],@data[1]);
783 &rev32(@data[2],@data[2]);
784 &rev32(@data[3],@data[3]);
787 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
788 sub $blocks,$blocks,#4
790 // process last block
794 ld1 {@data[0].4s},[$inp]
796 &rev32(@data[0],@data[0]);
797 &encrypt_1blk(@data[0]);
799 st1 {@data[0].4s},[$outp]
801 1: // process last 2 blocks
802 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
803 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
807 &rev32(@data[0],@data[0]);
808 &rev32(@data[1],@data[1]);
809 &rev32(@data[2],@data[2]);
810 &rev32(@data[3],@data[3]);
813 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
814 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
816 1: // process last 3 blocks
817 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
819 &rev32(@data[0],@data[0]);
820 &rev32(@data[1],@data[1]);
821 &rev32(@data[2],@data[2]);
822 &rev32(@data[3],@data[3]);
825 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
826 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
827 st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
834 AARCH64_VALIDATE_LINK_REGISTER
836 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
841 my ($len,$ivp,$enc)=("x2","x4","w5");
846 .globl ${prefix}_cbc_encrypt
847 .type ${prefix}_cbc_encrypt,%function
849 ${prefix}_cbc_encrypt:
850 AARCH64_VALID_CALL_TARGET
856 ld1 {$ivec0.4s},[$ivp]
860 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
861 eor @data[0].16b,@data[0].16b,$ivec0.16b
863 &rev32(@data[1],@data[1]);
864 &rev32(@data[0],@data[0]);
865 &rev32(@data[2],@data[2]);
866 &rev32(@data[3],@data[3]);
867 &encrypt_1blk_norev(@data[0]);
869 eor @data[1].16b,@data[1].16b,@data[0].16b
871 &encrypt_1blk_norev(@data[1]);
872 &rev32(@data[0],@data[0]);
875 eor @data[2].16b,@data[2].16b,@data[1].16b
877 &encrypt_1blk_norev(@data[2]);
878 &rev32(@data[1],@data[1]);
880 eor @data[3].16b,@data[3].16b,@data[2].16b
882 &encrypt_1blk_norev(@data[3]);
883 &rev32(@data[2],@data[2]);
884 &rev32(@data[3],@data[3]);
886 orr $ivec0.16b,@data[3].16b,@data[3].16b
887 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
888 subs $blocks,$blocks,#4
889 b.ne .Lcbc_4_blocks_enc
892 subs $blocks,$blocks,#1
894 ld1 {@data[0].4s},[$inp],#16
895 eor $ivec0.16b,$ivec0.16b,@data[0].16b
897 &rev32($ivec0,$ivec0);
898 &encrypt_1blk($ivec0);
900 st1 {$ivec0.4s},[$outp],#16
904 st1 {$ivec0.4s},[$ivp]
908 // decryption mode starts
909 AARCH64_SIGN_LINK_REGISTER
918 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
920 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
922 &rev32(@data[0],@data[0]);
923 &rev32(@data[1],@data[1]);
924 &rev32(@data[2],@data[2]);
925 &rev32(@data[3],$data[3]);
926 &rev32(@datax[0],@datax[0]);
927 &rev32(@datax[1],@datax[1]);
928 &rev32(@datax[2],@datax[2]);
929 &rev32(@datax[3],$datax[3]);
933 &transpose(@vtmp,@datax);
934 &transpose(@data,@datax);
936 ld1 {$ivec1.4s},[$ivp]
937 ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
938 // note ivec1 and vtmpx[3] are reusing the same register
939 // care needs to be taken to avoid conflict
940 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
941 ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
942 eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
943 eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
944 eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
946 st1 {$vtmpx[3].4s}, [$ivp]
947 eor @data[0].16b,@data[0].16b,$datax[3].16b
948 eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
949 eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
950 eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
951 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
952 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
953 subs $blocks,$blocks,#8
954 b.gt .Lcbc_8_blocks_dec
957 ld1 {$ivec1.4s},[$ivp]
961 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
963 &rev32(@data[0],@data[0]);
964 &rev32(@data[1],@data[1]);
965 &rev32(@data[2],@data[2]);
966 &rev32(@data[3],$data[3]);
969 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
971 &transpose(@vtmp,@datax);
973 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
974 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
975 orr $ivec1.16b,@data[3].16b,@data[3].16b
976 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
977 eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
978 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
979 subs $blocks,$blocks,#4
980 b.gt .Lcbc_4_blocks_dec
982 st1 {@data[3].4s}, [$ivp]
985 subs $blocks,$blocks,#1
988 ld1 {@data[0].4s},[$inp],#16
990 st1 {$data[0].4s}, [$ivp]
992 &rev32(@datax[0],@data[0]);
993 &encrypt_1blk(@datax[0]);
995 eor @datax[0].16b,@datax[0].16b,$ivec1.16b
996 st1 {@datax[0].4s},[$outp],#16
998 1: // last two blocks
999 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1001 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1002 subs $blocks,$blocks,1
1005 &rev32(@data[0],@data[0]);
1006 &rev32(@data[1],@data[1]);
1007 &rev32(@data[2],@data[2]);
1008 &rev32(@data[3],@data[3]);
1011 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1013 &transpose(@vtmp,@datax);
1015 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1016 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1017 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1019 st1 {@data[1].4s}, [$ivp]
1022 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1024 &rev32(@data[0],@data[0]);
1025 &rev32(@data[1],@data[1]);
1026 &rev32(@data[2],@data[2]);
1027 &rev32(@data[3],@data[3]);
1030 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1032 &transpose(@vtmp,@datax);
1034 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1035 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1036 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1037 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1039 st1 {@data[2].4s}, [$ivp]
1041 ldp d10,d11,[sp,#16]
1042 ldp d12,d13,[sp,#32]
1043 ldp d14,d15,[sp,#48]
1044 ldp x29,x30,[sp,#64]
1046 AARCH64_VALIDATE_LINK_REGISTER
1048 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1058 .globl ${prefix}_ctr32_encrypt_blocks
1059 .type ${prefix}_ctr32_encrypt_blocks,%function
1061 ${prefix}_ctr32_encrypt_blocks:
1062 AARCH64_VALID_CALL_TARGET
1063 ld1 {$ivec.4s},[$ivp]
1065 &rev32($ivec,$ivec);
1070 // fast processing for one single block without
1071 // context saving overhead
1073 &encrypt_1blk($ivec);
1075 ld1 {@data[0].4s},[$inp]
1076 eor @data[0].16b,@data[0].16b,$ivec.16b
1077 st1 {@data[0].4s},[$outp]
1080 AARCH64_SIGN_LINK_REGISTER
1081 stp d8,d9,[sp,#-80]!
1082 stp d10,d11,[sp,#16]
1083 stp d12,d13,[sp,#32]
1084 stp d14,d15,[sp,#48]
1085 stp x29,x30,[sp,#64]
1086 mov $word0,$ivec.s[0]
1087 mov $word1,$ivec.s[1]
1088 mov $word2,$ivec.s[2]
1090 .Lctr32_4_blocks_process:
1093 dup @data[0].4s,$word0
1094 dup @data[1].4s,$word1
1095 dup @data[2].4s,$word2
1096 mov @data[3].s[0],$ctr
1098 mov $data[3].s[1],$ctr
1100 mov @data[3].s[2],$ctr
1102 mov @data[3].s[3],$ctr
1105 b.ge .Lctr32_8_blocks_process
1107 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1108 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1109 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1110 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1111 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1112 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1113 subs $blocks,$blocks,#4
1114 b.ne .Lctr32_4_blocks_process
1116 .Lctr32_8_blocks_process:
1117 dup @datax[0].4s,$word0
1118 dup @datax[1].4s,$word1
1119 dup @datax[2].4s,$word2
1120 mov @datax[3].s[0],$ctr
1122 mov $datax[3].s[1],$ctr
1124 mov @datax[3].s[2],$ctr
1126 mov @datax[3].s[3],$ctr
1129 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1130 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1131 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1132 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1133 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1134 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1135 eor @data[0].16b,@data[0].16b,@datax[0].16b
1136 eor @data[1].16b,@data[1].16b,@datax[1].16b
1137 eor @data[2].16b,@data[2].16b,@datax[2].16b
1138 eor @data[3].16b,@data[3].16b,@datax[3].16b
1139 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1140 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1141 subs $blocks,$blocks,#8
1142 b.ne .Lctr32_4_blocks_process
1144 1: // last block processing
1145 subs $blocks,$blocks,#1
1148 mov $ivec.s[0],$word0
1149 mov $ivec.s[1],$word1
1150 mov $ivec.s[2],$word2
1153 &encrypt_1blk($ivec);
1155 ld1 {@data[0].4s},[$inp]
1156 eor @data[0].16b,@data[0].16b,$ivec.16b
1157 st1 {@data[0].4s},[$outp]
1159 1: // last 2 blocks processing
1160 dup @data[0].4s,$word0
1161 dup @data[1].4s,$word1
1162 dup @data[2].4s,$word2
1163 mov @data[3].s[0],$ctr
1165 mov @data[3].s[1],$ctr
1166 subs $blocks,$blocks,#1
1169 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1170 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1171 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1172 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1173 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1174 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1175 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1176 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1178 1: // last 3 blocks processing
1180 mov @data[3].s[2],$ctr
1182 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1183 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1184 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1185 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1186 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1187 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1188 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1189 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1190 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1191 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1193 ldp d10,d11,[sp,#16]
1194 ldp d12,d13,[sp,#32]
1195 ldp d14,d15,[sp,#48]
1196 ldp x29,x30,[sp,#64]
1198 AARCH64_VALIDATE_LINK_REGISTER
1200 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1205 my ($blocks,$len)=("x2","x2");
1207 my @twx=map("x$_",(12..27));
1208 my ($rks1,$rks2)=("x26","x27");
1209 my $lastBlk=("x26");
1215 sub gen_xts_cipher() {
1218 .globl ${prefix}_xts_encrypt${std}
1219 .type ${prefix}_xts_encrypt${std},%function
1221 ${prefix}_xts_encrypt${std}:
1222 AARCH64_SIGN_LINK_REGISTER
1223 stp x15, x16, [sp, #-0x10]!
1224 stp x17, x18, [sp, #-0x10]!
1225 stp x19, x20, [sp, #-0x10]!
1226 stp x21, x22, [sp, #-0x10]!
1227 stp x23, x24, [sp, #-0x10]!
1228 stp x25, x26, [sp, #-0x10]!
1229 stp x27, x28, [sp, #-0x10]!
1230 stp x29, x30, [sp, #-0x10]!
1231 stp d8, d9, [sp, #-0x10]!
1232 stp d10, d11, [sp, #-0x10]!
1233 stp d12, d13, [sp, #-0x10]!
1234 stp d14, d15, [sp, #-0x10]!
1238 ld1 {@tweak[0].4s}, [$ivp]
1242 &rev32(@tweak[0],@tweak[0]);
1243 &encrypt_1blk(@tweak[0]);
1246 and $remain,$len,#0x0F
1247 // convert length into blocks
1253 // If the encryption/decryption Length is N times of 16,
1254 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1255 b.eq .xts_encrypt_blocks${std}
1257 // If the encryption/decryption length is not N times of 16,
1258 // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std}
1259 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1260 subs $blocks,$blocks,#1
1261 b.eq .only_2blks_tweak${std}
1262 .xts_encrypt_blocks${std}:
1264 &rbit(@tweak[0],@tweak[0],$std);
1265 &rev32_armeb(@tweak[0],@tweak[0]);
1266 &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1267 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1268 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1269 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1270 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1271 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1272 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1273 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1275 .Lxts_8_blocks_process${std}:
1277 b.lt .Lxts_4_blocks_process${std}
1279 &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1280 &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1281 &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1282 &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1283 &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1284 &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1285 &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1286 &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1288 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1290 &rbit(@vtmp[0],@vtmp[0],$std);
1291 &rbit(@vtmp[1],@vtmp[1],$std);
1292 &rbit(@vtmp[2],@vtmp[2],$std);
1293 &rbit(@vtmp[3],@vtmp[3],$std);
1295 eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1296 eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1297 eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1298 eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1299 ld1 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1301 &rbit(@vtmpx[0],@vtmpx[0],$std);
1302 &rbit(@vtmpx[1],@vtmpx[1],$std);
1303 &rbit(@vtmpx[2],@vtmpx[2],$std);
1304 &rbit(@vtmpx[3],@vtmpx[3],$std);
1306 eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1307 eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1308 eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1309 eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1311 &rev32(@data[0],@data[0]);
1312 &rev32(@data[1],@data[1]);
1313 &rev32(@data[2],@data[2]);
1314 &rev32(@data[3],@data[3]);
1315 &rev32(@datax[0],@datax[0]);
1316 &rev32(@datax[1],@datax[1]);
1317 &rev32(@datax[2],@datax[2]);
1318 &rev32(@datax[3],@datax[3]);
1319 &transpose(@data,@vtmp);
1320 &transpose(@datax,@vtmp);
1322 bl _${prefix}_enc_8blks
1324 &transpose(@vtmp,@datax);
1325 &transpose(@data,@datax);
1327 &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1328 &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1329 &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1330 &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1331 &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1332 &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1333 &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1334 &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1335 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1336 &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1337 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1338 &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1339 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1340 &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1341 &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1342 &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1344 eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1345 eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1346 eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1347 eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1348 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1349 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1350 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1351 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1353 // save the last tweak
1354 st1 {@tweak[3].4s},[$ivp]
1355 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1356 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1357 subs $blocks,$blocks,#8
1358 b.gt .Lxts_8_blocks_process${std}
1360 .Lxts_4_blocks_process${std}:
1362 &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1363 &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1364 &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1365 &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1369 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1371 &rbit(@tweak[0],@tweak[0],$std);
1372 &rbit(@tweak[1],@tweak[1],$std);
1373 &rbit(@tweak[2],@tweak[2],$std);
1374 &rbit(@tweak[3],@tweak[3],$std);
1376 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1377 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1378 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1379 eor @data[3].16b, @data[3].16b, @tweak[3].16b
1381 &rev32(@data[0],@data[0]);
1382 &rev32(@data[1],@data[1]);
1383 &rev32(@data[2],@data[2]);
1384 &rev32(@data[3],@data[3]);
1385 &transpose(@data,@vtmp);
1387 bl _${prefix}_enc_4blks
1389 &transpose(@vtmp,@data);
1391 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1392 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1393 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1394 eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1395 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1396 sub $blocks,$blocks,#4
1398 &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1399 &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1400 &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1402 // save the last tweak
1403 st1 {@tweak[3].4s},[$ivp]
1405 // process last block
1409 ld1 {@data[0].4s},[$inp],#16
1411 &rbit(@tweak[0],@tweak[0],$std);
1413 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1415 &rev32(@data[0],@data[0]);
1416 &encrypt_1blk(@data[0]);
1418 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1419 st1 {@data[0].4s},[$outp],#16
1420 // save the last tweak
1421 st1 {@tweak[0].4s},[$ivp]
1423 1: // process last 2 blocks
1426 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
1428 &rbit(@tweak[0],@tweak[0],$std);
1429 &rbit(@tweak[1],@tweak[1],$std);
1431 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1432 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1434 &rev32(@data[0],@data[0]);
1435 &rev32(@data[1],@data[1]);
1436 &transpose(@data,@vtmp);
1438 bl _${prefix}_enc_4blks
1440 &transpose(@vtmp,@data);
1442 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1443 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1444 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1445 // save the last tweak
1446 st1 {@tweak[1].4s},[$ivp]
1448 1: // process last 3 blocks
1449 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1451 &rbit(@tweak[0],@tweak[0],$std);
1452 &rbit(@tweak[1],@tweak[1],$std);
1453 &rbit(@tweak[2],@tweak[2],$std);
1455 eor @data[0].16b, @data[0].16b, @tweak[0].16b
1456 eor @data[1].16b, @data[1].16b, @tweak[1].16b
1457 eor @data[2].16b, @data[2].16b, @tweak[2].16b
1459 &rev32(@data[0],@data[0]);
1460 &rev32(@data[1],@data[1]);
1461 &rev32(@data[2],@data[2]);
1462 &transpose(@data,@vtmp);
1464 bl _${prefix}_enc_4blks
1466 &transpose(@vtmp,@data);
1468 eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1469 eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1470 eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1471 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1472 // save the last tweak
1473 st1 {@tweak[2].4s},[$ivp]
1478 // This branch calculates the last two tweaks,
1479 // while the encryption/decryption length is larger than 32
1480 .last_2blks_tweak${std}:
1481 ld1 {@tweak[0].4s},[$ivp]
1483 &rev32_armeb(@tweak[0],@tweak[0]);
1484 &compute_tweak_vec(@tweak[0],@tweak[1],$std);
1485 &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1490 // This branch calculates the last two tweaks,
1491 // while the encryption/decryption length is equal to 32, which only needs two tweaks
1492 .only_2blks_tweak${std}:
1493 mov @tweak[1].16b,@tweak[0].16b
1495 &rev32_armeb(@tweak[1],@tweak[1]);
1496 &compute_tweak_vec(@tweak[1],@tweak[2]);
1501 // Determine whether encryption or decryption is required.
1502 // The last two tweaks need to be swapped for decryption.
1504 // encryption:1 decryption:0
1506 b.eq .prcess_last_2blks${std}
1507 mov @vtmp[0].16B,@tweak[1].16b
1508 mov @tweak[1].16B,@tweak[2].16b
1509 mov @tweak[2].16B,@vtmp[0].16b
1511 .prcess_last_2blks${std}:
1513 &rev32_armeb(@tweak[1],@tweak[1]);
1514 &rev32_armeb(@tweak[2],@tweak[2]);
1516 ld1 {@data[0].4s},[$inp],#16
1517 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1519 &rev32(@data[0],@data[0]);
1520 &encrypt_1blk(@data[0]);
1522 eor @data[0].16b, @data[0].16b, @tweak[1].16b
1523 st1 {@data[0].4s},[$outp],#16
1525 sub $lastBlk,$outp,16
1527 subs $remain,$remain,1
1528 ldrb $wtmp0,[$lastBlk,$remain]
1529 ldrb $wtmp1,[$inp,$remain]
1530 strb $wtmp1,[$lastBlk,$remain]
1531 strb $wtmp0,[$outp,$remain]
1533 ld1 {@data[0].4s}, [$lastBlk]
1534 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1536 &rev32(@data[0],@data[0]);
1537 &encrypt_1blk(@data[0]);
1539 eor @data[0].16b, @data[0].16b, @tweak[2].16b
1540 st1 {@data[0].4s}, [$lastBlk]
1542 ldp d14, d15, [sp], #0x10
1543 ldp d12, d13, [sp], #0x10
1544 ldp d10, d11, [sp], #0x10
1545 ldp d8, d9, [sp], #0x10
1546 ldp x29, x30, [sp], #0x10
1547 ldp x27, x28, [sp], #0x10
1548 ldp x25, x26, [sp], #0x10
1549 ldp x23, x24, [sp], #0x10
1550 ldp x21, x22, [sp], #0x10
1551 ldp x19, x20, [sp], #0x10
1552 ldp x17, x18, [sp], #0x10
1553 ldp x15, x16, [sp], #0x10
1554 AARCH64_VALIDATE_LINK_REGISTER
1556 .size ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1558 } # end of gen_xts_cipher
1559 &gen_xts_cipher("_gb");
1560 &gen_xts_cipher("");
1562 ########################################
1566 last if (!s/^#/\/\// and !/^$/);
1571 foreach(split("\n",$code)) {
1572 s/\`([^\`]*)\`/eval($1)/ge;
1576 close STDOUT or die "error closing STDOUT: $!";