2 # Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # This module implements SM4 with ASIMD on aarch64
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
# Standard OpenSSL perlasm preamble: pick off the output file and the
# assembler "flavour" from @ARGV (see the two comment lines above).
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Directory this script lives in; used to locate the arm-xlate.pl translator
# either alongside this file or in the shared ../../perlasm directory.
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
# Pipe everything we print to OUT through the flavour-specific translator.
# NOTE(review): bareword filehandle and 2-arg-style open are legacy perlasm
# convention kept for consistency with the other OpenSSL asm generators.
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26 or die "can't call $xlate: $!";
# Register allocation for the generated AArch64 ASIMD code.
# v0-v3   : scratch vectors shared by all routines
30 my @vtmp=map("v$_",(0..3));
# v4-v7   : primary 4-block data lanes
31 my @data=map("v$_",(4..7));
# v8-v11  : second set of 4-block data lanes (8-block paths)
32 my @datax=map("v$_",(8..11));
# v12-v15 : round-key broadcasts and rk accumulators...
33 my ($rk0,$rk1)=("v12","v13");
34 my ($rka,$rkb)=("v14","v15");
# ...which deliberately alias @vtmpx (also v12-v15); the two uses must not
# be live at the same time (see the "reusing the same register" note below
# in the CBC path).
35 my @vtmpx=map("v$_",(12..15));
# v16-v31 : the 256-byte SM4 S-box, loaded once and indexed with tbl/tbx
36 my @sbox=map("v$_",(16..31));
# Scalar registers: x0-x3 carry the public function arguments
# (input ptr, output ptr, 32-bit block count, round-key ptr).
37 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
# Scratch scalars for the one-word S-box path and round computations.
38 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
39 my ($ptr,$counter)=("x10","w11");
# One SM4 state word per register for the scalar single-block path.
40 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
46 if ($src and ("$src" ne "$dst")) {
49 rev32 $dst.16b,$src.16b
57 rev32 $dst.16b,$dst.16b
64 my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
67 zip1 $vt0.4s,$dat0.4s,$dat1.4s
68 zip2 $vt1.4s,$dat0.4s,$dat1.4s
69 zip1 $vt2.4s,$dat2.4s,$dat3.4s
70 zip2 $vt3.4s,$dat2.4s,$dat3.4s
71 zip1 $dat0.2d,$vt0.2d,$vt2.2d
72 zip2 $dat1.2d,$vt0.2d,$vt2.2d
73 zip1 $dat2.2d,$vt1.2d,$vt3.2d
74 zip2 $dat3.2d,$vt1.2d,$vt3.2d
78 # sbox operations for 4-lane of words
84 movi @vtmp[1].16b,#128
85 movi @vtmp[2].16b,#192
86 sub @vtmp[0].16b,$dat.16b,@vtmp[0].16b
87 sub @vtmp[1].16b,$dat.16b,@vtmp[1].16b
88 sub @vtmp[2].16b,$dat.16b,@vtmp[2].16b
89 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
90 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
91 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
92 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
93 add @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
94 add @vtmp[2].2d,@vtmp[2].2d,$dat.2d
95 add $dat.2d,@vtmp[0].2d,@vtmp[2].2d
97 ushr @vtmp[0].4s,$dat.4s,32-2
98 sli @vtmp[0].4s,$dat.4s,2
99 ushr @vtmp[2].4s,$dat.4s,32-10
100 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
101 sli @vtmp[2].4s,$dat.4s,10
102 eor @vtmp[1].16b,@vtmp[2].16b,$vtmp[1].16b
103 ushr @vtmp[0].4s,$dat.4s,32-18
104 sli @vtmp[0].4s,$dat.4s,18
105 ushr @vtmp[2].4s,$dat.4s,32-24
106 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
107 sli @vtmp[2].4s,$dat.4s,24
108 eor $dat.16b,@vtmp[2].16b,@vtmp[1].16b
112 # sbox operation for 8-lane of words
118 movi @vtmp[3].16b,#64
119 sub @vtmp[0].16b,$dat.16b,@vtmp[3].16b
120 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
121 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
122 tbl $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
123 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
124 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
125 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
126 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
127 add $dat.2d,@vtmp[2].2d,$dat.2d
128 add $dat.2d,@vtmp[1].2d,$dat.2d
130 sub @vtmp[0].16b,$datx.16b,@vtmp[3].16b
131 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
132 sub @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
133 tbl $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
134 tbl @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
135 tbl @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
136 tbl @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
137 add @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
138 add $datx.2d,@vtmp[2].2d,$datx.2d
139 add $datx.2d,@vtmp[1].2d,$datx.2d
141 ushr @vtmp[0].4s,$dat.4s,32-2
142 sli @vtmp[0].4s,$dat.4s,2
143 ushr @vtmp[2].4s,$datx.4s,32-2
144 eor @vtmp[1].16b,@vtmp[0].16b,$dat.16b
145 sli @vtmp[2].4s,$datx.4s,2
147 ushr @vtmp[0].4s,$dat.4s,32-10
148 eor @vtmp[3].16b,@vtmp[2].16b,$datx.16b
149 sli @vtmp[0].4s,$dat.4s,10
150 ushr @vtmp[2].4s,$datx.4s,32-10
151 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
152 sli @vtmp[2].4s,$datx.4s,10
154 ushr @vtmp[0].4s,$dat.4s,32-18
155 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
156 sli @vtmp[0].4s,$dat.4s,18
157 ushr @vtmp[2].4s,$datx.4s,32-18
158 eor @vtmp[1].16b,@vtmp[0].16b,$vtmp[1].16b
159 sli @vtmp[2].4s,$datx.4s,18
161 ushr @vtmp[0].4s,$dat.4s,32-24
162 eor @vtmp[3].16b,@vtmp[2].16b,$vtmp[3].16b
163 sli @vtmp[0].4s,$dat.4s,24
164 ushr @vtmp[2].4s,$datx.4s,32-24
165 eor $dat.16b,@vtmp[0].16b,@vtmp[1].16b
166 sli @vtmp[2].4s,$datx.4s,24
167 eor $datx.16b,@vtmp[2].16b,@vtmp[3].16b
171 # sbox operation for one single word
176 movi @vtmp[1].16b,#64
177 movi @vtmp[2].16b,#128
178 movi @vtmp[3].16b,#192
179 mov @vtmp[0].s[0],$word
181 sub @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
182 sub @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
183 sub @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
185 tbl @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
186 tbl @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
187 tbl @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
188 tbl @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
190 mov $word,@vtmp[0].s[0]
191 mov $wtmp0,@vtmp[1].s[0]
192 mov $wtmp2,@vtmp[2].s[0]
193 add $wtmp0,$word,$wtmp0
194 mov $word,@vtmp[3].s[0]
195 add $wtmp0,$wtmp0,$wtmp2
196 add $wtmp0,$wtmp0,$word
198 eor $word,$wtmp0,$wtmp0,ror #32-2
199 eor $word,$word,$wtmp0,ror #32-10
200 eor $word,$word,$wtmp0,ror #32-18
201 eor $word,$word,$wtmp0,ror #32-24
205 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
210 ldp $wtmp0,$wtmp1,[$kptr],8
211 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
212 eor $tmpw,$word2,$word3
213 eor $wtmp2,$wtmp0,$word1
214 eor $tmpw,$tmpw,$wtmp2
218 eor $word0,$word0,$tmpw
219 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
220 eor $tmpw,$word2,$word3
221 eor $wtmp2,$word0,$wtmp1
222 eor $tmpw,$tmpw,$wtmp2
226 ldp $wtmp0,$wtmp1,[$kptr],8
227 eor $word1,$word1,$tmpw
228 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
229 eor $tmpw,$word0,$word1
230 eor $wtmp2,$wtmp0,$word3
231 eor $tmpw,$tmpw,$wtmp2
235 eor $word2,$word2,$tmpw
236 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
237 eor $tmpw,$word0,$word1
238 eor $wtmp2,$word2,$wtmp1
239 eor $tmpw,$tmpw,$wtmp2
243 eor $word3,$word3,$tmpw
247 # sm4 for 4-lanes of data, in neon registers data0/data1/data2/data3
252 ldp $wtmp0,$wtmp1,[$kptr],8
256 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
257 eor $rka.16b,@data[2].16b,@data[3].16b
258 eor $rk0.16b,@data[1].16b,$rk0.16b
259 eor $rk0.16b,$rka.16b,$rk0.16b
263 eor @data[0].16b,@data[0].16b,$rk0.16b
265 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
266 eor $rka.16b,$rka.16b,@data[0].16b
267 eor $rk1.16b,$rka.16b,$rk1.16b
271 ldp $wtmp0,$wtmp1,[$kptr],8
272 eor @data[1].16b,@data[1].16b,$rk1.16b
277 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
278 eor $rka.16b,@data[0].16b,@data[1].16b
279 eor $rk0.16b,@data[3].16b,$rk0.16b
280 eor $rk0.16b,$rka.16b,$rk0.16b
284 eor @data[2].16b,@data[2].16b,$rk0.16b
286 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
287 eor $rka.16b,$rka.16b,@data[2].16b
288 eor $rk1.16b,$rka.16b,$rk1.16b
292 eor @data[3].16b,@data[3].16b,$rk1.16b
296 # sm4 for 8 lanes of data, in neon registers
297 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
302 ldp $wtmp0,$wtmp1,[$kptr],8
303 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
305 eor $rka.16b,@data[2].16b,@data[3].16b
306 eor $rkb.16b,@datax[2].16b,@datax[3].16b
307 eor @vtmp[0].16b,@data[1].16b,$rk0.16b
308 eor @vtmp[1].16b,@datax[1].16b,$rk0.16b
309 eor $rk0.16b,$rka.16b,@vtmp[0].16b
310 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
312 &sbox_double($rk0,$rk1);
314 eor @data[0].16b,@data[0].16b,$rk0.16b
315 eor @datax[0].16b,@datax[0].16b,$rk1.16b
317 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
319 eor $rka.16b,$rka.16b,@data[0].16b
320 eor $rkb.16b,$rkb.16b,@datax[0].16b
321 eor $rk0.16b,$rka.16b,$rk1.16b
322 eor $rk1.16b,$rkb.16b,$rk1.16b
324 &sbox_double($rk0,$rk1);
326 ldp $wtmp0,$wtmp1,[$kptr],8
327 eor @data[1].16b,@data[1].16b,$rk0.16b
328 eor @datax[1].16b,@datax[1].16b,$rk1.16b
330 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
332 eor $rka.16b,@data[0].16b,@data[1].16b
333 eor $rkb.16b,@datax[0].16b,@datax[1].16b
334 eor @vtmp[0].16b,@data[3].16b,$rk0.16b
335 eor @vtmp[1].16b,@datax[3].16b,$rk0.16b
336 eor $rk0.16b,$rka.16b,@vtmp[0].16b
337 eor $rk1.16b,$rkb.16b,@vtmp[1].16b
339 &sbox_double($rk0,$rk1);
341 eor @data[2].16b,@data[2].16b,$rk0.16b
342 eor @datax[2].16b,@datax[2].16b,$rk1.16b
344 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
346 eor $rka.16b,$rka.16b,@data[2].16b
347 eor $rkb.16b,$rkb.16b,@datax[2].16b
348 eor $rk0.16b,$rka.16b,$rk1.16b
349 eor $rk1.16b,$rkb.16b,$rk1.16b
351 &sbox_double($rk0,$rk1);
353 eor @data[3].16b,@data[3].16b,$rk0.16b
354 eor @datax[3].16b,@datax[3].16b,$rk1.16b
358 sub encrypt_1blk_norev() {
372 subs $counter,$counter,#1
384 &encrypt_1blk_norev($dat);
388 sub encrypt_4blks() {
396 subs $counter,$counter,#1
399 &rev32(@vtmp[3],@data[0]);
400 &rev32(@vtmp[2],@data[1]);
401 &rev32(@vtmp[1],@data[2]);
402 &rev32(@vtmp[0],@data[3]);
405 sub encrypt_8blks() {
413 subs $counter,$counter,#1
416 &rev32(@vtmp[3],@data[0]);
417 &rev32(@vtmp[2],@data[1]);
418 &rev32(@vtmp[1],@data[2]);
419 &rev32(@vtmp[0],@data[3]);
420 &rev32(@data[3],@datax[0]);
421 &rev32(@data[2],@datax[1]);
422 &rev32(@data[1],@datax[2]);
423 &rev32(@data[0],@datax[3]);
431 ld1 {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
432 ld1 {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
433 ld1 {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
434 ld1 {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
439 #include "arm_arch.h"
443 .type _vpsm4_consts,%object
447 .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
448 .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
449 .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
450 .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
451 .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
452 .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
453 .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
454 .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
455 .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
456 .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
457 .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
458 .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
459 .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
460 .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
461 .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
462 .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
464 .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
465 .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
466 .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
467 .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
468 .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
469 .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
470 .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
471 .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
473 .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
475 .dword 0x0B0A090807060504,0x030201000F0E0D0C
477 .size _vpsm4_consts,.-_vpsm4_consts
481 my ($key,$keys,$enc)=("x0","x1","w2");
482 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
483 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
485 .type _vpsm4_set_key,%function
488 AARCH64_VALID_CALL_TARGET
489 ld1 {$vkey.4s},[$key]
494 adr $pointer,.Lshuffles
495 ld1 {$vmap.4s},[$pointer]
497 ld1 {$vfk.4s},[$pointer]
498 eor $vkey.16b,$vkey.16b,$vfk.16b
501 movi @vtmp[0].16b,#64
506 ldr $roundkey,[$pointer],#4
507 eor $roundkey,$roundkey,$wtmp
509 eor $roundkey,$roundkey,$wtmp
511 eor $roundkey,$roundkey,$wtmp
513 mov @data[0].s[0],$roundkey
514 tbl @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
515 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
516 tbx @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
517 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
518 tbx @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
519 sub @data[0].16b,@data[0].16b,@vtmp[0].16b
520 tbx @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
521 mov $wtmp,@vtmp[1].s[0]
522 eor $roundkey,$wtmp,$wtmp,ror #19
523 eor $roundkey,$roundkey,$wtmp,ror #9
525 eor $roundkey,$roundkey,$wtmp
526 mov $vkey.s[0],$roundkey
528 str $roundkey,[$keys],#4
531 str $roundkey,[$keys],#-4
533 tbl $vkey.16b,{$vkey.16b},$vmap.16b
534 subs $schedules,$schedules,#1
537 .size _vpsm4_set_key,.-_vpsm4_set_key
544 .type _vpsm4_enc_4blks,%function
547 AARCH64_VALID_CALL_TARGET
552 .size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
558 .type _vpsm4_enc_8blks,%function
561 AARCH64_VALID_CALL_TARGET
566 .size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
572 my ($key,$keys)=("x0","x1");
574 .globl ${prefix}_set_encrypt_key
575 .type ${prefix}_set_encrypt_key,%function
577 ${prefix}_set_encrypt_key:
578 AARCH64_SIGN_LINK_REGISTER
579 stp x29,x30,[sp,#-16]!
583 AARCH64_VALIDATE_LINK_REGISTER
585 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
590 my ($key,$keys)=("x0","x1");
592 .globl ${prefix}_set_decrypt_key
593 .type ${prefix}_set_decrypt_key,%function
595 ${prefix}_set_decrypt_key:
596 AARCH64_SIGN_LINK_REGISTER
597 stp x29,x30,[sp,#-16]!
601 AARCH64_VALIDATE_LINK_REGISTER
603 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
610 my ($inp,$outp,$rk)=map("x$_",(0..2));
613 .globl ${prefix}_${dir}crypt
614 .type ${prefix}_${dir}crypt,%function
616 ${prefix}_${dir}crypt:
617 AARCH64_VALID_CALL_TARGET
618 ld1 {@data[0].16b},[$inp]
621 &rev32(@data[0],@data[0]);
625 &encrypt_1blk(@data[0]);
627 st1 {@data[0].16b},[$outp]
629 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
638 my @dat=map("v$_",(16..23));
641 .globl ${prefix}_ecb_encrypt
642 .type ${prefix}_ecb_encrypt,%function
644 ${prefix}_ecb_encrypt:
645 AARCH64_SIGN_LINK_REGISTER
646 // convert length into blocks
656 .Lecb_8_blocks_process:
658 b.lt .Lecb_4_blocks_process
659 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
660 ld4 {@datax[0].4s,$datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
662 &rev32(@data[0],@data[0]);
663 &rev32(@data[1],@data[1]);
664 &rev32(@data[2],@data[2]);
665 &rev32(@data[3],@data[3]);
666 &rev32(@datax[0],@datax[0]);
667 &rev32(@datax[1],@datax[1]);
668 &rev32(@datax[2],@datax[2]);
669 &rev32(@datax[3],@datax[3]);
672 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
673 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
674 subs $blocks,$blocks,#8
675 b.gt .Lecb_8_blocks_process
677 .Lecb_4_blocks_process:
680 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
682 &rev32(@data[0],@data[0]);
683 &rev32(@data[1],@data[1]);
684 &rev32(@data[2],@data[2]);
685 &rev32(@data[3],@data[3]);
688 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
689 sub $blocks,$blocks,#4
691 // process last block
695 ld1 {@data[0].16b},[$inp]
697 &rev32(@data[0],@data[0]);
698 &encrypt_1blk(@data[0]);
700 st1 {@data[0].16b},[$outp]
702 1: // process last 2 blocks
703 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
704 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
708 &rev32(@data[0],@data[0]);
709 &rev32(@data[1],@data[1]);
710 &rev32(@data[2],@data[2]);
711 &rev32(@data[3],@data[3]);
714 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
715 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
717 1: // process last 3 blocks
718 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
720 &rev32(@data[0],@data[0]);
721 &rev32(@data[1],@data[1]);
722 &rev32(@data[2],@data[2]);
723 &rev32(@data[3],@data[3]);
726 st4 {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
727 st4 {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
728 st4 {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
735 AARCH64_VALIDATE_LINK_REGISTER
737 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
742 my ($len,$ivp,$enc)=("x2","x4","w5");
747 .globl ${prefix}_cbc_encrypt
748 .type ${prefix}_cbc_encrypt,%function
750 ${prefix}_cbc_encrypt:
751 AARCH64_VALID_CALL_TARGET
757 ld1 {$ivec0.4s},[$ivp]
761 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
762 eor @data[0].16b,@data[0].16b,$ivec0.16b
764 &rev32(@data[1],@data[1]);
765 &rev32(@data[0],@data[0]);
766 &rev32(@data[2],@data[2]);
767 &rev32(@data[3],@data[3]);
768 &encrypt_1blk_norev(@data[0]);
770 eor @data[1].16b,@data[1].16b,@data[0].16b
772 &encrypt_1blk_norev(@data[1]);
773 &rev32(@data[0],@data[0]);
776 eor @data[2].16b,@data[2].16b,@data[1].16b
778 &encrypt_1blk_norev(@data[2]);
779 &rev32(@data[1],@data[1]);
781 eor @data[3].16b,@data[3].16b,@data[2].16b
783 &encrypt_1blk_norev(@data[3]);
784 &rev32(@data[2],@data[2]);
785 &rev32(@data[3],@data[3]);
787 orr $ivec0.16b,@data[3].16b,@data[3].16b
788 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
789 subs $blocks,$blocks,#4
790 b.ne .Lcbc_4_blocks_enc
793 subs $blocks,$blocks,#1
795 ld1 {@data[0].4s},[$inp],#16
796 eor $ivec0.16b,$ivec0.16b,@data[0].16b
798 &rev32($ivec0,$ivec0);
799 &encrypt_1blk($ivec0);
801 st1 {$ivec0.16b},[$outp],#16
805 st1 {$ivec0.16b},[$ivp]
809 // decryption mode starts
810 AARCH64_SIGN_LINK_REGISTER
819 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
821 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
823 &rev32(@data[0],@data[0]);
824 &rev32(@data[1],@data[1]);
825 &rev32(@data[2],@data[2]);
826 &rev32(@data[3],$data[3]);
827 &rev32(@datax[0],@datax[0]);
828 &rev32(@datax[1],@datax[1]);
829 &rev32(@datax[2],@datax[2]);
830 &rev32(@datax[3],$datax[3]);
834 &transpose(@vtmp,@datax);
835 &transpose(@data,@datax);
837 ld1 {$ivec1.16b},[$ivp]
838 ld1 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
840 // note ivec1 and vtmpx[3] are reusing the same register
840 // care needs to be taken to avoid conflict
841 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
842 ld1 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
843 eor @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
844 eor @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
845 eor @vtmp[3].16b,$vtmp[3].16b,@datax[2].16b
847 st1 {$vtmpx[3].16b}, [$ivp]
848 eor @data[0].16b,@data[0].16b,$datax[3].16b
849 eor @data[1].16b,@data[1].16b,@vtmpx[0].16b
850 eor @data[2].16b,@data[2].16b,@vtmpx[1].16b
851 eor @data[3].16b,$data[3].16b,@vtmpx[2].16b
852 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
853 st1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
854 subs $blocks,$blocks,#8
855 b.gt .Lcbc_8_blocks_dec
858 ld1 {$ivec1.16b},[$ivp]
862 ld4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
864 &rev32(@data[0],@data[0]);
865 &rev32(@data[1],@data[1]);
866 &rev32(@data[2],@data[2]);
867 &rev32(@data[3],$data[3]);
870 ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
872 &transpose(@vtmp,@datax);
874 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
875 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
876 orr $ivec1.16b,@data[3].16b,@data[3].16b
877 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
878 eor @vtmp[3].16b,$vtmp[3].16b,@data[2].16b
879 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
880 subs $blocks,$blocks,#4
881 b.gt .Lcbc_4_blocks_dec
883 st1 {@vtmp[3].16b}, [$ivp]
886 subs $blocks,$blocks,#1
889 ld1 {@data[0].4s},[$inp],#16
891 st1 {$data[0].16b}, [$ivp]
893 &rev32(@datax[0],@data[0]);
894 &encrypt_1blk(@datax[0]);
896 eor @datax[0].16b,@datax[0].16b,$ivec1.16b
897 st1 {@datax[0].16b},[$outp],#16
899 1: // last two blocks
900 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
902 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
903 subs $blocks,$blocks,1
906 &rev32(@data[0],@data[0]);
907 &rev32(@data[1],@data[1]);
908 &rev32(@data[2],@data[2]);
909 &rev32(@data[3],@data[3]);
912 ld1 {@data[0].4s,@data[1].4s},[$inp],#32
914 &transpose(@vtmp,@datax);
916 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
917 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
918 st1 {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
920 st1 {@data[1].16b}, [$ivp]
923 ld4 {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
925 &rev32(@data[0],@data[0]);
926 &rev32(@data[1],@data[1]);
927 &rev32(@data[2],@data[2]);
928 &rev32(@data[3],@data[3]);
931 ld1 {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
933 &transpose(@vtmp,@datax);
935 eor @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
936 eor @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
937 eor @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
938 st1 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
940 st1 {@data[2].16b}, [$ivp]
947 AARCH64_VALIDATE_LINK_REGISTER
949 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
959 .globl ${prefix}_ctr32_encrypt_blocks
960 .type ${prefix}_ctr32_encrypt_blocks,%function
962 ${prefix}_ctr32_encrypt_blocks:
963 AARCH64_VALID_CALL_TARGET
964 ld1 {$ivec.4s},[$ivp]
971 // fast processing for one single block without
972 // context saving overhead
974 &encrypt_1blk($ivec);
976 ld1 {@data[0].16b},[$inp]
977 eor @data[0].16b,@data[0].16b,$ivec.16b
978 st1 {@data[0].16b},[$outp]
981 AARCH64_SIGN_LINK_REGISTER
987 mov $word0,$ivec.s[0]
988 mov $word1,$ivec.s[1]
989 mov $word2,$ivec.s[2]
991 .Lctr32_4_blocks_process:
994 dup @data[0].4s,$word0
995 dup @data[1].4s,$word1
996 dup @data[2].4s,$word2
997 mov @data[3].s[0],$ctr
999 mov $data[3].s[1],$ctr
1001 mov @data[3].s[2],$ctr
1003 mov @data[3].s[3],$ctr
1006 b.ge .Lctr32_8_blocks_process
1008 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1009 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1010 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1011 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1012 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1013 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1014 subs $blocks,$blocks,#4
1015 b.ne .Lctr32_4_blocks_process
1017 .Lctr32_8_blocks_process:
1018 dup @datax[0].4s,$word0
1019 dup @datax[1].4s,$word1
1020 dup @datax[2].4s,$word2
1021 mov @datax[3].s[0],$ctr
1023 mov $datax[3].s[1],$ctr
1025 mov @datax[3].s[2],$ctr
1027 mov @datax[3].s[3],$ctr
1030 ld4 {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1031 ld4 {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1032 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1033 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1034 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1035 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1036 eor @data[0].16b,@data[0].16b,@datax[0].16b
1037 eor @data[1].16b,@data[1].16b,@datax[1].16b
1038 eor @data[2].16b,@data[2].16b,@datax[2].16b
1039 eor @data[3].16b,@data[3].16b,@datax[3].16b
1040 st4 {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1041 st4 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1042 subs $blocks,$blocks,#8
1043 b.ne .Lctr32_4_blocks_process
1045 1: // last block processing
1046 subs $blocks,$blocks,#1
1049 mov $ivec.s[0],$word0
1050 mov $ivec.s[1],$word1
1051 mov $ivec.s[2],$word2
1054 &encrypt_1blk($ivec);
1056 ld1 {@data[0].16b},[$inp]
1057 eor @data[0].16b,@data[0].16b,$ivec.16b
1058 st1 {@data[0].16b},[$outp]
1060 1: // last 2 blocks processing
1061 dup @data[0].4s,$word0
1062 dup @data[1].4s,$word1
1063 dup @data[2].4s,$word2
1064 mov @data[3].s[0],$ctr
1066 mov @data[3].s[1],$ctr
1067 subs $blocks,$blocks,#1
1070 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1071 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1072 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1073 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1074 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1075 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1076 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1077 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1079 1: // last 3 blocks processing
1081 mov @data[3].s[2],$ctr
1083 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1084 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1085 ld4 {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1086 eor @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1087 eor @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1088 eor @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1089 eor @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1090 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1091 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1092 st4 {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1094 ldp d10,d11,[sp,#16]
1095 ldp d12,d13,[sp,#32]
1096 ldp d14,d15,[sp,#48]
1097 ldp x29,x30,[sp,#64]
1099 AARCH64_VALIDATE_LINK_REGISTER
1101 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1104 ########################################
1108 last if (!s/^#/\/\// and !/^$/);
1113 foreach(split("\n",$code)) {
1114 s/\`([^\`]*)\`/eval($1)/ge;
1118 close STDOUT or die "error closing STDOUT: $!";