1 #! /usr/bin/env perl
2 # Copyright 2022-2023 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # This module implements SM4 with ASIMD on aarch64
11 #
12 # Feb 2022
13 #
14
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
24
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26     or die "can't call $xlate: $!";
27 *STDOUT=*OUT;
28
29 $prefix="vpsm4";
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
36 my @vtmpx=map("v$_",(12..15));
37 my @sbox=map("v$_",(16..31));
38 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40 my ($xtmp1,$xtmp2)=("x8","x9");
41 my ($ptr,$counter)=("x10","w11");
42 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43
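# rev32 byte-swaps each 32-bit lane, so little-endian input is presented as
# the big-endian words SM4 operates on; on big-endian (__AARCH64EB__) builds
# the bytes are already in that order and only a mov (if any) is emitted.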
44 sub rev32() {
45         my $dst = shift;
46         my $src = shift;
47
48         if ($src and ("$src" ne "$dst")) {
49 $code.=<<___;
50 #ifndef __AARCH64EB__
51         rev32   $dst.16b,$src.16b
52 #else
53         mov     $dst.16b,$src.16b
54 #endif
55 ___
56         } else {
57 $code.=<<___;
58 #ifndef __AARCH64EB__
59         rev32   $dst.16b,$dst.16b
60 #endif
61 ___
62         }
63 }
64
65 sub rev32_armeb() {
66         my $dst = shift;
67         my $src = shift;
68
69         if ($src and ("$src" ne "$dst")) {
70 $code.=<<___;
71 #ifdef __AARCH64EB__
72         rev32   $dst.16b,$src.16b
73 #else
74         mov     $dst.16b,$src.16b
75 #endif
76 ___
77         } else {
78 $code.=<<___;
79 #ifdef __AARCH64EB__
80         rev32   $dst.16b,$dst.16b
81 #endif
82 ___
83         }
84 }
85
86 sub rbit() {
87         my $dst = shift;
88         my $src = shift;
89         my $std = shift;
90
91         if ($src and ("$src" ne "$dst")) {
92                 if ($std eq "_gb") {
93 $code.=<<___;
94                         rbit $dst.16b,$src.16b
95 ___
96                 } else {
97 $code.=<<___;
98                         mov $dst.16b,$src.16b
99 ___
100                 }
101         } else {
102                 if ($std eq "_gb") {
103 $code.=<<___;
104                         rbit $dst.16b,$dst.16b
105 ___
106                 }
107         }
108 }
109
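# transpose performs a 4x4 transpose of 32-bit elements using zip1/zip2
# pairs, converting four vectors that hold one block each into four vectors
# that hold one word-lane of every block (the layout used by ld4/st4), and back.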
110 sub transpose() {
111         my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
112
113 $code.=<<___;
114         zip1    $vt0.4s,$dat0.4s,$dat1.4s
115         zip2    $vt1.4s,$dat0.4s,$dat1.4s
116         zip1    $vt2.4s,$dat2.4s,$dat3.4s
117         zip2    $vt3.4s,$dat2.4s,$dat3.4s
118         zip1    $dat0.2d,$vt0.2d,$vt2.2d
119         zip2    $dat1.2d,$vt0.2d,$vt2.2d
120         zip1    $dat2.2d,$vt1.2d,$vt3.2d
121         zip2    $dat3.2d,$vt1.2d,$vt3.2d
122 ___
123 }
124
125 # sbox operations for 4 lanes of words
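# The 256-byte SM4 S-box is held in @sbox[0..15] (16 vectors of 16 bytes).
# A tbl instruction can only index a 64-byte (four-register) table and
# returns 0 for out-of-range indices, so each lookup is split into four
# 64-entry lookups on the index minus 0/64/128/192; every lane is in range
# for exactly one of them, so adding the partial results recombines the
# full 256-entry lookup.  The substitution is followed by SM4's linear
# transform
#   L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24),
# implemented with the ushr/sli pairs below.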
126 sub sbox() {
127         my $dat = shift;
128
129 $code.=<<___;
130         movi    @vtmp[0].16b,#64
131         movi    @vtmp[1].16b,#128
132         movi    @vtmp[2].16b,#192
133         sub     @vtmp[0].16b,$dat.16b,@vtmp[0].16b
134         sub     @vtmp[1].16b,$dat.16b,@vtmp[1].16b
135         sub     @vtmp[2].16b,$dat.16b,@vtmp[2].16b
136         tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140         add     @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141         add     @vtmp[2].2d,@vtmp[2].2d,$dat.2d
142         add     $dat.2d,@vtmp[0].2d,@vtmp[2].2d
143
144         ushr    @vtmp[0].4s,$dat.4s,32-2
145         sli     @vtmp[0].4s,$dat.4s,2
146         ushr    @vtmp[2].4s,$dat.4s,32-10
147         eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
148         sli     @vtmp[2].4s,$dat.4s,10
149         eor     @vtmp[1].16b,@vtmp[2].16b,@vtmp[1].16b
150         ushr    @vtmp[0].4s,$dat.4s,32-18
151         sli     @vtmp[0].4s,$dat.4s,18
152         ushr    @vtmp[2].4s,$dat.4s,32-24
153         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
154         sli     @vtmp[2].4s,$dat.4s,24
155         eor     $dat.16b,@vtmp[2].16b,@vtmp[1].16b
156 ___
157 }
158
159 # sbox operation for 8 lanes of words
160 sub sbox_double() {
161         my $dat = shift;
162         my $datx = shift;
163
164 $code.=<<___;
165         movi    @vtmp[3].16b,#64
166         sub     @vtmp[0].16b,$dat.16b,@vtmp[3].16b
167         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168         sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169         tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173         add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174         add     $dat.2d,@vtmp[2].2d,$dat.2d
175         add     $dat.2d,@vtmp[1].2d,$dat.2d
176
177         sub     @vtmp[0].16b,$datx.16b,@vtmp[3].16b
178         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179         sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180         tbl     $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184         add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185         add     $datx.2d,@vtmp[2].2d,$datx.2d
186         add     $datx.2d,@vtmp[1].2d,$datx.2d
187
188         ushr    @vtmp[0].4s,$dat.4s,32-2
189         sli     @vtmp[0].4s,$dat.4s,2
190         ushr    @vtmp[2].4s,$datx.4s,32-2
191         eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
192         sli     @vtmp[2].4s,$datx.4s,2
193
194         ushr    @vtmp[0].4s,$dat.4s,32-10
195         eor     @vtmp[3].16b,@vtmp[2].16b,$datx.16b
196         sli     @vtmp[0].4s,$dat.4s,10
197         ushr    @vtmp[2].4s,$datx.4s,32-10
198         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
199         sli     @vtmp[2].4s,$datx.4s,10
200
201         ushr    @vtmp[0].4s,$dat.4s,32-18
202         eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
203         sli     @vtmp[0].4s,$dat.4s,18
204         ushr    @vtmp[2].4s,$datx.4s,32-18
205         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
206         sli     @vtmp[2].4s,$datx.4s,18
207
208         ushr    @vtmp[0].4s,$dat.4s,32-24
209         eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
210         sli     @vtmp[0].4s,$dat.4s,24
211         ushr    @vtmp[2].4s,$datx.4s,32-24
212         eor     $dat.16b,@vtmp[0].16b,@vtmp[1].16b
213         sli     @vtmp[2].4s,$datx.4s,24
214         eor     $datx.16b,@vtmp[2].16b,@vtmp[3].16b
215 ___
216 }
217
218 # sbox operation for a single word
219 sub sbox_1word () {
220         my $word = shift;
221
222 $code.=<<___;
223         movi    @vtmp[1].16b,#64
224         movi    @vtmp[2].16b,#128
225         movi    @vtmp[3].16b,#192
226         mov     @vtmp[0].s[0],$word
227
228         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229         sub     @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230         sub     @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
231
232         tbl     @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233         tbl     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234         tbl     @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235         tbl     @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
236
237         mov     $word,@vtmp[0].s[0]
238         mov     $wtmp0,@vtmp[1].s[0]
239         mov     $wtmp2,@vtmp[2].s[0]
240         add     $wtmp0,$word,$wtmp0
241         mov     $word,@vtmp[3].s[0]
242         add     $wtmp0,$wtmp0,$wtmp2
243         add     $wtmp0,$wtmp0,$word
244
245         eor     $word,$wtmp0,$wtmp0,ror #32-2
246         eor     $word,$word,$wtmp0,ror #32-10
247         eor     $word,$word,$wtmp0,ror #32-18
248         eor     $word,$word,$wtmp0,ror #32-24
249 ___
250 }
251
252 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
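# Each of the 32 rounds updates one word:
#   X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]),  with T = L(Sbox(.)),
# and the result is the last four words in reverse order (X35,X34,X33,X32),
# which is why encrypt_1blk_norev writes word3..word0 back in reverse.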
253 sub sm4_1blk () {
254         my $kptr = shift;
255
256 $code.=<<___;
257         ldp     $wtmp0,$wtmp1,[$kptr],8
258         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259         eor     $tmpw,$word2,$word3
260         eor     $wtmp2,$wtmp0,$word1
261         eor     $tmpw,$tmpw,$wtmp2
262 ___
263         &sbox_1word($tmpw);
264 $code.=<<___;
265         eor     $word0,$word0,$tmpw
266         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267         eor     $tmpw,$word2,$word3
268         eor     $wtmp2,$word0,$wtmp1
269         eor     $tmpw,$tmpw,$wtmp2
270 ___
271         &sbox_1word($tmpw);
272 $code.=<<___;
273         ldp     $wtmp0,$wtmp1,[$kptr],8
274         eor     $word1,$word1,$tmpw
275         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276         eor     $tmpw,$word0,$word1
277         eor     $wtmp2,$wtmp0,$word3
278         eor     $tmpw,$tmpw,$wtmp2
279 ___
280         &sbox_1word($tmpw);
281 $code.=<<___;
282         eor     $word2,$word2,$tmpw
283         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284         eor     $tmpw,$word0,$word1
285         eor     $wtmp2,$word2,$wtmp1
286         eor     $tmpw,$tmpw,$wtmp2
287 ___
288         &sbox_1word($tmpw);
289 $code.=<<___;
290         eor     $word3,$word3,$tmpw
291 ___
292 }
293
294 # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
295 sub sm4_4blks () {
296         my $kptr = shift;
297
298 $code.=<<___;
299         ldp     $wtmp0,$wtmp1,[$kptr],8
300         dup     $rk0.4s,$wtmp0
301         dup     $rk1.4s,$wtmp1
302
303         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304         eor     $rka.16b,@data[2].16b,@data[3].16b
305         eor     $rk0.16b,@data[1].16b,$rk0.16b
306         eor     $rk0.16b,$rka.16b,$rk0.16b
307 ___
308         &sbox($rk0);
309 $code.=<<___;
310         eor     @data[0].16b,@data[0].16b,$rk0.16b
311
312         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313         eor     $rka.16b,$rka.16b,@data[0].16b
314         eor     $rk1.16b,$rka.16b,$rk1.16b
315 ___
316         &sbox($rk1);
317 $code.=<<___;
318         ldp     $wtmp0,$wtmp1,[$kptr],8
319         eor     @data[1].16b,@data[1].16b,$rk1.16b
320
321         dup     $rk0.4s,$wtmp0
322         dup     $rk1.4s,$wtmp1
323
324         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325         eor     $rka.16b,@data[0].16b,@data[1].16b
326         eor     $rk0.16b,@data[3].16b,$rk0.16b
327         eor     $rk0.16b,$rka.16b,$rk0.16b
328 ___
329         &sbox($rk0);
330 $code.=<<___;
331         eor     @data[2].16b,@data[2].16b,$rk0.16b
332
333         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334         eor     $rka.16b,$rka.16b,@data[2].16b
335         eor     $rk1.16b,$rka.16b,$rk1.16b
336 ___
337         &sbox($rk1);
338 $code.=<<___;
339         eor     @data[3].16b,@data[3].16b,$rk1.16b
340 ___
341 }
342
343 # sm4 for 8 lanes of data, in neon registers
344 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
345 sub sm4_8blks () {
346         my $kptr = shift;
347
348 $code.=<<___;
349         ldp     $wtmp0,$wtmp1,[$kptr],8
350         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
351         dup     $rk0.4s,$wtmp0
352         eor     $rka.16b,@data[2].16b,@data[3].16b
353         eor     $rkb.16b,@datax[2].16b,@datax[3].16b
354         eor     @vtmp[0].16b,@data[1].16b,$rk0.16b
355         eor     @vtmp[1].16b,@datax[1].16b,$rk0.16b
356         eor     $rk0.16b,$rka.16b,@vtmp[0].16b
357         eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
358 ___
359         &sbox_double($rk0,$rk1);
360 $code.=<<___;
361         eor     @data[0].16b,@data[0].16b,$rk0.16b
362         eor     @datax[0].16b,@datax[0].16b,$rk1.16b
363
364         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
365         dup     $rk1.4s,$wtmp1
366         eor     $rka.16b,$rka.16b,@data[0].16b
367         eor     $rkb.16b,$rkb.16b,@datax[0].16b
368         eor     $rk0.16b,$rka.16b,$rk1.16b
369         eor     $rk1.16b,$rkb.16b,$rk1.16b
370 ___
371         &sbox_double($rk0,$rk1);
372 $code.=<<___;
373         ldp     $wtmp0,$wtmp1,[$kptr],8
374         eor     @data[1].16b,@data[1].16b,$rk0.16b
375         eor     @datax[1].16b,@datax[1].16b,$rk1.16b
376
377         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
378         dup     $rk0.4s,$wtmp0
379         eor     $rka.16b,@data[0].16b,@data[1].16b
380         eor     $rkb.16b,@datax[0].16b,@datax[1].16b
381         eor     @vtmp[0].16b,@data[3].16b,$rk0.16b
382         eor     @vtmp[1].16b,@datax[3].16b,$rk0.16b
383         eor     $rk0.16b,$rka.16b,@vtmp[0].16b
384         eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
385 ___
386         &sbox_double($rk0,$rk1);
387 $code.=<<___;
388         eor     @data[2].16b,@data[2].16b,$rk0.16b
389         eor     @datax[2].16b,@datax[2].16b,$rk1.16b
390
391         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
392         dup     $rk1.4s,$wtmp1
393         eor     $rka.16b,$rka.16b,@data[2].16b
394         eor     $rkb.16b,$rkb.16b,@datax[2].16b
395         eor     $rk0.16b,$rka.16b,$rk1.16b
396         eor     $rk1.16b,$rkb.16b,$rk1.16b
397 ___
398         &sbox_double($rk0,$rk1);
399 $code.=<<___;
400         eor     @data[3].16b,@data[3].16b,$rk0.16b
401         eor     @datax[3].16b,@datax[3].16b,$rk1.16b
402 ___
403 }
404
405 sub encrypt_1blk_norev() {
406         my $dat = shift;
407
408 $code.=<<___;
409         mov     $ptr,$rks
410         mov     $counter,#8
411         mov     $word0,$dat.s[0]
412         mov     $word1,$dat.s[1]
413         mov     $word2,$dat.s[2]
414         mov     $word3,$dat.s[3]
415 10:
416 ___
417         &sm4_1blk($ptr);
418 $code.=<<___;
419         subs    $counter,$counter,#1
420         b.ne    10b
421         mov     $dat.s[0],$word3
422         mov     $dat.s[1],$word2
423         mov     $dat.s[2],$word1
424         mov     $dat.s[3],$word0
425 ___
426 }
427
428 sub encrypt_1blk() {
429         my $dat = shift;
430
431         &encrypt_1blk_norev($dat);
432         &rev32($dat,$dat);
433 }
434
435 sub encrypt_4blks() {
436 $code.=<<___;
437         mov     $ptr,$rks
438         mov     $counter,#8
439 10:
440 ___
441         &sm4_4blks($ptr);
442 $code.=<<___;
443         subs    $counter,$counter,#1
444         b.ne    10b
445 ___
446         &rev32(@vtmp[3],@data[0]);
447         &rev32(@vtmp[2],@data[1]);
448         &rev32(@vtmp[1],@data[2]);
449         &rev32(@vtmp[0],@data[3]);
450 }
451
452 sub encrypt_8blks() {
453 $code.=<<___;
454         mov     $ptr,$rks
455         mov     $counter,#8
456 10:
457 ___
458         &sm4_8blks($ptr);
459 $code.=<<___;
460         subs    $counter,$counter,#1
461         b.ne    10b
462 ___
463         &rev32(@vtmp[3],@data[0]);
464         &rev32(@vtmp[2],@data[1]);
465         &rev32(@vtmp[1],@data[2]);
466         &rev32(@vtmp[0],@data[3]);
467         &rev32(@data[3],@datax[0]);
468         &rev32(@data[2],@datax[1]);
469         &rev32(@data[1],@datax[2]);
470         &rev32(@data[0],@datax[3]);
471 }
472
473 sub load_sbox () {
474         my $data = shift;
475
476 $code.=<<___;
477         adr     $ptr,.Lsbox
478         ld1     {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
479         ld1     {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
480         ld1     {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
481         ld1     {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
482 ___
483 }
484
485
486 sub mov_reg_to_vec() {
487         my $src0 = shift;
488         my $src1 = shift;
489         my $desv = shift;
490 $code.=<<___;
491         mov $desv.d[0],$src0
492         mov $desv.d[1],$src1
493 ___
494         &rev32_armeb($desv,$desv);
495 }
496
497 sub mov_vec_to_reg() {
498         my $srcv = shift;
499         my $des0 = shift;
500         my $des1 = shift;
501 $code.=<<___;
502         mov $des0,$srcv.d[0]
503         mov $des1,$srcv.d[1]
504 ___
505 }
506
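# compute_tweak multiplies the 128-bit XTS tweak (held as the register pair
# src1:src0) by x in GF(2^128) using the XTS reduction polynomial
# x^128 + x^7 + x^2 + x + 1:
#   des = src << 1;  if (msb(src)) des ^= 0x87;
# compute_tweak_vec does the same on a vector register; for the "_gb"
# variant the multiplication is performed on the bit-reversed tweak (the
# rbit calls), which implements that mode's alternative bit ordering.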
507 sub compute_tweak() {
508         my $src0 = shift;
509         my $src1 = shift;
510         my $des0 = shift;
511         my $des1 = shift;
512 $code.=<<___;
513         mov $wtmp0,0x87
514         extr    $xtmp2,$src1,$src1,#32
515         extr    $des1,$src1,$src0,#63
516         and     $wtmp1,$wtmp0,$wtmp2,asr#31
517         eor     $des0,$xtmp1,$src0,lsl#1
518 ___
519 }
520
521 sub compute_tweak_vec() {
522         my $src = shift;
523         my $des = shift;
524         my $std = shift;
525         &rbit(@vtmp[2],$src,$std);
526 $code.=<<___;
527         ldr  @qtmp[0], .Lxts_magic
528         shl  $des.16b, @vtmp[2].16b, #1
529         ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
530         ushr @vtmp[1].16b, @vtmp[1].16b, #7
531         mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
532         eor  $des.16b, $des.16b, @vtmp[1].16b
533 ___
534         &rbit($des,$des,$std);
535 }
536
537 $code=<<___;
538 #include "arm_arch.h"
539 .arch   armv8-a
540 .text
541
542 .type   _vpsm4_consts,%object
543 .align  7
544 _vpsm4_consts:
545 .Lsbox:
546         .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547         .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548         .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549         .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550         .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551         .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552         .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553         .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554         .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555         .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556         .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557         .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558         .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559         .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560         .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561         .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
562 .Lck:
563         .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564         .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565         .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566         .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567         .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568         .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569         .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570         .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
571 .Lfk:
572         .quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
573 .Lshuffles:
574         .quad 0x0B0A090807060504,0x030201000F0E0D0C
575 .Lxts_magic:
576         .quad 0x0101010101010187,0x0101010101010101
577
578 .size   _vpsm4_consts,.-_vpsm4_consts
579 ___
580
581 {{{
582 my ($key,$keys,$enc)=("x0","x1","w2");
583 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
584 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
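# Key schedule: K0..K3 = MK ^ FK, then for i in 0..31
#   rk[i] = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i]),
# where T' substitutes through the S-box and applies
#   L'(B) = B ^ (B <<< 13) ^ (B <<< 23)   (the "ror #19"/"ror #9" below).
# The .Lshuffles tbl rotates the four K words so the window slides each
# iteration.  For decryption the same keys are simply stored in reverse
# order: $keys starts at byte offset 124 and the store pointer walks backwards.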
585 $code.=<<___;
586 .type   _vpsm4_set_key,%function
587 .align  4
588 _vpsm4_set_key:
589         AARCH64_VALID_CALL_TARGET
590         ld1     {$vkey.4s},[$key]
591 ___
592         &load_sbox();
593         &rev32($vkey,$vkey);
594 $code.=<<___;
595         adr     $pointer,.Lshuffles
596         ld1     {$vmap.2d},[$pointer]
597         adr     $pointer,.Lfk
598         ld1     {$vfk.2d},[$pointer]
599         eor     $vkey.16b,$vkey.16b,$vfk.16b
600         mov     $schedules,#32
601         adr     $pointer,.Lck
602         movi    @vtmp[0].16b,#64
603         cbnz    $enc,1f
604         add     $keys,$keys,124
605 1:
606         mov     $wtmp,$vkey.s[1]
607         ldr     $roundkey,[$pointer],#4
608         eor     $roundkey,$roundkey,$wtmp
609         mov     $wtmp,$vkey.s[2]
610         eor     $roundkey,$roundkey,$wtmp
611         mov     $wtmp,$vkey.s[3]
612         eor     $roundkey,$roundkey,$wtmp
613         // sbox lookup
614         mov     @data[0].s[0],$roundkey
615         tbl     @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
616         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
617         tbx     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
618         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
619         tbx     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
620         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
621         tbx     @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
622         mov     $wtmp,@vtmp[1].s[0]
623         eor     $roundkey,$wtmp,$wtmp,ror #19
624         eor     $roundkey,$roundkey,$wtmp,ror #9
625         mov     $wtmp,$vkey.s[0]
626         eor     $roundkey,$roundkey,$wtmp
627         mov     $vkey.s[0],$roundkey
628         cbz     $enc,2f
629         str     $roundkey,[$keys],#4
630         b       3f
631 2:
632         str     $roundkey,[$keys],#-4
633 3:
634         tbl     $vkey.16b,{$vkey.16b},$vmap.16b
635         subs    $schedules,$schedules,#1
636         b.ne    1b
637         ret
638 .size   _vpsm4_set_key,.-_vpsm4_set_key
639 ___
640 }}}
641
642
643 {{{
644 $code.=<<___;
645 .type   _vpsm4_enc_4blks,%function
646 .align  4
647 _vpsm4_enc_4blks:
648         AARCH64_VALID_CALL_TARGET
649 ___
650         &encrypt_4blks();
651 $code.=<<___;
652         ret
653 .size   _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
654 ___
655 }}}
656
657 {{{
658 $code.=<<___;
659 .type   _vpsm4_enc_8blks,%function
660 .align  4
661 _vpsm4_enc_8blks:
662         AARCH64_VALID_CALL_TARGET
663 ___
664         &encrypt_8blks();
665 $code.=<<___;
666         ret
667 .size   _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
668 ___
669 }}}
670
671
672 {{{
673 my ($key,$keys)=("x0","x1");
674 $code.=<<___;
675 .globl  ${prefix}_set_encrypt_key
676 .type   ${prefix}_set_encrypt_key,%function
677 .align  5
678 ${prefix}_set_encrypt_key:
679         AARCH64_SIGN_LINK_REGISTER
680         stp     x29,x30,[sp,#-16]!
681         mov     w2,1
682         bl      _vpsm4_set_key
683         ldp     x29,x30,[sp],#16
684         AARCH64_VALIDATE_LINK_REGISTER
685         ret
686 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
687 ___
688 }}}
689
690 {{{
691 my ($key,$keys)=("x0","x1");
692 $code.=<<___;
693 .globl  ${prefix}_set_decrypt_key
694 .type   ${prefix}_set_decrypt_key,%function
695 .align  5
696 ${prefix}_set_decrypt_key:
697         AARCH64_SIGN_LINK_REGISTER
698         stp     x29,x30,[sp,#-16]!
699         mov     w2,0
700         bl      _vpsm4_set_key
701         ldp     x29,x30,[sp],#16
702         AARCH64_VALIDATE_LINK_REGISTER
703         ret
704 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
705 ___
706 }}}
707
708 {{{
709 sub gen_block () {
710         my $dir = shift;
711         my ($inp,$outp,$rk)=map("x$_",(0..2));
712
713 $code.=<<___;
714 .globl  ${prefix}_${dir}crypt
715 .type   ${prefix}_${dir}crypt,%function
716 .align  5
717 ${prefix}_${dir}crypt:
718         AARCH64_VALID_CALL_TARGET
719         ld1     {@data[0].4s},[$inp]
720 ___
721         &load_sbox();
722         &rev32(@data[0],@data[0]);
723 $code.=<<___;
724         mov     $rks,x2
725 ___
726         &encrypt_1blk(@data[0]);
727 $code.=<<___;
728         st1     {@data[0].4s},[$outp]
729         ret
730 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
731 ___
732 }
733 &gen_block("en");
734 &gen_block("de");
735 }}}
736
737 {{{
738 my ($enc) = ("w4");
739 my @dat=map("v$_",(16..23));
740
741 $code.=<<___;
742 .globl  ${prefix}_ecb_encrypt
743 .type   ${prefix}_ecb_encrypt,%function
744 .align  5
745 ${prefix}_ecb_encrypt:
746         AARCH64_SIGN_LINK_REGISTER
747         // convert length into blocks
748         lsr     x2,x2,4
749         stp     d8,d9,[sp,#-80]!
750         stp     d10,d11,[sp,#16]
751         stp     d12,d13,[sp,#32]
752         stp     d14,d15,[sp,#48]
753         stp     x29,x30,[sp,#64]
754 ___
755         &load_sbox();
756 $code.=<<___;
757 .Lecb_8_blocks_process:
758         cmp     $blocks,#8
759         b.lt    .Lecb_4_blocks_process
760         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
761         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
762 ___
763         &rev32(@data[0],@data[0]);
764         &rev32(@data[1],@data[1]);
765         &rev32(@data[2],@data[2]);
766         &rev32(@data[3],@data[3]);
767         &rev32(@datax[0],@datax[0]);
768         &rev32(@datax[1],@datax[1]);
769         &rev32(@datax[2],@datax[2]);
770         &rev32(@datax[3],@datax[3]);
771 $code.=<<___;
772         bl      _vpsm4_enc_8blks
773         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
774         st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
775         subs    $blocks,$blocks,#8
776         b.gt    .Lecb_8_blocks_process
777         b       100f
778 .Lecb_4_blocks_process:
779         cmp     $blocks,#4
780         b.lt    1f
781         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
782 ___
783         &rev32(@data[0],@data[0]);
784         &rev32(@data[1],@data[1]);
785         &rev32(@data[2],@data[2]);
786         &rev32(@data[3],@data[3]);
787 $code.=<<___;
788         bl      _vpsm4_enc_4blks
789         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
790         sub     $blocks,$blocks,#4
791 1:
792         // process last block
793         cmp     $blocks,#1
794         b.lt    100f
795         b.gt    1f
796         ld1     {@data[0].4s},[$inp]
797 ___
798         &rev32(@data[0],@data[0]);
799         &encrypt_1blk(@data[0]);
800 $code.=<<___;
801         st1     {@data[0].4s},[$outp]
802         b       100f
803 1:      // process last 2 blocks
804         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
805         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
806         cmp     $blocks,#2
807         b.gt    1f
808 ___
809         &rev32(@data[0],@data[0]);
810         &rev32(@data[1],@data[1]);
811         &rev32(@data[2],@data[2]);
812         &rev32(@data[3],@data[3]);
813 $code.=<<___;
814         bl      _vpsm4_enc_4blks
815         st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
816         st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
817         b       100f
818 1:      // process last 3 blocks
819         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
820 ___
821         &rev32(@data[0],@data[0]);
822         &rev32(@data[1],@data[1]);
823         &rev32(@data[2],@data[2]);
824         &rev32(@data[3],@data[3]);
825 $code.=<<___;
826         bl      _vpsm4_enc_4blks
827         st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
828         st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
829         st4     {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
830 100:
831         ldp     d10,d11,[sp,#16]
832         ldp     d12,d13,[sp,#32]
833         ldp     d14,d15,[sp,#48]
834         ldp     x29,x30,[sp,#64]
835         ldp     d8,d9,[sp],#80
836         AARCH64_VALIDATE_LINK_REGISTER
837         ret
838 .size   ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
839 ___
840 }}}
841
842 {{{
843 my ($len,$ivp,$enc)=("x2","x4","w5");
844 my $ivec0=("v3");
845 my $ivec1=("v15");
846
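# CBC encryption is serial by nature (each ciphertext block is chained into
# the next), so it is done one block at a time below; CBC decryption has no
# such dependency and is parallelised 8 (or 4) blocks at a time.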
847 $code.=<<___;
848 .globl  ${prefix}_cbc_encrypt
849 .type   ${prefix}_cbc_encrypt,%function
850 .align  5
851 ${prefix}_cbc_encrypt:
852         AARCH64_VALID_CALL_TARGET
853         lsr     $len,$len,4
854 ___
855         &load_sbox();
856 $code.=<<___;
857         cbz     $enc,.Ldec
858         ld1     {$ivec0.4s},[$ivp]
859 .Lcbc_4_blocks_enc:
860         cmp     $blocks,#4
861         b.lt    1f
862         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
863         eor     @data[0].16b,@data[0].16b,$ivec0.16b
864 ___
865         &rev32(@data[0],@data[0]);
866         &rev32(@data[1],@data[1]);
867         &rev32(@data[2],@data[2]);
868         &rev32(@data[3],@data[3]);
869         &encrypt_1blk_norev(@data[0]);
870 $code.=<<___;
871         eor     @data[1].16b,@data[1].16b,@data[0].16b
872 ___
873         &encrypt_1blk_norev(@data[1]);
874         &rev32(@data[0],@data[0]);
875
876 $code.=<<___;
877         eor     @data[2].16b,@data[2].16b,@data[1].16b
878 ___
879         &encrypt_1blk_norev(@data[2]);
880         &rev32(@data[1],@data[1]);
881 $code.=<<___;
882         eor     @data[3].16b,@data[3].16b,@data[2].16b
883 ___
884         &encrypt_1blk_norev(@data[3]);
885         &rev32(@data[2],@data[2]);
886         &rev32(@data[3],@data[3]);
887 $code.=<<___;
888         orr     $ivec0.16b,@data[3].16b,@data[3].16b
889         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
890         subs    $blocks,$blocks,#4
891         b.ne    .Lcbc_4_blocks_enc
892         b       2f
893 1:
894         subs    $blocks,$blocks,#1
895         b.lt    2f
896         ld1     {@data[0].4s},[$inp],#16
897         eor     $ivec0.16b,$ivec0.16b,@data[0].16b
898 ___
899         &rev32($ivec0,$ivec0);
900         &encrypt_1blk($ivec0);
901 $code.=<<___;
902         st1     {$ivec0.4s},[$outp],#16
903         b       1b
904 2:
905         // save back IV
906         st1     {$ivec0.4s},[$ivp]
907         ret
908
909 .Ldec:
910         // decryption mode starts
911         AARCH64_SIGN_LINK_REGISTER
912         stp     d8,d9,[sp,#-80]!
913         stp     d10,d11,[sp,#16]
914         stp     d12,d13,[sp,#32]
915         stp     d14,d15,[sp,#48]
916         stp     x29,x30,[sp,#64]
917 .Lcbc_8_blocks_dec:
918         cmp     $blocks,#8
919         b.lt    1f
920         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
921         add     $ptr,$inp,#64
922         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
923 ___
924         &rev32(@data[0],@data[0]);
925         &rev32(@data[1],@data[1]);
926         &rev32(@data[2],@data[2]);
927         &rev32(@data[3],@data[3]);
928         &rev32(@datax[0],@datax[0]);
929         &rev32(@datax[1],@datax[1]);
930         &rev32(@datax[2],@datax[2]);
931         &rev32(@datax[3],@datax[3]);
932 $code.=<<___;
933         bl      _vpsm4_enc_8blks
934 ___
935         &transpose(@vtmp,@datax);
936         &transpose(@data,@datax);
937 $code.=<<___;
938         ld1     {$ivec1.4s},[$ivp]
939         ld1     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
940         // note that ivec1 and vtmpx[3] reuse the same register,
941         // so care needs to be taken to avoid a conflict
942         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
943         ld1     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
944         eor     @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
945         eor     @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
946         eor     @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
947         // save back IV
948         st1     {@vtmpx[3].4s}, [$ivp]
949         eor     @data[0].16b,@data[0].16b,@datax[3].16b
950         eor     @data[1].16b,@data[1].16b,@vtmpx[0].16b
951         eor     @data[2].16b,@data[2].16b,@vtmpx[1].16b
952         eor     @data[3].16b,@data[3].16b,@vtmpx[2].16b
953         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
954         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
955         subs    $blocks,$blocks,#8
956         b.gt    .Lcbc_8_blocks_dec
957         b.eq    100f
958 1:
959         ld1     {$ivec1.4s},[$ivp]
960 .Lcbc_4_blocks_dec:
961         cmp     $blocks,#4
962         b.lt    1f
963         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
964 ___
965         &rev32(@data[0],@data[0]);
966         &rev32(@data[1],@data[1]);
967         &rev32(@data[2],@data[2]);
968         &rev32(@data[3],@data[3]);
969 $code.=<<___;
970         bl      _vpsm4_enc_4blks
971         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
972 ___
973         &transpose(@vtmp,@datax);
974 $code.=<<___;
975         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
976         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
977         orr     $ivec1.16b,@data[3].16b,@data[3].16b
978         eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
979         eor     @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
980         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
981         subs    $blocks,$blocks,#4
982         b.gt    .Lcbc_4_blocks_dec
983         // save back IV
984         st1     {@data[3].4s}, [$ivp]
985         b       100f
986 1:      // last block
987         subs    $blocks,$blocks,#1
988         b.lt    100f
989         b.gt    1f
990         ld1     {@data[0].4s},[$inp],#16
991         // save back IV
992         st1     {@data[0].4s}, [$ivp]
993 ___
994         &rev32(@datax[0],@data[0]);
995         &encrypt_1blk(@datax[0]);
996 $code.=<<___;
997         eor     @datax[0].16b,@datax[0].16b,$ivec1.16b
998         st1     {@datax[0].4s},[$outp],#16
999         b       100f
1000 1:      // last two blocks
1001         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1002         add     $ptr,$inp,#16
1003         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1004         subs    $blocks,$blocks,1
1005         b.gt    1f
1006 ___
1007         &rev32(@data[0],@data[0]);
1008         &rev32(@data[1],@data[1]);
1009         &rev32(@data[2],@data[2]);
1010         &rev32(@data[3],@data[3]);
1011 $code.=<<___;
1012         bl      _vpsm4_enc_4blks
1013         ld1     {@data[0].4s,@data[1].4s},[$inp],#32
1014 ___
1015         &transpose(@vtmp,@datax);
1016 $code.=<<___;
1017         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1018         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1019         st1     {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1020         // save back IV
1021         st1     {@data[1].4s}, [$ivp]
1022         b       100f
1023 1:      // last 3 blocks
1024         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1025 ___
1026         &rev32(@data[0],@data[0]);
1027         &rev32(@data[1],@data[1]);
1028         &rev32(@data[2],@data[2]);
1029         &rev32(@data[3],@data[3]);
1030 $code.=<<___;
1031         bl      _vpsm4_enc_4blks
1032         ld1     {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1033 ___
1034         &transpose(@vtmp,@datax);
1035 $code.=<<___;
1036         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1037         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1038         eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1039         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1040         // save back IV
1041         st1     {@data[2].4s}, [$ivp]
1042 100:
1043         ldp     d10,d11,[sp,#16]
1044         ldp     d12,d13,[sp,#32]
1045         ldp     d14,d15,[sp,#48]
1046         ldp     x29,x30,[sp,#64]
1047         ldp     d8,d9,[sp],#80
1048         AARCH64_VALIDATE_LINK_REGISTER
1049         ret
1050 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1051 ___
1052 }}}
1053
1054 {{{
1055 my ($ivp)=("x4");
1056 my ($ctr)=("w5");
1057 my $ivec=("v3");
1058
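# The counter block lives in $ivec (and $word0-$word2/$ctr once unpacked);
# only its last 32-bit word ($ctr) is incremented between blocks, as the
# ctr32 name implies.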
1059 $code.=<<___;
1060 .globl  ${prefix}_ctr32_encrypt_blocks
1061 .type   ${prefix}_ctr32_encrypt_blocks,%function
1062 .align  5
1063 ${prefix}_ctr32_encrypt_blocks:
1064         AARCH64_VALID_CALL_TARGET
1065         ld1     {$ivec.4s},[$ivp]
1066 ___
1067         &rev32($ivec,$ivec);
1068         &load_sbox();
1069 $code.=<<___;
1070         cmp     $blocks,#1
1071         b.ne    1f
1072         // fast path for a single block, avoiding the
1073         // context-saving overhead
1074 ___
1075         &encrypt_1blk($ivec);
1076 $code.=<<___;
1077         ld1     {@data[0].4s},[$inp]
1078         eor     @data[0].16b,@data[0].16b,$ivec.16b
1079         st1     {@data[0].4s},[$outp]
1080         ret
1081 1:
1082         AARCH64_SIGN_LINK_REGISTER
1083         stp     d8,d9,[sp,#-80]!
1084         stp     d10,d11,[sp,#16]
1085         stp     d12,d13,[sp,#32]
1086         stp     d14,d15,[sp,#48]
1087         stp     x29,x30,[sp,#64]
1088         mov     $word0,$ivec.s[0]
1089         mov     $word1,$ivec.s[1]
1090         mov     $word2,$ivec.s[2]
1091         mov     $ctr,$ivec.s[3]
1092 .Lctr32_4_blocks_process:
1093         cmp     $blocks,#4
1094         b.lt    1f
1095         dup     @data[0].4s,$word0
1096         dup     @data[1].4s,$word1
1097         dup     @data[2].4s,$word2
1098         mov     @data[3].s[0],$ctr
1099         add     $ctr,$ctr,#1
1100         mov     @data[3].s[1],$ctr
1101         add     $ctr,$ctr,#1
1102         mov     @data[3].s[2],$ctr
1103         add     $ctr,$ctr,#1
1104         mov     @data[3].s[3],$ctr
1105         add     $ctr,$ctr,#1
1106         cmp     $blocks,#8
1107         b.ge    .Lctr32_8_blocks_process
1108         bl      _vpsm4_enc_4blks
1109         ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1110         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1111         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1112         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1113         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1114         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1115         subs    $blocks,$blocks,#4
1116         b.ne    .Lctr32_4_blocks_process
1117         b       100f
1118 .Lctr32_8_blocks_process:
1119         dup     @datax[0].4s,$word0
1120         dup     @datax[1].4s,$word1
1121         dup     @datax[2].4s,$word2
1122         mov     @datax[3].s[0],$ctr
1123         add     $ctr,$ctr,#1
1124         mov     @datax[3].s[1],$ctr
1125         add     $ctr,$ctr,#1
1126         mov     @datax[3].s[2],$ctr
1127         add     $ctr,$ctr,#1
1128         mov     @datax[3].s[3],$ctr
1129         add     $ctr,$ctr,#1
1130         bl      _vpsm4_enc_8blks
1131         ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1132         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1133         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1134         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1135         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1136         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1137         eor     @data[0].16b,@data[0].16b,@datax[0].16b
1138         eor     @data[1].16b,@data[1].16b,@datax[1].16b
1139         eor     @data[2].16b,@data[2].16b,@datax[2].16b
1140         eor     @data[3].16b,@data[3].16b,@datax[3].16b
1141         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1142         st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1143         subs    $blocks,$blocks,#8
1144         b.ne    .Lctr32_4_blocks_process
1145         b       100f
1146 1:      // last block processing
1147         subs    $blocks,$blocks,#1
1148         b.lt    100f
1149         b.gt    1f
1150         mov     $ivec.s[0],$word0
1151         mov     $ivec.s[1],$word1
1152         mov     $ivec.s[2],$word2
1153         mov     $ivec.s[3],$ctr
1154 ___
1155         &encrypt_1blk($ivec);
1156 $code.=<<___;
1157         ld1     {@data[0].4s},[$inp]
1158         eor     @data[0].16b,@data[0].16b,$ivec.16b
1159         st1     {@data[0].4s},[$outp]
1160         b       100f
1161 1:      // last 2 blocks processing
1162         dup     @data[0].4s,$word0
1163         dup     @data[1].4s,$word1
1164         dup     @data[2].4s,$word2
1165         mov     @data[3].s[0],$ctr
1166         add     $ctr,$ctr,#1
1167         mov     @data[3].s[1],$ctr
1168         subs    $blocks,$blocks,#1
1169         b.ne    1f
1170         bl      _vpsm4_enc_4blks
1171         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1172         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1173         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1174         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1175         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1176         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1177         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1178         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1179         b       100f
1180 1:      // last 3 blocks processing
1181         add     $ctr,$ctr,#1
1182         mov     @data[3].s[2],$ctr
1183         bl      _vpsm4_enc_4blks
1184         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1185         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1186         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1187         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1188         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1189         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1190         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1191         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1192         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1193         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1194 100:
1195         ldp     d10,d11,[sp,#16]
1196         ldp     d12,d13,[sp,#32]
1197         ldp     d14,d15,[sp,#48]
1198         ldp     x29,x30,[sp,#64]
1199         ldp     d8,d9,[sp],#80
1200         AARCH64_VALIDATE_LINK_REGISTER
1201         ret
1202 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1203 ___
1204 }}}
1205
1206 {{{
1207 my ($blocks,$len)=("x2","x2");
1208 my $ivp=("x5");
1209 my @twx=map("x$_",(12..27));
1210 my ($rks1,$rks2)=("x26","x27");
1211 my $lastBlk=("x26");
1212 my $enc=("w28");
1213 my $remain=("x29");
1214
1215 my @tweak=@datax;
1216
1217 sub gen_xts_cipher() {
1218         my $std = shift;
1219 $code.=<<___;
1220 .globl  ${prefix}_xts_encrypt${std}
1221 .type   ${prefix}_xts_encrypt${std},%function
1222 .align  5
1223 ${prefix}_xts_encrypt${std}:
1224         AARCH64_SIGN_LINK_REGISTER
1225         stp     x15, x16, [sp, #-0x10]!
1226         stp     x17, x18, [sp, #-0x10]!
1227         stp     x19, x20, [sp, #-0x10]!
1228         stp     x21, x22, [sp, #-0x10]!
1229         stp     x23, x24, [sp, #-0x10]!
1230         stp     x25, x26, [sp, #-0x10]!
1231         stp     x27, x28, [sp, #-0x10]!
1232         stp     x29, x30, [sp, #-0x10]!
1233         stp     d8, d9, [sp, #-0x10]!
1234         stp     d10, d11, [sp, #-0x10]!
1235         stp     d12, d13, [sp, #-0x10]!
1236         stp     d14, d15, [sp, #-0x10]!
1237         mov     $rks1,x3
1238         mov     $rks2,x4
1239         mov     $enc,w6
1240         ld1     {@tweak[0].4s}, [$ivp]
1241         mov     $rks,$rks2
1242 ___
1243         &load_sbox();
1244         &rev32(@tweak[0],@tweak[0]);
1245         &encrypt_1blk(@tweak[0]);
1246 $code.=<<___;
1247         mov     $rks,$rks1
1248         and     $remain,$len,#0x0F
1249         // convert length into blocks
1250         lsr     $blocks,$len,4
1251         cmp     $blocks,#1
1252         b.lt .return${std}
1253
1254         cmp $remain,0
1255         // If the encryption/decryption length is a multiple of 16,
1256         // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1257         b.eq .xts_encrypt_blocks${std}
1258
1259         // If the encryption/decryption length is not a multiple of 16,
1260         // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
1261         // and the remaining blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1262         subs $blocks,$blocks,#1
1263         b.eq .only_2blks_tweak${std}
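        // Any trailing partial block is handled with ciphertext stealing:
        // the last full block is processed first and its tail is then
        // swapped with the remaining input bytes in .process_last_2blks${std}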
1264 .xts_encrypt_blocks${std}:
1265 ___
1266         &rbit(@tweak[0],@tweak[0],$std);
1267         &rev32_armeb(@tweak[0],@tweak[0]);
1268         &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1269         &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1270         &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1271         &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1272         &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1273         &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1274         &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1275         &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1276 $code.=<<___;
1277 .Lxts_8_blocks_process${std}:
1278         cmp     $blocks,#8
1279         b.lt    .Lxts_4_blocks_process${std}
1280 ___
1281         &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1282         &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1283         &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1284         &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1285         &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1286         &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1287         &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1288         &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1289 $code.=<<___;
1290         ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1291 ___
1292         &rbit(@vtmp[0],@vtmp[0],$std);
1293         &rbit(@vtmp[1],@vtmp[1],$std);
1294         &rbit(@vtmp[2],@vtmp[2],$std);
1295         &rbit(@vtmp[3],@vtmp[3],$std);
1296 $code.=<<___;
1297         eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1298         eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1299         eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1300         eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1301         ld1     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1302 ___
1303         &rbit(@vtmpx[0],@vtmpx[0],$std);
1304         &rbit(@vtmpx[1],@vtmpx[1],$std);
1305         &rbit(@vtmpx[2],@vtmpx[2],$std);
1306         &rbit(@vtmpx[3],@vtmpx[3],$std);
1307 $code.=<<___;
1308         eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1309         eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1310         eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1311         eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1312 ___
1313         &rev32(@data[0],@data[0]);
1314         &rev32(@data[1],@data[1]);
1315         &rev32(@data[2],@data[2]);
1316         &rev32(@data[3],@data[3]);
1317         &rev32(@datax[0],@datax[0]);
1318         &rev32(@datax[1],@datax[1]);
1319         &rev32(@datax[2],@datax[2]);
1320         &rev32(@datax[3],@datax[3]);
1321         &transpose(@data,@vtmp);
1322         &transpose(@datax,@vtmp);
1323 $code.=<<___;
1324         bl      _${prefix}_enc_8blks
1325 ___
1326         &transpose(@vtmp,@datax);
1327         &transpose(@data,@datax);
1328
1329         &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1330         &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1331         &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1332         &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1333         &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1334         &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1335         &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1336         &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1337         &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1338         &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1339         &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1340         &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1341         &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1342         &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1343         &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1344         &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1345 $code.=<<___;
1346         eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1347         eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1348         eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1349         eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1350         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1351         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1352         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1353         eor @data[3].16b, @data[3].16b, @tweak[3].16b
1354
1355         // save the last tweak
1356         st1     {@tweak[3].4s},[$ivp]
1357         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1358         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1359         subs    $blocks,$blocks,#8
1360         b.gt    .Lxts_8_blocks_process${std}
1361         b       100f
1362 .Lxts_4_blocks_process${std}:
1363 ___
1364         &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1365         &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1366         &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1367         &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1368 $code.=<<___;
1369         cmp     $blocks,#4
1370         b.lt    1f
1371         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1372 ___
1373         &rbit(@tweak[0],@tweak[0],$std);
1374         &rbit(@tweak[1],@tweak[1],$std);
1375         &rbit(@tweak[2],@tweak[2],$std);
1376         &rbit(@tweak[3],@tweak[3],$std);
1377 $code.=<<___;
1378         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1379         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1380         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1381         eor @data[3].16b, @data[3].16b, @tweak[3].16b
1382 ___
1383         &rev32(@data[0],@data[0]);
1384         &rev32(@data[1],@data[1]);
1385         &rev32(@data[2],@data[2]);
1386         &rev32(@data[3],@data[3]);
1387         &transpose(@data,@vtmp);
1388 $code.=<<___;
1389         bl      _${prefix}_enc_4blks
1390 ___
1391         &transpose(@vtmp,@data);
1392 $code.=<<___;
1393         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1394         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1395         eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1396         eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1397         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1398         sub     $blocks,$blocks,#4
1399 ___
1400         &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1401         &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1402         &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1403 $code.=<<___;
1404         // save the last tweak
1405         st1     {@tweak[3].4s},[$ivp]
1406 1:
1407         // process last block
1408         cmp     $blocks,#1
1409         b.lt    100f
1410         b.gt    1f
1411         ld1     {@data[0].4s},[$inp],#16
1412 ___
1413         &rbit(@tweak[0],@tweak[0],$std);
1414 $code.=<<___;
1415         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1416 ___
1417         &rev32(@data[0],@data[0]);
1418         &encrypt_1blk(@data[0]);
1419 $code.=<<___;
1420         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1421         st1     {@data[0].4s},[$outp],#16
1422         // save the last tweak
1423         st1     {@tweak[0].4s},[$ivp]
1424         b       100f
1425 1:  // process last 2 blocks
1426         cmp     $blocks,#2
1427         b.gt    1f
1428         ld1     {@data[0].4s,@data[1].4s},[$inp],#32
1429 ___
1430         &rbit(@tweak[0],@tweak[0],$std);
1431         &rbit(@tweak[1],@tweak[1],$std);
1432 $code.=<<___;
1433         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1434         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1435 ___
1436         &rev32(@data[0],@data[0]);
1437         &rev32(@data[1],@data[1]);
1438         &transpose(@data,@vtmp);
1439 $code.=<<___;
1440         bl      _${prefix}_enc_4blks
1441 ___
1442         &transpose(@vtmp,@data);
1443 $code.=<<___;
1444         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1445         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1446         st1     {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1447         // save the last tweak
1448         st1     {@tweak[1].4s},[$ivp]
1449         b       100f
1450 1:  // process last 3 blocks
1451         ld1     {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1452 ___
1453         &rbit(@tweak[0],@tweak[0],$std);
1454         &rbit(@tweak[1],@tweak[1],$std);
1455         &rbit(@tweak[2],@tweak[2],$std);
1456 $code.=<<___;
1457         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1458         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1459         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1460 ___
1461         &rev32(@data[0],@data[0]);
1462         &rev32(@data[1],@data[1]);
1463         &rev32(@data[2],@data[2]);
1464         &transpose(@data,@vtmp);
1465 $code.=<<___;
1466         bl      _${prefix}_enc_4blks
1467 ___
1468         &transpose(@vtmp,@data);
1469 $code.=<<___;
1470         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1471         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1472         eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1473         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1474         // save the last tweak
1475         st1     {@tweak[2].4s},[$ivp]
1476 100:
1477         cmp $remain,0
1478         b.eq .return${std}
1479
1480 // This branch calculates the last two tweaks,
1481 // used when the encryption/decryption length is larger than 32
1482 .last_2blks_tweak${std}:
1483         ld1     {@tweak[0].4s},[$ivp]
1484 ___
1485         &rev32_armeb(@tweak[0],@tweak[0]);
1486         &compute_tweak_vec(@tweak[0],@tweak[1],$std);
1487         &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1488 $code.=<<___;
1489         b .check_dec${std}
1490
1491
1492 // This branch calculates the last two tweaks,
1493 // used when the encryption/decryption length is exactly 32 and only two tweaks are needed
1494 .only_2blks_tweak${std}:
1495         mov @tweak[1].16b,@tweak[0].16b
1496 ___
1497         &rev32_armeb(@tweak[1],@tweak[1]);
1498         &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1499 $code.=<<___;
1500         b .check_dec${std}
1501
1502
1503 // Determine whether encryption or decryption is required.
1504 // The last two tweaks need to be swapped for decryption.
1505 .check_dec${std}:
1506         // encryption:1 decryption:0
1507         cmp $enc,1
1508         b.eq .process_last_2blks${std}
1509         mov @vtmp[0].16B,@tweak[1].16b
1510         mov @tweak[1].16B,@tweak[2].16b
1511         mov @tweak[2].16B,@vtmp[0].16b
1512
1513 .process_last_2blks${std}:
1514 ___
1515         &rev32_armeb(@tweak[1],@tweak[1]);
1516         &rev32_armeb(@tweak[2],@tweak[2]);
1517 $code.=<<___;
1518         ld1     {@data[0].4s},[$inp],#16
1519         eor @data[0].16b, @data[0].16b, @tweak[1].16b
1520 ___
1521         &rev32(@data[0],@data[0]);
1522         &encrypt_1blk(@data[0]);
1523 $code.=<<___;
1524         eor @data[0].16b, @data[0].16b, @tweak[1].16b
1525         st1     {@data[0].4s},[$outp],#16
1526
1527         sub $lastBlk,$outp,16
1528         .loop${std}:
1529                 subs $remain,$remain,1
1530                 ldrb    $wtmp0,[$lastBlk,$remain]
1531                 ldrb    $wtmp1,[$inp,$remain]
1532                 strb    $wtmp1,[$lastBlk,$remain]
1533                 strb    $wtmp0,[$outp,$remain]
1534         b.gt .loop${std}
1535         ld1             {@data[0].4s}, [$lastBlk]       
1536         eor @data[0].16b, @data[0].16b, @tweak[2].16b
1537 ___
1538         &rev32(@data[0],@data[0]);
1539         &encrypt_1blk(@data[0]);
1540 $code.=<<___;
1541         eor @data[0].16b, @data[0].16b, @tweak[2].16b
1542         st1             {@data[0].4s}, [$lastBlk]
1543 .return${std}:
1544         ldp             d14, d15, [sp], #0x10
1545         ldp             d12, d13, [sp], #0x10
1546         ldp             d10, d11, [sp], #0x10
1547         ldp             d8, d9, [sp], #0x10
1548         ldp             x29, x30, [sp], #0x10
1549         ldp             x27, x28, [sp], #0x10
1550         ldp             x25, x26, [sp], #0x10
1551         ldp             x23, x24, [sp], #0x10
1552         ldp             x21, x22, [sp], #0x10
1553         ldp             x19, x20, [sp], #0x10
1554         ldp             x17, x18, [sp], #0x10
1555         ldp             x15, x16, [sp], #0x10
1556         AARCH64_VALIDATE_LINK_REGISTER
1557         ret
1558 .size   ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1559 ___
1560 } # end of gen_xts_cipher
1561 &gen_xts_cipher("_gb");
1562 &gen_xts_cipher("");
1563 }}}
1564 ########################################
1565 open SELF,$0;
1566 while(<SELF>) {
1567         next if (/^#!/);
1568         last if (!s/^#/\/\// and !/^$/);
1569         print;
1570 }
1571 close SELF;
1572
1573 foreach(split("\n",$code)) {
1574         s/\`([^\`]*)\`/eval($1)/ge;
1575         print $_,"\n";
1576 }
1577
1578 close STDOUT or die "error closing STDOUT: $!";