1 #! /usr/bin/env perl
2 # Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the Apache License 2.0 (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # This module implements SM4 with ASIMD on aarch64
11 #
12 # Feb 2022
13 #
14
15 # $output is the last argument if it looks like a file (it has an extension)
16 # $flavour is the first argument if it doesn't look like a file
17 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
18 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
19
20 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
21 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
22 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
23 die "can't locate arm-xlate.pl";
24
25 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
26     or die "can't call $xlate: $!";
27 *STDOUT=*OUT;
28
29 $prefix="vpsm4";
30 my @vtmp=map("v$_",(0..3));
31 my @qtmp=map("q$_",(0..3));
32 my @data=map("v$_",(4..7));
33 my @datax=map("v$_",(8..11));
34 my ($rk0,$rk1)=("v12","v13");
35 my ($rka,$rkb)=("v14","v15");
36 my @vtmpx=map("v$_",(12..15));
37 my @sbox=map("v$_",(16..31));
38 my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
39 my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
40 my ($xtmp1,$xtmp2)=("x8","x9");
41 my ($ptr,$counter)=("x10","w11");
42 my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");
43
44 sub rev32() {
45         my $dst = shift;
46         my $src = shift;
47
48         if ($src and ("$src" ne "$dst")) {
49 $code.=<<___;
50 #ifndef __AARCH64EB__
51         rev32   $dst.16b,$src.16b
52 #else
53         mov     $dst.16b,$src.16b
54 #endif
55 ___
56         } else {
57 $code.=<<___;
58 #ifndef __AARCH64EB__
59         rev32   $dst.16b,$dst.16b
60 #endif
61 ___
62         }
63 }
64
65 sub rev32_armeb() {
66         my $dst = shift;
67         my $src = shift;
68
69         if ($src and ("$src" ne "$dst")) {
70 $code.=<<___;
71 #ifdef __AARCH64EB__
72         rev32   $dst.16b,$src.16b
73 #else
74         mov     $dst.16b,$src.16b
75 #endif
76 ___
77         } else {
78 $code.=<<___;
79 #ifdef __AARCH64EB__
80         rev32   $dst.16b,$dst.16b
81 #endif
82 ___
83         }
84 }
85
86 sub rbit() {
87         my $dst = shift;
88         my $src = shift;
89         my $std = shift;
90
91         if ($src and ("$src" ne "$dst")) {
92                 if ($std eq "_gb") {
93 $code.=<<___;
94                         rbit $dst.16b,$src.16b
95 ___
96                 } else {
97 $code.=<<___;
98                         mov $dst.16b,$src.16b
99 ___
100                 }
101         } else {
102                 if ($std eq "_gb") {
103 $code.=<<___;
104                         rbit $dst.16b,$src.16b
105 ___
106                 }
107         }
108 }
109
110 sub transpose() {
111         my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;
112
113 $code.=<<___;
114         zip1    $vt0.4s,$dat0.4s,$dat1.4s
115         zip2    $vt1.4s,$dat0.4s,$dat1.4s
116         zip1    $vt2.4s,$dat2.4s,$dat3.4s
117         zip2    $vt3.4s,$dat2.4s,$dat3.4s
118         zip1    $dat0.2d,$vt0.2d,$vt2.2d
119         zip2    $dat1.2d,$vt0.2d,$vt2.2d
120         zip1    $dat2.2d,$vt1.2d,$vt3.2d
121         zip2    $dat3.2d,$vt1.2d,$vt3.2d
122 ___
123 }
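# Reference sketch (illustration only, never called by this script): the
# zip1/zip2 sequence in transpose() is a plain 4x4 transpose of 32-bit
# words across four vectors, i.e. element [i][j] moves to [j][i].
sub ref_transpose_4x4 {
        my @m = @_;                     # four array refs of 4 words each
        my @t;
        for my $i (0..3) {
                for my $j (0..3) {
                        $t[$j][$i] = $m[$i][$j];
                }
        }
        return @t;
}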
124
125 # sbox operations for 4 lanes of words
126 sub sbox() {
127         my $dat = shift;
128
129 $code.=<<___;
130         movi    @vtmp[0].16b,#64
131         movi    @vtmp[1].16b,#128
132         movi    @vtmp[2].16b,#192
133         sub     @vtmp[0].16b,$dat.16b,@vtmp[0].16b
134         sub     @vtmp[1].16b,$dat.16b,@vtmp[1].16b
135         sub     @vtmp[2].16b,$dat.16b,@vtmp[2].16b
136         tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
137         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
138         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
139         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
140         add     @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
141         add     @vtmp[2].2d,@vtmp[2].2d,$dat.2d
142         add     $dat.2d,@vtmp[0].2d,@vtmp[2].2d
143
144         ushr    @vtmp[0].4s,$dat.4s,32-2
145         sli     @vtmp[0].4s,$dat.4s,2
146         ushr    @vtmp[2].4s,$dat.4s,32-10
147         eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
148         sli     @vtmp[2].4s,$dat.4s,10
149         eor     @vtmp[1].16b,@vtmp[2].16b,@vtmp[1].16b
150         ushr    @vtmp[0].4s,$dat.4s,32-18
151         sli     @vtmp[0].4s,$dat.4s,18
152         ushr    @vtmp[2].4s,$dat.4s,32-24
153         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
154         sli     @vtmp[2].4s,$dat.4s,24
155         eor     $dat.16b,@vtmp[2].16b,@vtmp[1].16b
156 ___
157 }
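# Reference sketch (illustration only, never called by this script): each
# ushr+sli pair in sbox() builds a 32-bit left rotation, so after the four
# tbl lookups the code applies SM4's linear transform
#   L(B) = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24)
# to every 32-bit lane.
sub ref_rotl32 {
        my ($x,$n) = @_;
        return (($x << $n) | ($x >> (32-$n))) & 0xffffffff;
}
sub ref_sm4_L {
        my $b = shift;
        return $b ^ ref_rotl32($b,2) ^ ref_rotl32($b,10)
                  ^ ref_rotl32($b,18) ^ ref_rotl32($b,24);
}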
158
159 # sbox operation for 8 lanes of words
160 sub sbox_double() {
161         my $dat = shift;
162         my $datx = shift;
163
164 $code.=<<___;
165         movi    @vtmp[3].16b,#64
166         sub     @vtmp[0].16b,$dat.16b,@vtmp[3].16b
167         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
168         sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
169         tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
170         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
171         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
172         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
173         add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
174         add     $dat.2d,@vtmp[2].2d,$dat.2d
175         add     $dat.2d,@vtmp[1].2d,$dat.2d
176
177         sub     @vtmp[0].16b,$datx.16b,@vtmp[3].16b
178         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
179         sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
180         tbl     $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
181         tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
182         tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
183         tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
184         add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
185         add     $datx.2d,@vtmp[2].2d,$datx.2d
186         add     $datx.2d,@vtmp[1].2d,$datx.2d
187
188         ushr    @vtmp[0].4s,$dat.4s,32-2
189         sli     @vtmp[0].4s,$dat.4s,2
190         ushr    @vtmp[2].4s,$datx.4s,32-2
191         eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
192         sli     @vtmp[2].4s,$datx.4s,2
193
194         ushr    @vtmp[0].4s,$dat.4s,32-10
195         eor     @vtmp[3].16b,@vtmp[2].16b,$datx.16b
196         sli     @vtmp[0].4s,$dat.4s,10
197         ushr    @vtmp[2].4s,$datx.4s,32-10
198         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
199         sli     @vtmp[2].4s,$datx.4s,10
200
201         ushr    @vtmp[0].4s,$dat.4s,32-18
202         eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
203         sli     @vtmp[0].4s,$dat.4s,18
204         ushr    @vtmp[2].4s,$datx.4s,32-18
205         eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
206         sli     @vtmp[2].4s,$datx.4s,18
207
208         ushr    @vtmp[0].4s,$dat.4s,32-24
209         eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
210         sli     @vtmp[0].4s,$dat.4s,24
211         ushr    @vtmp[2].4s,$datx.4s,32-24
212         eor     $dat.16b,@vtmp[0].16b,@vtmp[1].16b
213         sli     @vtmp[2].4s,$datx.4s,24
214         eor     $datx.16b,@vtmp[2].16b,@vtmp[3].16b
215 ___
216 }
217
218 # sbox operation for a single word
219 sub sbox_1word () {
220         my $word = shift;
221
222 $code.=<<___;
223         movi    @vtmp[1].16b,#64
224         movi    @vtmp[2].16b,#128
225         movi    @vtmp[3].16b,#192
226         mov     @vtmp[0].s[0],$word
227
228         sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
229         sub     @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
230         sub     @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b
231
232         tbl     @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
233         tbl     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
234         tbl     @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
235         tbl     @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b
236
237         mov     $word,@vtmp[0].s[0]
238         mov     $wtmp0,@vtmp[1].s[0]
239         mov     $wtmp2,@vtmp[2].s[0]
240         add     $wtmp0,$word,$wtmp0
241         mov     $word,@vtmp[3].s[0]
242         add     $wtmp0,$wtmp0,$wtmp2
243         add     $wtmp0,$wtmp0,$word
244
245         eor     $word,$wtmp0,$wtmp0,ror #32-2
246         eor     $word,$word,$wtmp0,ror #32-10
247         eor     $word,$word,$wtmp0,ror #32-18
248         eor     $word,$word,$wtmp0,ror #32-24
249 ___
250 }
251
252 # sm4 for one block of data, in scalar registers word0/word1/word2/word3
253 sub sm4_1blk () {
254         my $kptr = shift;
255
256 $code.=<<___;
257         ldp     $wtmp0,$wtmp1,[$kptr],8
258         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
259         eor     $tmpw,$word2,$word3
260         eor     $wtmp2,$wtmp0,$word1
261         eor     $tmpw,$tmpw,$wtmp2
262 ___
263         &sbox_1word($tmpw);
264 $code.=<<___;
265         eor     $word0,$word0,$tmpw
266         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
267         eor     $tmpw,$word2,$word3
268         eor     $wtmp2,$word0,$wtmp1
269         eor     $tmpw,$tmpw,$wtmp2
270 ___
271         &sbox_1word($tmpw);
272 $code.=<<___;
273         ldp     $wtmp0,$wtmp1,[$kptr],8
274         eor     $word1,$word1,$tmpw
275         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
276         eor     $tmpw,$word0,$word1
277         eor     $wtmp2,$wtmp0,$word3
278         eor     $tmpw,$tmpw,$wtmp2
279 ___
280         &sbox_1word($tmpw);
281 $code.=<<___;
282         eor     $word2,$word2,$tmpw
283         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
284         eor     $tmpw,$word0,$word1
285         eor     $wtmp2,$word2,$wtmp1
286         eor     $tmpw,$tmpw,$wtmp2
287 ___
288         &sbox_1word($tmpw);
289 $code.=<<___;
290         eor     $word3,$word3,$tmpw
291 ___
292 }
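# Reference sketch (illustration only, never called by this script) of the
# round loop that sm4_1blk() runs four rounds at a time: with byte-swapped
# words X0..X3 and the 32 expanded round keys, each round computes
# X4 = X0 ^ T(X1 ^ X2 ^ X3 ^ rk[i]), where T is the byte-wise S-box lookup
# (the table in .Lsbox below) followed by the linear transform L; the
# ciphertext is the last four state words in reverse order, matching the
# final word swap in encrypt_1blk_norev().  The S-box table and round keys
# are passed in by the caller.
sub ref_sm4_encrypt_block {
        my ($sbox,$rk,@x) = @_;         # $sbox,$rk: array refs; @x: 4 words
        my $rotl = sub { (($_[0] << $_[1]) | ($_[0] >> (32-$_[1]))) & 0xffffffff };
        for my $i (0..31) {
                my $t = $x[1] ^ $x[2] ^ $x[3] ^ $rk->[$i];
                my $s = 0;
                for my $shift (24,16,8,0) {
                        $s = ($s << 8) | $sbox->[($t >> $shift) & 0xff];
                }
                $s = $s ^ $rotl->($s,2) ^ $rotl->($s,10)
                        ^ $rotl->($s,18) ^ $rotl->($s,24);
                @x = ($x[1],$x[2],$x[3],$x[0] ^ $s);
        }
        return reverse @x;              # (X35,X34,X33,X32)
}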
293
294 # sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
295 sub sm4_4blks () {
296         my $kptr = shift;
297
298 $code.=<<___;
299         ldp     $wtmp0,$wtmp1,[$kptr],8
300         dup     $rk0.4s,$wtmp0
301         dup     $rk1.4s,$wtmp1
302
303         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
304         eor     $rka.16b,@data[2].16b,@data[3].16b
305         eor     $rk0.16b,@data[1].16b,$rk0.16b
306         eor     $rk0.16b,$rka.16b,$rk0.16b
307 ___
308         &sbox($rk0);
309 $code.=<<___;
310         eor     @data[0].16b,@data[0].16b,$rk0.16b
311
312         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
313         eor     $rka.16b,$rka.16b,@data[0].16b
314         eor     $rk1.16b,$rka.16b,$rk1.16b
315 ___
316         &sbox($rk1);
317 $code.=<<___;
318         ldp     $wtmp0,$wtmp1,[$kptr],8
319         eor     @data[1].16b,@data[1].16b,$rk1.16b
320
321         dup     $rk0.4s,$wtmp0
322         dup     $rk1.4s,$wtmp1
323
324         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
325         eor     $rka.16b,@data[0].16b,@data[1].16b
326         eor     $rk0.16b,@data[3].16b,$rk0.16b
327         eor     $rk0.16b,$rka.16b,$rk0.16b
328 ___
329         &sbox($rk0);
330 $code.=<<___;
331         eor     @data[2].16b,@data[2].16b,$rk0.16b
332
333         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
334         eor     $rka.16b,$rka.16b,@data[2].16b
335         eor     $rk1.16b,$rka.16b,$rk1.16b
336 ___
337         &sbox($rk1);
338 $code.=<<___;
339         eor     @data[3].16b,@data[3].16b,$rk1.16b
340 ___
341 }
342
343 # sm4 for 8 lanes of data, in neon registers
344 # data0/data1/data2/data3 datax0/datax1/datax2/datax3
345 sub sm4_8blks () {
346         my $kptr = shift;
347
348 $code.=<<___;
349         ldp     $wtmp0,$wtmp1,[$kptr],8
350         // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
351         dup     $rk0.4s,$wtmp0
352         eor     $rka.16b,@data[2].16b,@data[3].16b
353         eor     $rkb.16b,@datax[2].16b,@datax[3].16b
354         eor     @vtmp[0].16b,@data[1].16b,$rk0.16b
355         eor     @vtmp[1].16b,@datax[1].16b,$rk0.16b
356         eor     $rk0.16b,$rka.16b,@vtmp[0].16b
357         eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
358 ___
359         &sbox_double($rk0,$rk1);
360 $code.=<<___;
361         eor     @data[0].16b,@data[0].16b,$rk0.16b
362         eor     @datax[0].16b,@datax[0].16b,$rk1.16b
363
364         // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
365         dup     $rk1.4s,$wtmp1
366         eor     $rka.16b,$rka.16b,@data[0].16b
367         eor     $rkb.16b,$rkb.16b,@datax[0].16b
368         eor     $rk0.16b,$rka.16b,$rk1.16b
369         eor     $rk1.16b,$rkb.16b,$rk1.16b
370 ___
371         &sbox_double($rk0,$rk1);
372 $code.=<<___;
373         ldp     $wtmp0,$wtmp1,[$kptr],8
374         eor     @data[1].16b,@data[1].16b,$rk0.16b
375         eor     @datax[1].16b,@datax[1].16b,$rk1.16b
376
377         // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
378         dup     $rk0.4s,$wtmp0
379         eor     $rka.16b,@data[0].16b,@data[1].16b
380         eor     $rkb.16b,@datax[0].16b,@datax[1].16b
381         eor     @vtmp[0].16b,@data[3].16b,$rk0.16b
382         eor     @vtmp[1].16b,@datax[3].16b,$rk0.16b
383         eor     $rk0.16b,$rka.16b,@vtmp[0].16b
384         eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
385 ___
386         &sbox_double($rk0,$rk1);
387 $code.=<<___;
388         eor     @data[2].16b,@data[2].16b,$rk0.16b
389         eor     @datax[2].16b,@datax[2].16b,$rk1.16b
390
391         // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
392         dup     $rk1.4s,$wtmp1
393         eor     $rka.16b,$rka.16b,@data[2].16b
394         eor     $rkb.16b,$rkb.16b,@datax[2].16b
395         eor     $rk0.16b,$rka.16b,$rk1.16b
396         eor     $rk1.16b,$rkb.16b,$rk1.16b
397 ___
398         &sbox_double($rk0,$rk1);
399 $code.=<<___;
400         eor     @data[3].16b,@data[3].16b,$rk0.16b
401         eor     @datax[3].16b,@datax[3].16b,$rk1.16b
402 ___
403 }
404
405 sub encrypt_1blk_norev() {
406         my $dat = shift;
407
408 $code.=<<___;
409         mov     $ptr,$rks
410         mov     $counter,#8
411         mov     $word0,$dat.s[0]
412         mov     $word1,$dat.s[1]
413         mov     $word2,$dat.s[2]
414         mov     $word3,$dat.s[3]
415 10:
416 ___
417         &sm4_1blk($ptr);
418 $code.=<<___;
419         subs    $counter,$counter,#1
420         b.ne    10b
421         mov     $dat.s[0],$word3
422         mov     $dat.s[1],$word2
423         mov     $dat.s[2],$word1
424         mov     $dat.s[3],$word0
425 ___
426 }
427
428 sub encrypt_1blk() {
429         my $dat = shift;
430
431         &encrypt_1blk_norev($dat);
432         &rev32($dat,$dat);
433 }
434
435 sub encrypt_4blks() {
436 $code.=<<___;
437         mov     $ptr,$rks
438         mov     $counter,#8
439 10:
440 ___
441         &sm4_4blks($ptr);
442 $code.=<<___;
443         subs    $counter,$counter,#1
444         b.ne    10b
445 ___
446         &rev32(@vtmp[3],@data[0]);
447         &rev32(@vtmp[2],@data[1]);
448         &rev32(@vtmp[1],@data[2]);
449         &rev32(@vtmp[0],@data[3]);
450 }
451
452 sub encrypt_8blks() {
453 $code.=<<___;
454         mov     $ptr,$rks
455         mov     $counter,#8
456 10:
457 ___
458         &sm4_8blks($ptr);
459 $code.=<<___;
460         subs    $counter,$counter,#1
461         b.ne    10b
462 ___
463         &rev32(@vtmp[3],@data[0]);
464         &rev32(@vtmp[2],@data[1]);
465         &rev32(@vtmp[1],@data[2]);
466         &rev32(@vtmp[0],@data[3]);
467         &rev32(@data[3],@datax[0]);
468         &rev32(@data[2],@datax[1]);
469         &rev32(@data[1],@datax[2]);
470         &rev32(@data[0],@datax[3]);
471 }
472
473 sub load_sbox () {
474         my $data = shift;
475
476 $code.=<<___;
477         adr     $ptr,.Lsbox
478         ld1     {@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},[$ptr],#64
479         ld1     {@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},[$ptr],#64
480         ld1     {@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},[$ptr],#64
481         ld1     {@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},[$ptr]
482 ___
483 }
484
485
486 sub mov_reg_to_vec() {
487         my $src0 = shift;
488         my $src1 = shift;
489         my $desv = shift;
490 $code.=<<___;
491         mov $desv.d[0],$src0
492         mov $desv.d[1],$src1
493 ___
494         &rev32_armeb($desv,$desv);
495 }
496
497 sub mov_vec_to_reg() {
498         my $srcv = shift;
499         my $des0 = shift;
500         my $des1 = shift;
501 $code.=<<___;
502         mov $des0,$srcv.d[0]
503         mov $des1,$srcv.d[1]
504 ___
505 }
506
507 sub compute_tweak() {
508         my $src0 = shift;
509         my $src1 = shift;
510         my $des0 = shift;
511         my $des1 = shift;
512 $code.=<<___;
513         mov $wtmp0,0x87
514         extr    $xtmp2,$src1,$src1,#32
515         extr    $des1,$src1,$src0,#63
516         and     $wtmp1,$wtmp0,$wtmp2,asr#31
517         eor     $des0,$xtmp1,$src0,lsl#1
518 ___
519 }
520
521 sub compute_tweak_vec() {
522         my $src = shift;
523         my $des = shift;
524         my $std = shift;
525         &rbit(@vtmp[2],$src,$std);
526 $code.=<<___;
527         ldr  @qtmp[0], =0x01010101010101010101010101010187
528         shl  $des.16b, @vtmp[2].16b, #1
529         ext  @vtmp[1].16b, @vtmp[2].16b, @vtmp[2].16b,#15
530         ushr @vtmp[1].16b, @vtmp[1].16b, #7
531         mul  @vtmp[1].16b, @vtmp[1].16b, @vtmp[0].16b
532         eor  $des.16b, $des.16b, @vtmp[1].16b
533 ___
534         &rbit($des,$des,$std);
535 }
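# Reference sketch (illustration only, never called by this script):
# compute_tweak()/compute_tweak_vec() multiply the 128-bit XTS tweak by x
# in GF(2^128) with the usual reduction constant 0x87, i.e. a one-bit left
# shift of the 16-byte little-endian tweak with a conditional XOR of 0x87
# into byte 0.  The "_gb" variants additionally bit-reverse the tweak bytes
# (rbit) around this step to match the GB/T bit ordering.
sub ref_xts_next_tweak {
        my @t = @_;                     # 16 bytes, least significant first
        my $carry = 0;
        for my $j (0..15) {
                my $msb = ($t[$j] >> 7) & 1;
                $t[$j] = (($t[$j] << 1) | $carry) & 0xff;
                $carry = $msb;
        }
        $t[0] ^= 0x87 if $carry;
        return @t;
}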
536
537 $code=<<___;
538 #include "arm_arch.h"
539 .arch   armv8-a
540 .text
541
542 .type   _vpsm4_consts,%object
543 .align  7
544 _vpsm4_consts:
545 .Lsbox:
546         .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
547         .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
548         .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
549         .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
550         .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
551         .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
552         .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
553         .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
554         .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
555         .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
556         .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
557         .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
558         .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
559         .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
560         .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
561         .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
562 .Lck:
563         .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
564         .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
565         .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
566         .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
567         .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
568         .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
569         .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
570         .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
571 .Lfk:
572         .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
573 .Lshuffles:
574         .dword 0x0B0A090807060504,0x030201000F0E0D0C
575
576 .size   _vpsm4_consts,.-_vpsm4_consts
577 ___
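# Reference sketch (illustration only, never called by this script): the
# .Lck words above are the standard SM4 key-schedule constants, whose bytes
# are (4*i + j)*7 mod 256 for j = 0..3, packed most significant byte first
# (so ck_0 = 0x00070E15); .Lfk holds the four FK constants and .Lshuffles
# the byte permutation that rotates the key state by one word per round in
# _vpsm4_set_key.
sub ref_sm4_ck {
        my $i = shift;                  # 0 .. 31
        my $ck = 0;
        for my $j (0..3) {
                $ck = ($ck << 8) | ((4*$i + $j)*7 & 0xff);
        }
        return $ck;
}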
578
579 {{{
580 my ($key,$keys,$enc)=("x0","x1","w2");
581 my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
582 my ($vkey,$vfk,$vmap)=("v5","v6","v7");
583 $code.=<<___;
584 .type   _vpsm4_set_key,%function
585 .align  4
586 _vpsm4_set_key:
587         AARCH64_VALID_CALL_TARGET
588         ld1     {$vkey.4s},[$key]
589 ___
590         &load_sbox();
591         &rev32($vkey,$vkey);
592 $code.=<<___;
593         adr     $pointer,.Lshuffles
594         ld1     {$vmap.2d},[$pointer]
595         adr     $pointer,.Lfk
596         ld1     {$vfk.2d},[$pointer]
597         eor     $vkey.16b,$vkey.16b,$vfk.16b
598         mov     $schedules,#32
599         adr     $pointer,.Lck
600         movi    @vtmp[0].16b,#64
601         cbnz    $enc,1f
602         add     $keys,$keys,124
603 1:
604         mov     $wtmp,$vkey.s[1]
605         ldr     $roundkey,[$pointer],#4
606         eor     $roundkey,$roundkey,$wtmp
607         mov     $wtmp,$vkey.s[2]
608         eor     $roundkey,$roundkey,$wtmp
609         mov     $wtmp,$vkey.s[3]
610         eor     $roundkey,$roundkey,$wtmp
611         // sbox lookup
612         mov     @data[0].s[0],$roundkey
613         tbl     @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
614         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
615         tbx     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
616         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
617         tbx     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
618         sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
619         tbx     @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
620         mov     $wtmp,@vtmp[1].s[0]
621         eor     $roundkey,$wtmp,$wtmp,ror #19
622         eor     $roundkey,$roundkey,$wtmp,ror #9
623         mov     $wtmp,$vkey.s[0]
624         eor     $roundkey,$roundkey,$wtmp
625         mov     $vkey.s[0],$roundkey
626         cbz     $enc,2f
627         str     $roundkey,[$keys],#4
628         b       3f
629 2:
630         str     $roundkey,[$keys],#-4
631 3:
632         tbl     $vkey.16b,{$vkey.16b},$vmap.16b
633         subs    $schedules,$schedules,#1
634         b.ne    1b
635         ret
636 .size   _vpsm4_set_key,.-_vpsm4_set_key
637 ___
638 }}}
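# Reference sketch (illustration only, never called by this script): in
# _vpsm4_set_key the "ror #19"/"ror #9" EOR pair implements the key-schedule
# linear transform L'(B) = B ^ (B<<<13) ^ (B<<<23), since a left rotation by
# n equals a right rotation by 32-n.  The whole schedule is
#   K0..K3 = MK0..MK3 ^ FK0..FK3
#   rk_i   = K_{i+4} = K_i ^ L'(Sbox(K_{i+1} ^ K_{i+2} ^ K_{i+3} ^ CK_i))
# with the round keys written in reverse order when generating decryption
# keys (the add of 124 to $keys and the str with #-4 post-index above).
sub ref_sm4_key_Lprime {
        my $b = shift;
        my $rotl = sub { (($_[0] << $_[1]) | ($_[0] >> (32-$_[1]))) & 0xffffffff };
        return $b ^ $rotl->($b,13) ^ $rotl->($b,23);
}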
639
640
641 {{{
642 $code.=<<___;
643 .type   _vpsm4_enc_4blks,%function
644 .align  4
645 _vpsm4_enc_4blks:
646         AARCH64_VALID_CALL_TARGET
647 ___
648         &encrypt_4blks();
649 $code.=<<___;
650         ret
651 .size   _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
652 ___
653 }}}
654
655 {{{
656 $code.=<<___;
657 .type   _vpsm4_enc_8blks,%function
658 .align  4
659 _vpsm4_enc_8blks:
660         AARCH64_VALID_CALL_TARGET
661 ___
662         &encrypt_8blks();
663 $code.=<<___;
664         ret
665 .size   _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
666 ___
667 }}}
668
669
670 {{{
671 my ($key,$keys)=("x0","x1");
672 $code.=<<___;
673 .globl  ${prefix}_set_encrypt_key
674 .type   ${prefix}_set_encrypt_key,%function
675 .align  5
676 ${prefix}_set_encrypt_key:
677         AARCH64_SIGN_LINK_REGISTER
678         stp     x29,x30,[sp,#-16]!
679         mov     w2,1
680         bl      _vpsm4_set_key
681         ldp     x29,x30,[sp],#16
682         AARCH64_VALIDATE_LINK_REGISTER
683         ret
684 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
685 ___
686 }}}
687
688 {{{
689 my ($key,$keys)=("x0","x1");
690 $code.=<<___;
691 .globl  ${prefix}_set_decrypt_key
692 .type   ${prefix}_set_decrypt_key,%function
693 .align  5
694 ${prefix}_set_decrypt_key:
695         AARCH64_SIGN_LINK_REGISTER
696         stp     x29,x30,[sp,#-16]!
697         mov     w2,0
698         bl      _vpsm4_set_key
699         ldp     x29,x30,[sp],#16
700         AARCH64_VALIDATE_LINK_REGISTER
701         ret
702 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
703 ___
704 }}}
705
706 {{{
707 sub gen_block () {
708         my $dir = shift;
709         my ($inp,$outp,$rk)=map("x$_",(0..2));
710
711 $code.=<<___;
712 .globl  ${prefix}_${dir}crypt
713 .type   ${prefix}_${dir}crypt,%function
714 .align  5
715 ${prefix}_${dir}crypt:
716         AARCH64_VALID_CALL_TARGET
717         ld1     {@data[0].4s},[$inp]
718 ___
719         &load_sbox();
720         &rev32(@data[0],@data[0]);
721 $code.=<<___;
722         mov     $rks,x2
723 ___
724         &encrypt_1blk(@data[0]);
725 $code.=<<___;
726         st1     {@data[0].4s},[$outp]
727         ret
728 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
729 ___
730 }
731 &gen_block("en");
732 &gen_block("de");
733 }}}
734
735 {{{
736 my ($enc) = ("w4");
737 my @dat=map("v$_",(16..23));
738
739 $code.=<<___;
740 .globl  ${prefix}_ecb_encrypt
741 .type   ${prefix}_ecb_encrypt,%function
742 .align  5
743 ${prefix}_ecb_encrypt:
744         AARCH64_SIGN_LINK_REGISTER
745         // convert length into blocks
746         lsr     x2,x2,4
747         stp     d8,d9,[sp,#-80]!
748         stp     d10,d11,[sp,#16]
749         stp     d12,d13,[sp,#32]
750         stp     d14,d15,[sp,#48]
751         stp     x29,x30,[sp,#64]
752 ___
753         &load_sbox();
754 $code.=<<___;
755 .Lecb_8_blocks_process:
756         cmp     $blocks,#8
757         b.lt    .Lecb_4_blocks_process
758         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
759         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
760 ___
761         &rev32(@data[0],@data[0]);
762         &rev32(@data[1],@data[1]);
763         &rev32(@data[2],@data[2]);
764         &rev32(@data[3],@data[3]);
765         &rev32(@datax[0],@datax[0]);
766         &rev32(@datax[1],@datax[1]);
767         &rev32(@datax[2],@datax[2]);
768         &rev32(@datax[3],@datax[3]);
769 $code.=<<___;
770         bl      _vpsm4_enc_8blks
771         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
772         st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
773         subs    $blocks,$blocks,#8
774         b.gt    .Lecb_8_blocks_process
775         b       100f
776 .Lecb_4_blocks_process:
777         cmp     $blocks,#4
778         b.lt    1f
779         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
780 ___
781         &rev32(@data[0],@data[0]);
782         &rev32(@data[1],@data[1]);
783         &rev32(@data[2],@data[2]);
784         &rev32(@data[3],@data[3]);
785 $code.=<<___;
786         bl      _vpsm4_enc_4blks
787         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
788         sub     $blocks,$blocks,#4
789 1:
790         // process last block
791         cmp     $blocks,#1
792         b.lt    100f
793         b.gt    1f
794         ld1     {@data[0].4s},[$inp]
795 ___
796         &rev32(@data[0],@data[0]);
797         &encrypt_1blk(@data[0]);
798 $code.=<<___;
799         st1     {@data[0].4s},[$outp]
800         b       100f
801 1:      // process last 2 blocks
802         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
803         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
804         cmp     $blocks,#2
805         b.gt    1f
806 ___
807         &rev32(@data[0],@data[0]);
808         &rev32(@data[1],@data[1]);
809         &rev32(@data[2],@data[2]);
810         &rev32(@data[3],@data[3]);
811 $code.=<<___;
812         bl      _vpsm4_enc_4blks
813         st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
814         st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
815         b       100f
816 1:      // process last 3 blocks
817         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
818 ___
819         &rev32(@data[0],@data[0]);
820         &rev32(@data[1],@data[1]);
821         &rev32(@data[2],@data[2]);
822         &rev32(@data[3],@data[3]);
823 $code.=<<___;
824         bl      _vpsm4_enc_4blks
825         st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
826         st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
827         st4     {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
828 100:
829         ldp     d10,d11,[sp,#16]
830         ldp     d12,d13,[sp,#32]
831         ldp     d14,d15,[sp,#48]
832         ldp     x29,x30,[sp,#64]
833         ldp     d8,d9,[sp],#80
834         AARCH64_VALIDATE_LINK_REGISTER
835         ret
836 .size   ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
837 ___
838 }}}
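# Reference sketch (illustration only, never called by this script) of the
# block scheduling used by the ECB path above: the byte length is first
# turned into a block count (lsr #4), then blocks are consumed eight at a
# time, then one group of four, and finally a tail of 1-3 blocks handled
# with per-lane ld4/st4 accesses.
sub ref_ecb_schedule {
        my $len    = shift;             # length in bytes
        my $blocks = $len >> 4;
        my @groups;
        while ($blocks >= 8) { push @groups, 8; $blocks -= 8; }
        if    ($blocks >= 4) { push @groups, 4; $blocks -= 4; }
        push @groups, $blocks if $blocks;       # 1, 2 or 3 remaining blocks
        return @groups;
}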
839
840 {{{
841 my ($len,$ivp,$enc)=("x2","x4","w5");
842 my $ivec0=("v3");
843 my $ivec1=("v15");
844
845 $code.=<<___;
846 .globl  ${prefix}_cbc_encrypt
847 .type   ${prefix}_cbc_encrypt,%function
848 .align  5
849 ${prefix}_cbc_encrypt:
850         AARCH64_VALID_CALL_TARGET
851         lsr     $len,$len,4
852 ___
853         &load_sbox();
854 $code.=<<___;
855         cbz     $enc,.Ldec
856         ld1     {$ivec0.4s},[$ivp]
857 .Lcbc_4_blocks_enc:
858         cmp     $blocks,#4
859         b.lt    1f
860         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
861         eor     @data[0].16b,@data[0].16b,$ivec0.16b
862 ___
863         &rev32(@data[1],@data[1]);
864         &rev32(@data[0],@data[0]);
865         &rev32(@data[2],@data[2]);
866         &rev32(@data[3],@data[3]);
867         &encrypt_1blk_norev(@data[0]);
868 $code.=<<___;
869         eor     @data[1].16b,@data[1].16b,@data[0].16b
870 ___
871         &encrypt_1blk_norev(@data[1]);
872         &rev32(@data[0],@data[0]);
873
874 $code.=<<___;
875         eor     @data[2].16b,@data[2].16b,@data[1].16b
876 ___
877         &encrypt_1blk_norev(@data[2]);
878         &rev32(@data[1],@data[1]);
879 $code.=<<___;
880         eor     @data[3].16b,@data[3].16b,@data[2].16b
881 ___
882         &encrypt_1blk_norev(@data[3]);
883         &rev32(@data[2],@data[2]);
884         &rev32(@data[3],@data[3]);
885 $code.=<<___;
886         orr     $ivec0.16b,@data[3].16b,@data[3].16b
887         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
888         subs    $blocks,$blocks,#4
889         b.ne    .Lcbc_4_blocks_enc
890         b       2f
891 1:
892         subs    $blocks,$blocks,#1
893         b.lt    2f
894         ld1     {@data[0].4s},[$inp],#16
895         eor     $ivec0.16b,$ivec0.16b,@data[0].16b
896 ___
897         &rev32($ivec0,$ivec0);
898         &encrypt_1blk($ivec0);
899 $code.=<<___;
900         st1     {$ivec0.4s},[$outp],#16
901         b       1b
902 2:
903         // save back IV
904         st1     {$ivec0.4s},[$ivp]
905         ret
906
907 .Ldec:
908         // decryption mode starts
909         AARCH64_SIGN_LINK_REGISTER
910         stp     d8,d9,[sp,#-80]!
911         stp     d10,d11,[sp,#16]
912         stp     d12,d13,[sp,#32]
913         stp     d14,d15,[sp,#48]
914         stp     x29,x30,[sp,#64]
915 .Lcbc_8_blocks_dec:
916         cmp     $blocks,#8
917         b.lt    1f
918         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
919         add     $ptr,$inp,#64
920         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
921 ___
922         &rev32(@data[0],@data[0]);
923         &rev32(@data[1],@data[1]);
924         &rev32(@data[2],@data[2]);
925         &rev32(@data[3],@data[3]);
926         &rev32(@datax[0],@datax[0]);
927         &rev32(@datax[1],@datax[1]);
928         &rev32(@datax[2],@datax[2]);
929         &rev32(@datax[3],@datax[3]);
930 $code.=<<___;
931         bl      _vpsm4_enc_8blks
932 ___
933         &transpose(@vtmp,@datax);
934         &transpose(@data,@datax);
935 $code.=<<___;
936         ld1     {$ivec1.4s},[$ivp]
937         ld1     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
938         // note that ivec1 and vtmpx[3] are reusing the same register;
939         // care needs to be taken to avoid a conflict
940         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
941         ld1     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
942         eor     @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
943         eor     @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
944         eor     @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
945         // save back IV
946         st1     {@vtmpx[3].4s}, [$ivp]
947         eor     @data[0].16b,@data[0].16b,@datax[3].16b
948         eor     @data[1].16b,@data[1].16b,@vtmpx[0].16b
949         eor     @data[2].16b,@data[2].16b,@vtmpx[1].16b
950         eor     @data[3].16b,@data[3].16b,@vtmpx[2].16b
951         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
952         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
953         subs    $blocks,$blocks,#8
954         b.gt    .Lcbc_8_blocks_dec
955         b.eq    100f
956 1:
957         ld1     {$ivec1.4s},[$ivp]
958 .Lcbc_4_blocks_dec:
959         cmp     $blocks,#4
960         b.lt    1f
961         ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
962 ___
963         &rev32(@data[0],@data[0]);
964         &rev32(@data[1],@data[1]);
965         &rev32(@data[2],@data[2]);
966         &rev32(@data[3],@data[3]);
967 $code.=<<___;
968         bl      _vpsm4_enc_4blks
969         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
970 ___
971         &transpose(@vtmp,@datax);
972 $code.=<<___;
973         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
974         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
975         orr     $ivec1.16b,@data[3].16b,@data[3].16b
976         eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
977         eor     @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
978         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
979         subs    $blocks,$blocks,#4
980         b.gt    .Lcbc_4_blocks_dec
981         // save back IV
982         st1     {@data[3].4s}, [$ivp]
983         b       100f
984 1:      // last block
985         subs    $blocks,$blocks,#1
986         b.lt    100f
987         b.gt    1f
988         ld1     {@data[0].4s},[$inp],#16
989         // save back IV
990         st1     {@data[0].4s}, [$ivp]
991 ___
992         &rev32(@datax[0],@data[0]);
993         &encrypt_1blk(@datax[0]);
994 $code.=<<___;
995         eor     @datax[0].16b,@datax[0].16b,$ivec1.16b
996         st1     {@datax[0].4s},[$outp],#16
997         b       100f
998 1:      // last two blocks
999         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
1000         add     $ptr,$inp,#16
1001         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
1002         subs    $blocks,$blocks,1
1003         b.gt    1f
1004 ___
1005         &rev32(@data[0],@data[0]);
1006         &rev32(@data[1],@data[1]);
1007         &rev32(@data[2],@data[2]);
1008         &rev32(@data[3],@data[3]);
1009 $code.=<<___;
1010         bl      _vpsm4_enc_4blks
1011         ld1     {@data[0].4s,@data[1].4s},[$inp],#32
1012 ___
1013         &transpose(@vtmp,@datax);
1014 $code.=<<___;
1015         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1016         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1017         st1     {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1018         // save back IV
1019         st1     {@data[1].4s}, [$ivp]
1020         b       100f
1021 1:      // last 3 blocks
1022         ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
1023 ___
1024         &rev32(@data[0],@data[0]);
1025         &rev32(@data[1],@data[1]);
1026         &rev32(@data[2],@data[2]);
1027         &rev32(@data[3],@data[3]);
1028 $code.=<<___;
1029         bl      _vpsm4_enc_4blks
1030         ld1     {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1031 ___
1032         &transpose(@vtmp,@datax);
1033 $code.=<<___;
1034         eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
1035         eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
1036         eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
1037         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1038         // save back IV
1039         st1     {@data[2].4s}, [$ivp]
1040 100:
1041         ldp     d10,d11,[sp,#16]
1042         ldp     d12,d13,[sp,#32]
1043         ldp     d14,d15,[sp,#48]
1044         ldp     x29,x30,[sp,#64]
1045         ldp     d8,d9,[sp],#80
1046         AARCH64_VALIDATE_LINK_REGISTER
1047         ret
1048 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
1049 ___
1050 }}}
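# Reference sketch (illustration only, never called by this script) of the
# CBC chaining implemented above; $cipher is a stand-in for the SM4 block
# transform.  Encryption is inherently serial (C_i = E(P_i ^ C_{i-1})),
# which is why the code above chains encrypt_1blk_norev() calls even in its
# 4-block path, whereas decryption XORs the previous ciphertext only after
# the block cipher and can therefore decrypt 4 or 8 blocks in parallel.
sub ref_cbc_encrypt {
        my ($cipher,$iv,@blocks) = @_;  # 16-byte strings
        my ($prev,@out) = ($iv);
        for my $p (@blocks) {
                $prev = $cipher->($p ^ $prev);  # C_i = E(P_i ^ C_{i-1})
                push @out, $prev;
        }
        return @out;                    # the last entry is the new IV
}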
1051
1052 {{{
1053 my ($ivp)=("x4");
1054 my ($ctr)=("w5");
1055 my $ivec=("v3");
1056
1057 $code.=<<___;
1058 .globl  ${prefix}_ctr32_encrypt_blocks
1059 .type   ${prefix}_ctr32_encrypt_blocks,%function
1060 .align  5
1061 ${prefix}_ctr32_encrypt_blocks:
1062         AARCH64_VALID_CALL_TARGET
1063         ld1     {$ivec.4s},[$ivp]
1064 ___
1065         &rev32($ivec,$ivec);
1066         &load_sbox();
1067 $code.=<<___;
1068         cmp     $blocks,#1
1069         b.ne    1f
1070         // fast processing for a single block without
1071         // the context saving overhead
1072 ___
1073         &encrypt_1blk($ivec);
1074 $code.=<<___;
1075         ld1     {@data[0].4s},[$inp]
1076         eor     @data[0].16b,@data[0].16b,$ivec.16b
1077         st1     {@data[0].4s},[$outp]
1078         ret
1079 1:
1080         AARCH64_SIGN_LINK_REGISTER
1081         stp     d8,d9,[sp,#-80]!
1082         stp     d10,d11,[sp,#16]
1083         stp     d12,d13,[sp,#32]
1084         stp     d14,d15,[sp,#48]
1085         stp     x29,x30,[sp,#64]
1086         mov     $word0,$ivec.s[0]
1087         mov     $word1,$ivec.s[1]
1088         mov     $word2,$ivec.s[2]
1089         mov     $ctr,$ivec.s[3]
1090 .Lctr32_4_blocks_process:
1091         cmp     $blocks,#4
1092         b.lt    1f
1093         dup     @data[0].4s,$word0
1094         dup     @data[1].4s,$word1
1095         dup     @data[2].4s,$word2
1096         mov     @data[3].s[0],$ctr
1097         add     $ctr,$ctr,#1
1098         mov     @data[3].s[1],$ctr
1099         add     $ctr,$ctr,#1
1100         mov     @data[3].s[2],$ctr
1101         add     $ctr,$ctr,#1
1102         mov     @data[3].s[3],$ctr
1103         add     $ctr,$ctr,#1
1104         cmp     $blocks,#8
1105         b.ge    .Lctr32_8_blocks_process
1106         bl      _vpsm4_enc_4blks
1107         ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1108         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1109         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1110         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1111         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1112         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1113         subs    $blocks,$blocks,#4
1114         b.ne    .Lctr32_4_blocks_process
1115         b       100f
1116 .Lctr32_8_blocks_process:
1117         dup     @datax[0].4s,$word0
1118         dup     @datax[1].4s,$word1
1119         dup     @datax[2].4s,$word2
1120         mov     @datax[3].s[0],$ctr
1121         add     $ctr,$ctr,#1
1122         mov     @datax[3].s[1],$ctr
1123         add     $ctr,$ctr,#1
1124         mov     @datax[3].s[2],$ctr
1125         add     $ctr,$ctr,#1
1126         mov     @datax[3].s[3],$ctr
1127         add     $ctr,$ctr,#1
1128         bl      _vpsm4_enc_8blks
1129         ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
1130         ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1131         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1132         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1133         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1134         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1135         eor     @data[0].16b,@data[0].16b,@datax[0].16b
1136         eor     @data[1].16b,@data[1].16b,@datax[1].16b
1137         eor     @data[2].16b,@data[2].16b,@datax[2].16b
1138         eor     @data[3].16b,@data[3].16b,@datax[3].16b
1139         st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1140         st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1141         subs    $blocks,$blocks,#8
1142         b.ne    .Lctr32_4_blocks_process
1143         b       100f
1144 1:      // last block processing
1145         subs    $blocks,$blocks,#1
1146         b.lt    100f
1147         b.gt    1f
1148         mov     $ivec.s[0],$word0
1149         mov     $ivec.s[1],$word1
1150         mov     $ivec.s[2],$word2
1151         mov     $ivec.s[3],$ctr
1152 ___
1153         &encrypt_1blk($ivec);
1154 $code.=<<___;
1155         ld1     {@data[0].4s},[$inp]
1156         eor     @data[0].16b,@data[0].16b,$ivec.16b
1157         st1     {@data[0].4s},[$outp]
1158         b       100f
1159 1:      // last 2 blocks processing
1160         dup     @data[0].4s,$word0
1161         dup     @data[1].4s,$word1
1162         dup     @data[2].4s,$word2
1163         mov     @data[3].s[0],$ctr
1164         add     $ctr,$ctr,#1
1165         mov     @data[3].s[1],$ctr
1166         subs    $blocks,$blocks,#1
1167         b.ne    1f
1168         bl      _vpsm4_enc_4blks
1169         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1170         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1171         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1172         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1173         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1174         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1175         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1176         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1177         b       100f
1178 1:      // last 3 blocks processing
1179         add     $ctr,$ctr,#1
1180         mov     @data[3].s[2],$ctr
1181         bl      _vpsm4_enc_4blks
1182         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
1183         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
1184         ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
1185         eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
1186         eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
1187         eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
1188         eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
1189         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
1190         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
1191         st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
1192 100:
1193         ldp     d10,d11,[sp,#16]
1194         ldp     d12,d13,[sp,#32]
1195         ldp     d14,d15,[sp,#48]
1196         ldp     x29,x30,[sp,#64]
1197         ldp     d8,d9,[sp],#80
1198         AARCH64_VALIDATE_LINK_REGISTER
1199         ret
1200 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1201 ___
1202 }}}
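# Reference sketch (illustration only, never called by this script) of the
# counter handling above: only the last 32-bit word of the IV is
# incremented (modulo 2^32, hence "ctr32"), the first three words stay
# fixed, and every counter block is encrypted and XORed into the input.
# $cipher is a stand-in for the SM4 encryption of four 32-bit words.
sub ref_ctr32_encrypt {
        my ($cipher,$iv,@blocks) = @_;  # $iv and each block: ref to 4 words
        my @ctr = @$iv;
        my @out;
        for my $p (@blocks) {
                my @ks = $cipher->(@ctr);
                push @out, [ map { $p->[$_] ^ $ks[$_] } 0..3 ];
                $ctr[3] = ($ctr[3] + 1) & 0xffffffff;
        }
        return @out;
}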
1203
1204 {{{
1205 my ($blocks,$len)=("x2","x2");
1206 my $ivp=("x5");
1207 my @twx=map("x$_",(12..27));
1208 my ($rks1,$rks2)=("x26","x27");
1209 my $lastBlk=("x26");
1210 my $enc=("w28");
1211 my $remain=("x29");
1212
1213 my @tweak=@datax;
1214
1215 sub gen_xts_cipher() {
1216         my $std = shift;
1217 $code.=<<___;
1218 .globl  ${prefix}_xts_encrypt${std}
1219 .type   ${prefix}_xts_encrypt${std},%function
1220 .align  5
1221 ${prefix}_xts_encrypt${std}:
1222         AARCH64_SIGN_LINK_REGISTER
1223         stp     x15, x16, [sp, #-0x10]!
1224         stp     x17, x18, [sp, #-0x10]!
1225         stp     x19, x20, [sp, #-0x10]!
1226         stp     x21, x22, [sp, #-0x10]!
1227         stp     x23, x24, [sp, #-0x10]!
1228         stp     x25, x26, [sp, #-0x10]!
1229         stp     x27, x28, [sp, #-0x10]!
1230         stp     x29, x30, [sp, #-0x10]!
1231         stp     d8, d9, [sp, #-0x10]!
1232         stp     d10, d11, [sp, #-0x10]!
1233         stp     d12, d13, [sp, #-0x10]!
1234         stp     d14, d15, [sp, #-0x10]!
1235         mov     $rks1,x3
1236         mov     $rks2,x4
1237         mov     $enc,w6
1238         ld1     {@tweak[0].4s}, [$ivp]
1239         mov     $rks,$rks2
1240 ___
1241         &load_sbox();
1242         &rev32(@tweak[0],@tweak[0]);
1243         &encrypt_1blk(@tweak[0]);
1244 $code.=<<___;
1245         mov     $rks,$rks1
1246         and     $remain,$len,#0x0F
1247         // convert length into blocks
1248         lsr     $blocks,$len,4
1249         cmp     $blocks,#1
1250         b.lt .return${std}
1251
1252         cmp $remain,0
1253         // If the encryption/decryption length is a multiple of 16,
1254         // all blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1255         b.eq .xts_encrypt_blocks${std}
1256
1257         // If the encryption/decryption length is not a multiple of 16,
1258         // the last two blocks are encrypted/decrypted in .last_2blks_tweak${std} or .only_2blks_tweak${std},
1259         // and the other blocks are encrypted/decrypted in .xts_encrypt_blocks${std}
1260         subs $blocks,$blocks,#1
1261         b.eq .only_2blks_tweak${std}
1262 .xts_encrypt_blocks${std}:
1263 ___
1264         &rbit(@tweak[0],@tweak[0],$std);
1265         &rev32_armeb(@tweak[0],@tweak[0]);
1266         &mov_vec_to_reg(@tweak[0],@twx[0],@twx[1]);
1267         &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1268         &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1269         &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1270         &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1271         &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1272         &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1273         &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1274 $code.=<<___;
1275 .Lxts_8_blocks_process${std}:
1276         cmp     $blocks,#8
1277         b.lt    .Lxts_4_blocks_process${std}
1278 ___
1279         &mov_reg_to_vec(@twx[0],@twx[1],@vtmp[0]);
1280         &mov_reg_to_vec(@twx[2],@twx[3],@vtmp[1]);
1281         &mov_reg_to_vec(@twx[4],@twx[5],@vtmp[2]);
1282         &mov_reg_to_vec(@twx[6],@twx[7],@vtmp[3]);
1283         &mov_reg_to_vec(@twx[8],@twx[9],@vtmpx[0]);
1284         &mov_reg_to_vec(@twx[10],@twx[11],@vtmpx[1]);
1285         &mov_reg_to_vec(@twx[12],@twx[13],@vtmpx[2]);
1286         &mov_reg_to_vec(@twx[14],@twx[15],@vtmpx[3]);
1287 $code.=<<___;
1288         ld1 {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1289 ___
1290         &rbit(@vtmp[0],@vtmp[0],$std);
1291         &rbit(@vtmp[1],@vtmp[1],$std);
1292         &rbit(@vtmp[2],@vtmp[2],$std);
1293         &rbit(@vtmp[3],@vtmp[3],$std);
1294 $code.=<<___;
1295         eor @data[0].16b, @data[0].16b, @vtmp[0].16b
1296         eor @data[1].16b, @data[1].16b, @vtmp[1].16b
1297         eor @data[2].16b, @data[2].16b, @vtmp[2].16b
1298         eor @data[3].16b, @data[3].16b, @vtmp[3].16b
1299         ld1     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
1300 ___
1301         &rbit(@vtmpx[0],@vtmpx[0],$std);
1302         &rbit(@vtmpx[1],@vtmpx[1],$std);
1303         &rbit(@vtmpx[2],@vtmpx[2],$std);
1304         &rbit(@vtmpx[3],@vtmpx[3],$std);
1305 $code.=<<___;
1306         eor @datax[0].16b, @datax[0].16b, @vtmpx[0].16b
1307         eor @datax[1].16b, @datax[1].16b, @vtmpx[1].16b
1308         eor @datax[2].16b, @datax[2].16b, @vtmpx[2].16b
1309         eor @datax[3].16b, @datax[3].16b, @vtmpx[3].16b
1310 ___
1311         &rev32(@data[0],@data[0]);
1312         &rev32(@data[1],@data[1]);
1313         &rev32(@data[2],@data[2]);
1314         &rev32(@data[3],@data[3]);
1315         &rev32(@datax[0],@datax[0]);
1316         &rev32(@datax[1],@datax[1]);
1317         &rev32(@datax[2],@datax[2]);
1318         &rev32(@datax[3],@datax[3]);
1319         &transpose(@data,@vtmp);
1320         &transpose(@datax,@vtmp);
1321 $code.=<<___;
1322         bl      _${prefix}_enc_8blks
1323 ___
1324         &transpose(@vtmp,@datax);
1325         &transpose(@data,@datax);
1326
1327         &mov_reg_to_vec(@twx[0],@twx[1],@vtmpx[0]);
1328         &compute_tweak(@twx[14],@twx[15],@twx[0],@twx[1]);
1329         &mov_reg_to_vec(@twx[2],@twx[3],@vtmpx[1]);
1330         &compute_tweak(@twx[0],@twx[1],@twx[2],@twx[3]);
1331         &mov_reg_to_vec(@twx[4],@twx[5],@vtmpx[2]);
1332         &compute_tweak(@twx[2],@twx[3],@twx[4],@twx[5]);
1333         &mov_reg_to_vec(@twx[6],@twx[7],@vtmpx[3]);
1334         &compute_tweak(@twx[4],@twx[5],@twx[6],@twx[7]);
1335         &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1336         &compute_tweak(@twx[6],@twx[7],@twx[8],@twx[9]);
1337         &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1338         &compute_tweak(@twx[8],@twx[9],@twx[10],@twx[11]);
1339         &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1340         &compute_tweak(@twx[10],@twx[11],@twx[12],@twx[13]);
1341         &mov_reg_to_vec(@twx[14],@twx[15],@tweak[3]);
1342         &compute_tweak(@twx[12],@twx[13],@twx[14],@twx[15]);
1343 $code.=<<___;
1344         eor @vtmp[0].16b, @vtmp[0].16b, @vtmpx[0].16b
1345         eor @vtmp[1].16b, @vtmp[1].16b, @vtmpx[1].16b
1346         eor @vtmp[2].16b, @vtmp[2].16b, @vtmpx[2].16b
1347         eor @vtmp[3].16b, @vtmp[3].16b, @vtmpx[3].16b
1348         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1349         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1350         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1351         eor @data[3].16b, @data[3].16b, @tweak[3].16b
1352
1353         // save the last tweak
1354         st1     {@tweak[3].4s},[$ivp]
1355         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1356         st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
1357         subs    $blocks,$blocks,#8
1358         b.gt    .Lxts_8_blocks_process${std}
1359         b       100f
1360 .Lxts_4_blocks_process${std}:
1361 ___
1362         &mov_reg_to_vec(@twx[0],@twx[1],@tweak[0]);
1363         &mov_reg_to_vec(@twx[2],@twx[3],@tweak[1]);
1364         &mov_reg_to_vec(@twx[4],@twx[5],@tweak[2]);
1365         &mov_reg_to_vec(@twx[6],@twx[7],@tweak[3]);
1366 $code.=<<___;
1367         cmp     $blocks,#4
1368         b.lt    1f
1369         ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
1370 ___
1371         &rbit(@tweak[0],@tweak[0],$std);
1372         &rbit(@tweak[1],@tweak[1],$std);
1373         &rbit(@tweak[2],@tweak[2],$std);
1374         &rbit(@tweak[3],@tweak[3],$std);
1375 $code.=<<___;
1376         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1377         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1378         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1379         eor @data[3].16b, @data[3].16b, @tweak[3].16b
1380 ___
1381         &rev32(@data[0],@data[0]);
1382         &rev32(@data[1],@data[1]);
1383         &rev32(@data[2],@data[2]);
1384         &rev32(@data[3],@data[3]);
1385         &transpose(@data,@vtmp);
1386 $code.=<<___;
1387         bl      _${prefix}_enc_4blks
1388 ___
1389         &transpose(@vtmp,@data);
1390 $code.=<<___;
1391         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1392         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1393         eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1394         eor @vtmp[3].16b, @vtmp[3].16b, @tweak[3].16b
1395         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
1396         sub     $blocks,$blocks,#4
1397 ___
1398         &mov_reg_to_vec(@twx[8],@twx[9],@tweak[0]);
1399         &mov_reg_to_vec(@twx[10],@twx[11],@tweak[1]);
1400         &mov_reg_to_vec(@twx[12],@twx[13],@tweak[2]);
1401 $code.=<<___;
1402         // save the last tweak
1403         st1     {@tweak[3].4s},[$ivp]
1404 1:
1405         // process last block
1406         cmp     $blocks,#1
1407         b.lt    100f
1408         b.gt    1f
1409         ld1     {@data[0].4s},[$inp],#16
1410 ___
1411         &rbit(@tweak[0],@tweak[0],$std);
1412 $code.=<<___;
1413         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1414 ___
1415         &rev32(@data[0],@data[0]);
1416         &encrypt_1blk(@data[0]);
1417 $code.=<<___;
1418         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1419         st1     {@data[0].4s},[$outp],#16
1420         // save the last tweak
1421         st1     {@tweak[0].4s},[$ivp]
1422         b       100f
1423 1:  // process last 2 blocks
1424         cmp     $blocks,#2
1425         b.gt    1f
1426         ld1     {@data[0].4s,@data[1].4s},[$inp],#32
1427 ___
1428         &rbit(@tweak[0],@tweak[0],$std);
1429         &rbit(@tweak[1],@tweak[1],$std);
1430 $code.=<<___;
1431         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1432         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1433 ___
1434         &rev32(@data[0],@data[0]);
1435         &rev32(@data[1],@data[1]);
1436         &transpose(@data,@vtmp);
1437 $code.=<<___;
1438         bl      _${prefix}_enc_4blks
1439 ___
1440         &transpose(@vtmp,@data);
1441 $code.=<<___;
1442         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1443         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1444         st1     {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
1445         // save the last tweak
1446         st1     {@tweak[1].4s},[$ivp]
1447         b       100f
1448 1:  // process last 3 blocks
1449         ld1     {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
1450 ___
1451         &rbit(@tweak[0],@tweak[0],$std);
1452         &rbit(@tweak[1],@tweak[1],$std);
1453         &rbit(@tweak[2],@tweak[2],$std);
1454 $code.=<<___;
1455         eor @data[0].16b, @data[0].16b, @tweak[0].16b
1456         eor @data[1].16b, @data[1].16b, @tweak[1].16b
1457         eor @data[2].16b, @data[2].16b, @tweak[2].16b
1458 ___
1459         &rev32(@data[0],@data[0]);
1460         &rev32(@data[1],@data[1]);
1461         &rev32(@data[2],@data[2]);
1462         &transpose(@data,@vtmp);
1463 $code.=<<___;
1464         bl      _${prefix}_enc_4blks
1465 ___
1466         &transpose(@vtmp,@data);
1467 $code.=<<___;
1468         eor @vtmp[0].16b, @vtmp[0].16b, @tweak[0].16b
1469         eor @vtmp[1].16b, @vtmp[1].16b, @tweak[1].16b
1470         eor @vtmp[2].16b, @vtmp[2].16b, @tweak[2].16b
1471         st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
1472         // save the last tweak
1473         st1     {@tweak[2].4s},[$ivp]
1474 100:
1475         cmp $remain,0
1476         b.eq .return${std}
1477
1478 // This branch calculates the last two tweaks
1479 // when the encryption/decryption length is larger than 32
1480 .last_2blks_tweak${std}:
1481         ld1     {@tweak[0].4s},[$ivp]
1482 ___
1483         &rev32_armeb(@tweak[0],@tweak[0]);
1484         &compute_tweak_vec(@tweak[0],@tweak[1],$std);
1485         &compute_tweak_vec(@tweak[1],@tweak[2],$std);
1486 $code.=<<___;
1487         b .check_dec${std}
1488
1489
1490 // This branch calculates the last two tweaks
1491 // when the encryption/decryption length is exactly 32, which only needs two tweaks
1492 .only_2blks_tweak${std}:
1493         mov @tweak[1].16b,@tweak[0].16b
1494 ___
1495         &rev32_armeb(@tweak[1],@tweak[1]);
1496         &compute_tweak_vec(@tweak[1],@tweak[2]);
1497 $code.=<<___;
1498         b .check_dec${std}
1499
1500
1501 // Determine whether encryption or decryption is required.
1502 // The last two tweaks need to be swapped for decryption.
1503 .check_dec${std}:
1504         // encryption:1 decryption:0
1505         cmp $enc,1
1506         b.eq .process_last_2blks${std}
1507         mov @vtmp[0].16b,@tweak[1].16b
1508         mov @tweak[1].16b,@tweak[2].16b
1509         mov @tweak[2].16b,@vtmp[0].16b
1510
1511 .process_last_2blks${std}:
1512 ___
1513         &rev32_armeb(@tweak[1],@tweak[1]);
1514         &rev32_armeb(@tweak[2],@tweak[2]);
1515 $code.=<<___;
1516         ld1     {@data[0].4s},[$inp],#16
1517         eor @data[0].16b, @data[0].16b, @tweak[1].16b
1518 ___
1519         &rev32(@data[0],@data[0]);
1520         &encrypt_1blk(@data[0]);
1521 $code.=<<___;
1522         eor @data[0].16b, @data[0].16b, @tweak[1].16b
1523         st1     {@data[0].4s},[$outp],#16
1524
1525         sub $lastBlk,$outp,16
1526         .loop${std}:
1527                 subs $remain,$remain,1
1528                 ldrb    $wtmp0,[$lastBlk,$remain]
1529                 ldrb    $wtmp1,[$inp,$remain]
1530                 strb    $wtmp1,[$lastBlk,$remain]
1531                 strb    $wtmp0,[$outp,$remain]
1532         b.gt .loop${std}
1533         ld1             {@data[0].4s}, [$lastBlk]       
1534         eor @data[0].16b, @data[0].16b, @tweak[2].16b
1535 ___
1536         &rev32(@data[0],@data[0]);
1537         &encrypt_1blk(@data[0]);
1538 $code.=<<___;
1539         eor @data[0].16b, @data[0].16b, @tweak[2].16b
1540         st1             {@data[0].4s}, [$lastBlk]
1541 .return${std}:
1542         ldp             d14, d15, [sp], #0x10
1543         ldp             d12, d13, [sp], #0x10
1544         ldp             d10, d11, [sp], #0x10
1545         ldp             d8, d9, [sp], #0x10
1546         ldp             x29, x30, [sp], #0x10
1547         ldp             x27, x28, [sp], #0x10
1548         ldp             x25, x26, [sp], #0x10
1549         ldp             x23, x24, [sp], #0x10
1550         ldp             x21, x22, [sp], #0x10
1551         ldp             x19, x20, [sp], #0x10
1552         ldp             x17, x18, [sp], #0x10
1553         ldp             x15, x16, [sp], #0x10
1554         AARCH64_VALIDATE_LINK_REGISTER
1555         ret
1556 .size   ${prefix}_xts_encrypt${std},.-${prefix}_xts_encrypt${std}
1557 ___
1558 } # end of gen_xts_cipher
1559 &gen_xts_cipher("_gb");
1560 &gen_xts_cipher("");
1561 }}}
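# Reference sketch (illustration only, never called by this script) of the
# ciphertext stealing done in the .last_2blks_tweak/.check_dec tail
# handling above: the last full block is encrypted under the first of the
# two final tweaks, the first $remain bytes of that result become the final
# partial ciphertext block, its remaining bytes pad the partial plaintext
# up to 16 bytes, and the padded block is encrypted under the second tweak.
# Decryption runs the same flow with the two tweaks swapped.  $cipher, $t1
# and $t2 are stand-ins for the SM4 block transform and the two tweaks.
sub ref_xts_steal_tail {
        my ($cipher,$t1,$t2,$last_full,$partial) = @_;  # 16-byte strings, $partial is 1-15 bytes
        my $r   = length($partial);
        my $cc  = $cipher->($last_full ^ $t1) ^ $t1;    # encrypt last full block
        my $cm  = substr($cc,0,$r);                     # stolen partial ciphertext
        my $pp  = $partial . substr($cc,$r);            # partial plaintext padded with the tail of cc
        my $cm1 = $cipher->($pp ^ $t2) ^ $t2;           # next-to-last ciphertext block
        return ($cm1,$cm);
}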
1562 ########################################
1563 open SELF,$0;
1564 while(<SELF>) {
1565         next if (/^#!/);
1566         last if (!s/^#/\/\// and !/^$/);
1567         print;
1568 }
1569 close SELF;
1570
1571 foreach(split("\n",$code)) {
1572         s/\`([^\`]*)\`/eval($1)/ge;
1573         print $_,"\n";
1574 }
1575
1576 close STDOUT or die "error closing STDOUT: $!";