#! /usr/bin/env perl
# Copyright 2020-2022 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# This module implements SM4 with ASIMD on aarch64
#
# Feb 2022
#

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

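# Register allocation (AArch64 ASIMD):
#   v0-v3   scratch (@vtmp), v4-v7 input blocks (@data),
#   v8-v11  second set of input blocks (@datax),
#   v12-v15 round keys/temporaries ($rk0,$rk1,$rka,$rkb; aliased by @vtmpx),
#   v16-v31 the 256-byte SM4 S-box, kept resident for tbl/tbx lookups.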
$prefix="vpsm4";
my @vtmp=map("v$_",(0..3));
my @data=map("v$_",(4..7));
my @datax=map("v$_",(8..11));
my ($rk0,$rk1)=("v12","v13");
my ($rka,$rkb)=("v14","v15");
my @vtmpx=map("v$_",(12..15));
my @sbox=map("v$_",(16..31));
my ($inp,$outp,$blocks,$rks)=("x0","x1","w2","x3");
my ($tmpw,$tmp,$wtmp0,$wtmp1,$wtmp2)=("w6","x6","w7","w8","w9");
my ($ptr,$counter)=("x10","w11");
my ($word0,$word1,$word2,$word3)=("w12","w13","w14","w15");

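# rev32() byte-swaps each 32-bit lane of $src into $dst (in place when only
# $dst is given). SM4 operates on big-endian words, so on little-endian
# hosts (__ARMEB__ undefined) every block is swapped on load and store;
# on big-endian builds the swap degenerates to a plain register move.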
sub rev32() {
        my $dst = shift;
        my $src = shift;

        if ($src and ("$src" ne "$dst")) {
$code.=<<___;
#ifndef __ARMEB__
        rev32   $dst.16b,$src.16b
#else
        mov     $dst.16b,$src.16b
#endif
___
        } else {
$code.=<<___;
#ifndef __ARMEB__
        rev32   $dst.16b,$dst.16b
#endif
___
        }
}

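# transpose() performs a 4x4 matrix transpose of 32-bit words across four
# vectors: zip1/zip2 interleave word pairs, then zip1/zip2 on doublewords
# complete the transpose. This converts between block order and the
# lane-sliced order (word i of every block gathered into one vector).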
sub transpose() {
        my ($dat0,$dat1,$dat2,$dat3,$vt0,$vt1,$vt2,$vt3) = @_;

$code.=<<___;
        zip1    $vt0.4s,$dat0.4s,$dat1.4s
        zip2    $vt1.4s,$dat0.4s,$dat1.4s
        zip1    $vt2.4s,$dat2.4s,$dat3.4s
        zip2    $vt3.4s,$dat2.4s,$dat3.4s
        zip1    $dat0.2d,$vt0.2d,$vt2.2d
        zip2    $dat1.2d,$vt0.2d,$vt2.2d
        zip1    $dat2.2d,$vt1.2d,$vt3.2d
        zip2    $dat3.2d,$vt1.2d,$vt3.2d
___
}

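# The 256-byte S-box spans v16-v31, but a single tbl instruction can index
# at most four table registers (64 bytes); out-of-range indices yield 0.
# The lookup is therefore split in four: the index is offset by 0/64/128/192
# and looked up in each 64-byte quarter, exactly one of the four results is
# nonzero, and add merges them. The substituted word then gets SM4's linear
# transform L: C = B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24), built from
# ushr/sli pairs (shift-right plus shift-left-insert makes a rotate).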
# sbox operations for four lanes of words
sub sbox() {
        my $dat = shift;

$code.=<<___;
        movi    @vtmp[0].16b,#64
        movi    @vtmp[1].16b,#128
        movi    @vtmp[2].16b,#192
        sub     @vtmp[0].16b,$dat.16b,@vtmp[0].16b
        sub     @vtmp[1].16b,$dat.16b,@vtmp[1].16b
        sub     @vtmp[2].16b,$dat.16b,@vtmp[2].16b
        tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
        tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
        tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
        tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
        add     @vtmp[0].2d,@vtmp[0].2d,@vtmp[1].2d
        add     @vtmp[2].2d,@vtmp[2].2d,$dat.2d
        add     $dat.2d,@vtmp[0].2d,@vtmp[2].2d

        ushr    @vtmp[0].4s,$dat.4s,32-2
        sli     @vtmp[0].4s,$dat.4s,2
        ushr    @vtmp[2].4s,$dat.4s,32-10
        eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
        sli     @vtmp[2].4s,$dat.4s,10
        eor     @vtmp[1].16b,@vtmp[2].16b,@vtmp[1].16b
        ushr    @vtmp[0].4s,$dat.4s,32-18
        sli     @vtmp[0].4s,$dat.4s,18
        ushr    @vtmp[2].4s,$dat.4s,32-24
        eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
        sli     @vtmp[2].4s,$dat.4s,24
        eor     $dat.16b,@vtmp[2].16b,@vtmp[1].16b
___
}

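# Same S-box lookup and linear transform, interleaved across two vectors
# ($dat and $datx) so independent instructions can overlap in the pipeline.
# The #64 constant stays in @vtmp[3] and the 64/128/192 offsets are derived
# by chained subtraction instead of three separate movi constants.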
# sbox operation for eight lanes of words
sub sbox_double() {
        my $dat = shift;
        my $datx = shift;

$code.=<<___;
        movi    @vtmp[3].16b,#64
        sub     @vtmp[0].16b,$dat.16b,@vtmp[3].16b
        sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
        sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
        tbl     $dat.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$dat.16b
        tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
        tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
        tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
        add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
        add     $dat.2d,@vtmp[2].2d,$dat.2d
        add     $dat.2d,@vtmp[1].2d,$dat.2d

        sub     @vtmp[0].16b,$datx.16b,@vtmp[3].16b
        sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[3].16b
        sub     @vtmp[2].16b,@vtmp[1].16b,@vtmp[3].16b
        tbl     $datx.16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},$datx.16b
        tbl     @vtmp[0].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[0].16b
        tbl     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[1].16b
        tbl     @vtmp[2].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[2].16b
        add     @vtmp[1].2d,@vtmp[0].2d,@vtmp[1].2d
        add     $datx.2d,@vtmp[2].2d,$datx.2d
        add     $datx.2d,@vtmp[1].2d,$datx.2d

        ushr    @vtmp[0].4s,$dat.4s,32-2
        sli     @vtmp[0].4s,$dat.4s,2
        ushr    @vtmp[2].4s,$datx.4s,32-2
        eor     @vtmp[1].16b,@vtmp[0].16b,$dat.16b
        sli     @vtmp[2].4s,$datx.4s,2

        ushr    @vtmp[0].4s,$dat.4s,32-10
        eor     @vtmp[3].16b,@vtmp[2].16b,$datx.16b
        sli     @vtmp[0].4s,$dat.4s,10
        ushr    @vtmp[2].4s,$datx.4s,32-10
        eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
        sli     @vtmp[2].4s,$datx.4s,10

        ushr    @vtmp[0].4s,$dat.4s,32-18
        eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
        sli     @vtmp[0].4s,$dat.4s,18
        ushr    @vtmp[2].4s,$datx.4s,32-18
        eor     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
        sli     @vtmp[2].4s,$datx.4s,18

        ushr    @vtmp[0].4s,$dat.4s,32-24
        eor     @vtmp[3].16b,@vtmp[2].16b,@vtmp[3].16b
        sli     @vtmp[0].4s,$dat.4s,24
        ushr    @vtmp[2].4s,$datx.4s,32-24
        eor     $dat.16b,@vtmp[0].16b,@vtmp[1].16b
        sli     @vtmp[2].4s,$datx.4s,24
        eor     $datx.16b,@vtmp[2].16b,@vtmp[3].16b
___
}

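# Scalar variant: the word travels through lane 0 of the vector registers
# for the tbl lookups, then the linear transform runs in general-purpose
# registers. ror #32-n on a 32-bit register is a left-rotate by n, so the
# four eor+ror lines compute B ^ (B<<<2) ^ (B<<<10) ^ (B<<<18) ^ (B<<<24).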
# sbox operation for a single word
sub sbox_1word () {
        my $word = shift;

$code.=<<___;
        movi    @vtmp[1].16b,#64
        movi    @vtmp[2].16b,#128
        movi    @vtmp[3].16b,#192
        mov     @vtmp[0].s[0],$word

        sub     @vtmp[1].16b,@vtmp[0].16b,@vtmp[1].16b
        sub     @vtmp[2].16b,@vtmp[0].16b,@vtmp[2].16b
        sub     @vtmp[3].16b,@vtmp[0].16b,@vtmp[3].16b

        tbl     @vtmp[0].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@vtmp[0].16b
        tbl     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@vtmp[1].16b
        tbl     @vtmp[2].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@vtmp[2].16b
        tbl     @vtmp[3].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@vtmp[3].16b

        mov     $word,@vtmp[0].s[0]
        mov     $wtmp0,@vtmp[1].s[0]
        mov     $wtmp2,@vtmp[2].s[0]
        add     $wtmp0,$word,$wtmp0
        mov     $word,@vtmp[3].s[0]
        add     $wtmp0,$wtmp0,$wtmp2
        add     $wtmp0,$wtmp0,$word

        eor     $word,$wtmp0,$wtmp0,ror #32-2
        eor     $word,$word,$wtmp0,ror #32-10
        eor     $word,$word,$wtmp0,ror #32-18
        eor     $word,$word,$wtmp0,ror #32-24
___
}

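# One SM4 round computes X[i+4] = X[i] ^ T(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]),
# where T is the S-box substitution followed by the linear transform L.
# Each helper below unrolls four rounds (one per word), so the callers'
# 8-iteration loops give the full 32 rounds.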
# sm4 for one block of data, in scalar registers word0/word1/word2/word3
sub sm4_1blk () {
        my $kptr = shift;

$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
        eor     $tmpw,$word2,$word3
        eor     $wtmp2,$wtmp0,$word1
        eor     $tmpw,$tmpw,$wtmp2
___
        &sbox_1word($tmpw);
$code.=<<___;
        eor     $word0,$word0,$tmpw
        // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
        eor     $tmpw,$word2,$word3
        eor     $wtmp2,$word0,$wtmp1
        eor     $tmpw,$tmpw,$wtmp2
___
        &sbox_1word($tmpw);
$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        eor     $word1,$word1,$tmpw
        // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
        eor     $tmpw,$word0,$word1
        eor     $wtmp2,$wtmp0,$word3
        eor     $tmpw,$tmpw,$wtmp2
___
        &sbox_1word($tmpw);
$code.=<<___;
        eor     $word2,$word2,$tmpw
        // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
        eor     $tmpw,$word0,$word1
        eor     $wtmp2,$word2,$wtmp1
        eor     $tmpw,$tmpw,$wtmp2
___
        &sbox_1word($tmpw);
$code.=<<___;
        eor     $word3,$word3,$tmpw
___
}

# sm4 for 4 lanes of data, in neon registers data0/data1/data2/data3
sub sm4_4blks () {
        my $kptr = shift;

$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        dup     $rk0.4s,$wtmp0
        dup     $rk1.4s,$wtmp1

        // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
        eor     $rka.16b,@data[2].16b,@data[3].16b
        eor     $rk0.16b,@data[1].16b,$rk0.16b
        eor     $rk0.16b,$rka.16b,$rk0.16b
___
        &sbox($rk0);
$code.=<<___;
        eor     @data[0].16b,@data[0].16b,$rk0.16b

        // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
        eor     $rka.16b,$rka.16b,@data[0].16b
        eor     $rk1.16b,$rka.16b,$rk1.16b
___
        &sbox($rk1);
$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        eor     @data[1].16b,@data[1].16b,$rk1.16b

        dup     $rk0.4s,$wtmp0
        dup     $rk1.4s,$wtmp1

        // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
        eor     $rka.16b,@data[0].16b,@data[1].16b
        eor     $rk0.16b,@data[3].16b,$rk0.16b
        eor     $rk0.16b,$rka.16b,$rk0.16b
___
        &sbox($rk0);
$code.=<<___;
        eor     @data[2].16b,@data[2].16b,$rk0.16b

        // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
        eor     $rka.16b,$rka.16b,@data[2].16b
        eor     $rk1.16b,$rka.16b,$rk1.16b
___
        &sbox($rk1);
$code.=<<___;
        eor     @data[3].16b,@data[3].16b,$rk1.16b
___
}

# sm4 for 8 lanes of data, in neon registers
# data0/data1/data2/data3 datax0/datax1/datax2/datax3
sub sm4_8blks () {
        my $kptr = shift;

$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
        dup     $rk0.4s,$wtmp0
        eor     $rka.16b,@data[2].16b,@data[3].16b
        eor     $rkb.16b,@datax[2].16b,@datax[3].16b
        eor     @vtmp[0].16b,@data[1].16b,$rk0.16b
        eor     @vtmp[1].16b,@datax[1].16b,$rk0.16b
        eor     $rk0.16b,$rka.16b,@vtmp[0].16b
        eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
___
        &sbox_double($rk0,$rk1);
$code.=<<___;
        eor     @data[0].16b,@data[0].16b,$rk0.16b
        eor     @datax[0].16b,@datax[0].16b,$rk1.16b

        // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
        dup     $rk1.4s,$wtmp1
        eor     $rka.16b,$rka.16b,@data[0].16b
        eor     $rkb.16b,$rkb.16b,@datax[0].16b
        eor     $rk0.16b,$rka.16b,$rk1.16b
        eor     $rk1.16b,$rkb.16b,$rk1.16b
___
        &sbox_double($rk0,$rk1);
$code.=<<___;
        ldp     $wtmp0,$wtmp1,[$kptr],8
        eor     @data[1].16b,@data[1].16b,$rk0.16b
        eor     @datax[1].16b,@datax[1].16b,$rk1.16b

        // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
        dup     $rk0.4s,$wtmp0
        eor     $rka.16b,@data[0].16b,@data[1].16b
        eor     $rkb.16b,@datax[0].16b,@datax[1].16b
        eor     @vtmp[0].16b,@data[3].16b,$rk0.16b
        eor     @vtmp[1].16b,@datax[3].16b,$rk0.16b
        eor     $rk0.16b,$rka.16b,@vtmp[0].16b
        eor     $rk1.16b,$rkb.16b,@vtmp[1].16b
___
        &sbox_double($rk0,$rk1);
$code.=<<___;
        eor     @data[2].16b,@data[2].16b,$rk0.16b
        eor     @datax[2].16b,@datax[2].16b,$rk1.16b

        // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
        dup     $rk1.4s,$wtmp1
        eor     $rka.16b,$rka.16b,@data[2].16b
        eor     $rkb.16b,$rkb.16b,@datax[2].16b
        eor     $rk0.16b,$rka.16b,$rk1.16b
        eor     $rk1.16b,$rkb.16b,$rk1.16b
___
        &sbox_double($rk0,$rk1);
$code.=<<___;
        eor     @data[3].16b,@data[3].16b,$rk0.16b
        eor     @datax[3].16b,@datax[3].16b,$rk1.16b
___
}

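# Encrypt one block held in a vector register, leaving the byte order
# untouched ("norev"). After 32 rounds SM4's final reverse transform R
# emits the words in reverse order (X35,X34,X33,X32), hence the swapped
# s[0]..s[3] moves at the end.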
sub encrypt_1blk_norev() {
        my $dat = shift;

$code.=<<___;
        mov     $ptr,$rks
        mov     $counter,#8
        mov     $word0,$dat.s[0]
        mov     $word1,$dat.s[1]
        mov     $word2,$dat.s[2]
        mov     $word3,$dat.s[3]
10:
___
        &sm4_1blk($ptr);
$code.=<<___;
        subs    $counter,$counter,#1
        b.ne    10b
        mov     $dat.s[0],$word3
        mov     $dat.s[1],$word2
        mov     $dat.s[2],$word1
        mov     $dat.s[3],$word0
___
}

sub encrypt_1blk() {
        my $dat = shift;

        &encrypt_1blk_norev($dat);
        &rev32($dat,$dat);
}

sub encrypt_4blks() {
$code.=<<___;
        mov     $ptr,$rks
        mov     $counter,#8
10:
___
        &sm4_4blks($ptr);
$code.=<<___;
        subs    $counter,$counter,#1
        b.ne    10b
___
        &rev32(@vtmp[3],@data[0]);
        &rev32(@vtmp[2],@data[1]);
        &rev32(@vtmp[1],@data[2]);
        &rev32(@vtmp[0],@data[3]);
}

sub encrypt_8blks() {
$code.=<<___;
        mov     $ptr,$rks
        mov     $counter,#8
10:
___
        &sm4_8blks($ptr);
$code.=<<___;
        subs    $counter,$counter,#1
        b.ne    10b
___
        &rev32(@vtmp[3],@data[0]);
        &rev32(@vtmp[2],@data[1]);
        &rev32(@vtmp[1],@data[2]);
        &rev32(@vtmp[0],@data[3]);
        &rev32(@data[3],@datax[0]);
        &rev32(@data[2],@datax[1]);
        &rev32(@data[1],@datax[2]);
        &rev32(@data[0],@datax[3]);
}

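# Load the 256-byte S-box from .Lsbox into v16-v31 once per entry point;
# all subsequent tbl/tbx lookups index into these registers.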
sub load_sbox () {
        my $data = shift;

$code.=<<___;
        adr     $ptr,.Lsbox
        ld1     {@sbox[0].4s,@sbox[1].4s,@sbox[2].4s,@sbox[3].4s},[$ptr],#64
        ld1     {@sbox[4].4s,@sbox[5].4s,@sbox[6].4s,@sbox[7].4s},[$ptr],#64
        ld1     {@sbox[8].4s,@sbox[9].4s,@sbox[10].4s,@sbox[11].4s},[$ptr],#64
        ld1     {@sbox[12].4s,@sbox[13].4s,@sbox[14].4s,@sbox[15].4s},[$ptr]
___
}

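# Constant pool: .Lsbox is the SM4 S-box, .Lck the 32 CK key-schedule
# constants, .Lfk the FK system parameter, and .Lshuffles a tbl mask that
# rotates the key vector by one 32-bit word between schedule rounds.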
$code=<<___;
#include "arm_arch.h"
.arch   armv8-a
.text

.type   _vpsm4_consts,%object
.align  7
_vpsm4_consts:
.Lsbox:
        .byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
        .byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
        .byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
        .byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
        .byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
        .byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
        .byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
        .byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
        .byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
        .byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
        .byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
        .byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
        .byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
        .byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
        .byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
        .byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
        .long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
        .long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
        .long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
        .long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
        .long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
        .long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
        .long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
        .long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
        .dword 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
        .dword 0x0B0A090807060504,0x030201000F0E0D0C

.size   _vpsm4_consts,.-_vpsm4_consts
___

{{{
my ($key,$keys,$enc)=("x0","x1","w2");
my ($pointer,$schedules,$wtmp,$roundkey)=("x5","x6","w7","w8");
my ($vkey,$vfk,$vmap)=("v5","v6","v7");
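# Key schedule: K[0..3] = MK ^ FK, then for i in 0..31
#   rk[i] = K[i+4] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
# where T' uses the lighter linear transform L'(B) = B ^ (B<<<13) ^ (B<<<23);
# ror #19 and ror #9 below are left-rotates by 13 and 23. For decryption
# ($enc==0) the same keys are stored in reverse order, starting at +124.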
$code.=<<___;
.type   _vpsm4_set_key,%function
.align  4
_vpsm4_set_key:
        AARCH64_VALID_CALL_TARGET
        ld1     {$vkey.4s},[$key]
___
        &load_sbox();
        &rev32($vkey,$vkey);
$code.=<<___;
        adr     $pointer,.Lshuffles
        ld1     {$vmap.4s},[$pointer]
        adr     $pointer,.Lfk
        ld1     {$vfk.4s},[$pointer]
        eor     $vkey.16b,$vkey.16b,$vfk.16b
        mov     $schedules,#32
        adr     $pointer,.Lck
        movi    @vtmp[0].16b,#64
        cbnz    $enc,1f
        add     $keys,$keys,124
1:
        mov     $wtmp,$vkey.s[1]
        ldr     $roundkey,[$pointer],#4
        eor     $roundkey,$roundkey,$wtmp
        mov     $wtmp,$vkey.s[2]
        eor     $roundkey,$roundkey,$wtmp
        mov     $wtmp,$vkey.s[3]
        eor     $roundkey,$roundkey,$wtmp
        // sbox lookup
        mov     @data[0].s[0],$roundkey
        tbl     @vtmp[1].16b,{@sbox[0].16b,@sbox[1].16b,@sbox[2].16b,@sbox[3].16b},@data[0].16b
        sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
        tbx     @vtmp[1].16b,{@sbox[4].16b,@sbox[5].16b,@sbox[6].16b,@sbox[7].16b},@data[0].16b
        sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
        tbx     @vtmp[1].16b,{@sbox[8].16b,@sbox[9].16b,@sbox[10].16b,@sbox[11].16b},@data[0].16b
        sub     @data[0].16b,@data[0].16b,@vtmp[0].16b
        tbx     @vtmp[1].16b,{@sbox[12].16b,@sbox[13].16b,@sbox[14].16b,@sbox[15].16b},@data[0].16b
        mov     $wtmp,@vtmp[1].s[0]
        eor     $roundkey,$wtmp,$wtmp,ror #19
        eor     $roundkey,$roundkey,$wtmp,ror #9
        mov     $wtmp,$vkey.s[0]
        eor     $roundkey,$roundkey,$wtmp
        mov     $vkey.s[0],$roundkey
        cbz     $enc,2f
        str     $roundkey,[$keys],#4
        b       3f
2:
        str     $roundkey,[$keys],#-4
3:
        tbl     $vkey.16b,{$vkey.16b},$vmap.16b
        subs    $schedules,$schedules,#1
        b.ne    1b
        ret
.size   _vpsm4_set_key,.-_vpsm4_set_key
___
}}}


{{{
$code.=<<___;
.type   _vpsm4_enc_4blks,%function
.align  4
_vpsm4_enc_4blks:
        AARCH64_VALID_CALL_TARGET
___
        &encrypt_4blks();
$code.=<<___;
        ret
.size   _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
___
}}}

{{{
$code.=<<___;
.type   _vpsm4_enc_8blks,%function
.align  4
_vpsm4_enc_8blks:
        AARCH64_VALID_CALL_TARGET
___
        &encrypt_8blks();
$code.=<<___;
        ret
.size   _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
___
}}}


{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl  ${prefix}_set_encrypt_key
.type   ${prefix}_set_encrypt_key,%function
.align  5
${prefix}_set_encrypt_key:
        AARCH64_SIGN_LINK_REGISTER
        stp     x29,x30,[sp,#-16]!
        mov     w2,1
        bl      _vpsm4_set_key
        ldp     x29,x30,[sp],#16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
___
}}}

{{{
my ($key,$keys)=("x0","x1");
$code.=<<___;
.globl  ${prefix}_set_decrypt_key
.type   ${prefix}_set_decrypt_key,%function
.align  5
${prefix}_set_decrypt_key:
        AARCH64_SIGN_LINK_REGISTER
        stp     x29,x30,[sp,#-16]!
        mov     w2,0
        bl      _vpsm4_set_key
        ldp     x29,x30,[sp],#16
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}

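# gen_block() emits the single-block entry points. Encryption and
# decryption share the same body: the direction is fixed entirely by the
# order of the round keys produced at key-schedule time.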
{{{
sub gen_block () {
        my $dir = shift;
        my ($inp,$outp,$rk)=map("x$_",(0..2));

$code.=<<___;
.globl  ${prefix}_${dir}crypt
.type   ${prefix}_${dir}crypt,%function
.align  5
${prefix}_${dir}crypt:
        AARCH64_VALID_CALL_TARGET
        ld1     {@data[0].16b},[$inp]
___
        &load_sbox();
        &rev32(@data[0],@data[0]);
$code.=<<___;
        mov     $rks,x2
___
        &encrypt_1blk(@data[0]);
$code.=<<___;
        st1     {@data[0].16b},[$outp]
        ret
.size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}

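# ECB: blocks are independent, so eight (or four) are processed per
# iteration. ld4 de-interleaves the input so that word i of each block
# lands in vector i, one block per lane, ready for the SIMD round helpers.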
{{{
my ($enc) = ("w4");
my @dat=map("v$_",(16..23));

$code.=<<___;
.globl  ${prefix}_ecb_encrypt
.type   ${prefix}_ecb_encrypt,%function
.align  5
${prefix}_ecb_encrypt:
        AARCH64_SIGN_LINK_REGISTER
        // convert length into blocks
        lsr     x2,x2,4
        stp     d8,d9,[sp,#-80]!
        stp     d10,d11,[sp,#16]
        stp     d12,d13,[sp,#32]
        stp     d14,d15,[sp,#48]
        stp     x29,x30,[sp,#64]
___
        &load_sbox();
$code.=<<___;
.Lecb_8_blocks_process:
        cmp     $blocks,#8
        b.lt    .Lecb_4_blocks_process
        ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
        ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
        &rev32(@datax[0],@datax[0]);
        &rev32(@datax[1],@datax[1]);
        &rev32(@datax[2],@datax[2]);
        &rev32(@datax[3],@datax[3]);
$code.=<<___;
        bl      _vpsm4_enc_8blks
        st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
        subs    $blocks,$blocks,#8
        b.gt    .Lecb_8_blocks_process
        b       100f
.Lecb_4_blocks_process:
        cmp     $blocks,#4
        b.lt    1f
        ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        sub     $blocks,$blocks,#4
1:
        // process last block
        cmp     $blocks,#1
        b.lt    100f
        b.gt    1f
        ld1     {@data[0].16b},[$inp]
___
        &rev32(@data[0],@data[0]);
        &encrypt_1blk(@data[0]);
$code.=<<___;
        st1     {@data[0].16b},[$outp]
        b       100f
1:      // process last 2 blocks
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp],#16
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$inp],#16
        cmp     $blocks,#2
        b.gt    1f
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
        st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp]
        b       100f
1:      // process last 3 blocks
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$inp],#16
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        st4     {@vtmp[0].s-@vtmp[3].s}[0],[$outp],#16
        st4     {@vtmp[0].s-@vtmp[3].s}[1],[$outp],#16
        st4     {@vtmp[0].s-@vtmp[3].s}[2],[$outp]
100:
        ldp     d10,d11,[sp,#16]
        ldp     d12,d13,[sp,#32]
        ldp     d14,d15,[sp,#48]
        ldp     x29,x30,[sp,#64]
        ldp     d8,d9,[sp],#80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
___
}}}

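# CBC: encryption is inherently serial (each block is XORed with the
# previous ciphertext before encryption), so even the 4-block loop
# encrypts one block at a time. Decryption has every ciphertext in hand
# up front and therefore parallelizes across 8 or 4 lanes.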
{{{
my ($len,$ivp,$enc)=("x2","x4","w5");
my $ivec0=("v3");
my $ivec1=("v15");

$code.=<<___;
.globl  ${prefix}_cbc_encrypt
.type   ${prefix}_cbc_encrypt,%function
.align  5
${prefix}_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        lsr     $len,$len,4
___
        &load_sbox();
$code.=<<___;
        cbz     $enc,.Ldec
        ld1     {$ivec0.4s},[$ivp]
.Lcbc_4_blocks_enc:
        cmp     $blocks,#4
        b.lt    1f
        ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
        eor     @data[0].16b,@data[0].16b,$ivec0.16b
___
        &rev32(@data[1],@data[1]);
        &rev32(@data[0],@data[0]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
        &encrypt_1blk_norev(@data[0]);
$code.=<<___;
        eor     @data[1].16b,@data[1].16b,@data[0].16b
___
        &encrypt_1blk_norev(@data[1]);
        &rev32(@data[0],@data[0]);

$code.=<<___;
        eor     @data[2].16b,@data[2].16b,@data[1].16b
___
        &encrypt_1blk_norev(@data[2]);
        &rev32(@data[1],@data[1]);
$code.=<<___;
        eor     @data[3].16b,@data[3].16b,@data[2].16b
___
        &encrypt_1blk_norev(@data[3]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        orr     $ivec0.16b,@data[3].16b,@data[3].16b
        st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
        subs    $blocks,$blocks,#4
        b.ne    .Lcbc_4_blocks_enc
        b       2f
1:
        subs    $blocks,$blocks,#1
        b.lt    2f
        ld1     {@data[0].4s},[$inp],#16
        eor     $ivec0.16b,$ivec0.16b,@data[0].16b
___
        &rev32($ivec0,$ivec0);
        &encrypt_1blk($ivec0);
$code.=<<___;
        st1     {$ivec0.16b},[$outp],#16
        b       1b
2:
        // save back IV
        st1     {$ivec0.16b},[$ivp]
        ret

.Ldec:
        // decryption mode starts
        AARCH64_SIGN_LINK_REGISTER
        stp     d8,d9,[sp,#-80]!
        stp     d10,d11,[sp,#16]
        stp     d12,d13,[sp,#32]
        stp     d14,d15,[sp,#48]
        stp     x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
        cmp     $blocks,#8
        b.lt    1f
        ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
        add     $ptr,$inp,#64
        ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$ptr]
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
        &rev32(@datax[0],@datax[0]);
        &rev32(@datax[1],@datax[1]);
        &rev32(@datax[2],@datax[2]);
        &rev32(@datax[3],@datax[3]);
$code.=<<___;
        bl      _vpsm4_enc_8blks
___
        &transpose(@vtmp,@datax);
        &transpose(@data,@datax);
$code.=<<___;
        ld1     {$ivec1.16b},[$ivp]
        ld1     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
        // note ivec1 and vtmpx[3] are reusing the same register
        // care needs to be taken to avoid conflict
        eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
        ld1     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
        eor     @vtmp[1].16b,@vtmp[1].16b,@datax[0].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@datax[1].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@datax[2].16b
        // save back IV
        st1     {@vtmpx[3].16b}, [$ivp]
        eor     @data[0].16b,@data[0].16b,@datax[3].16b
        eor     @data[1].16b,@data[1].16b,@vtmpx[0].16b
        eor     @data[2].16b,@data[2].16b,@vtmpx[1].16b
        eor     @data[3].16b,@data[3].16b,@vtmpx[2].16b
        st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        st1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
        subs    $blocks,$blocks,#8
        b.gt    .Lcbc_8_blocks_dec
        b.eq    100f
1:
        ld1     {$ivec1.16b},[$ivp]
.Lcbc_4_blocks_dec:
        cmp     $blocks,#4
        b.lt    1f
        ld4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp]
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        ld1     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$inp],#64
___
        &transpose(@vtmp,@datax);
$code.=<<___;
        eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
        orr     $ivec1.16b,@data[3].16b,@data[3].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@data[2].16b
        st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        subs    $blocks,$blocks,#4
        b.gt    .Lcbc_4_blocks_dec
        // save back IV
        st1     {@vtmp[3].16b}, [$ivp]
        b       100f
1:      // last block
        subs    $blocks,$blocks,#1
        b.lt    100f
        b.gt    1f
        ld1     {@data[0].4s},[$inp],#16
        // save back IV
        st1     {@data[0].16b}, [$ivp]
___
        &rev32(@datax[0],@data[0]);
        &encrypt_1blk(@datax[0]);
$code.=<<___;
        eor     @datax[0].16b,@datax[0].16b,$ivec1.16b
        st1     {@datax[0].16b},[$outp],#16
        b       100f
1:      // last two blocks
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[0],[$inp]
        add     $ptr,$inp,#16
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[1],[$ptr],#16
        subs    $blocks,$blocks,1
        b.gt    1f
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        ld1     {@data[0].4s,@data[1].4s},[$inp],#32
___
        &transpose(@vtmp,@datax);
$code.=<<___;
        eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
        st1     {@vtmp[0].4s,@vtmp[1].4s},[$outp],#32
        // save back IV
        st1     {@data[1].16b}, [$ivp]
        b       100f
1:      // last 3 blocks
        ld4     {@data[0].s,@data[1].s,@data[2].s,@data[3].s}[2],[$ptr]
___
        &rev32(@data[0],@data[0]);
        &rev32(@data[1],@data[1]);
        &rev32(@data[2],@data[2]);
        &rev32(@data[3],@data[3]);
$code.=<<___;
        bl      _vpsm4_enc_4blks
        ld1     {@data[0].4s,@data[1].4s,@data[2].4s},[$inp],#48
___
        &transpose(@vtmp,@datax);
$code.=<<___;
        eor     @vtmp[0].16b,@vtmp[0].16b,$ivec1.16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@data[0].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@data[1].16b
        st1     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s},[$outp],#48
        // save back IV
        st1     {@data[2].16b}, [$ivp]
100:
        ldp     d10,d11,[sp,#16]
        ldp     d12,d13,[sp,#32]
        ldp     d14,d15,[sp,#48]
        ldp     x29,x30,[sp,#64]
        ldp     d8,d9,[sp],#80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}

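# CTR32: the counter occupies the last 32-bit word of the IV and is
# incremented per block (wrapping is left to the 32-bit register, per the
# ctr32 convention; the upper IV words are never carried into). The three
# invariant IV words are broadcast with dup and per-lane counter values
# inserted with mov, so 4 or 8 keystream blocks are generated at once.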
{{{
my ($ivp)=("x4");
my ($ctr)=("w5");
my $ivec=("v3");

$code.=<<___;
.globl  ${prefix}_ctr32_encrypt_blocks
.type   ${prefix}_ctr32_encrypt_blocks,%function
.align  5
${prefix}_ctr32_encrypt_blocks:
        AARCH64_VALID_CALL_TARGET
        ld1     {$ivec.4s},[$ivp]
___
        &rev32($ivec,$ivec);
        &load_sbox();
$code.=<<___;
        cmp     $blocks,#1
        b.ne    1f
        // fast processing for a single block without
        // context saving overhead
___
        &encrypt_1blk($ivec);
$code.=<<___;
        ld1     {@data[0].16b},[$inp]
        eor     @data[0].16b,@data[0].16b,$ivec.16b
        st1     {@data[0].16b},[$outp]
        ret
1:
        AARCH64_SIGN_LINK_REGISTER
        stp     d8,d9,[sp,#-80]!
        stp     d10,d11,[sp,#16]
        stp     d12,d13,[sp,#32]
        stp     d14,d15,[sp,#48]
        stp     x29,x30,[sp,#64]
        mov     $word0,$ivec.s[0]
        mov     $word1,$ivec.s[1]
        mov     $word2,$ivec.s[2]
        mov     $ctr,$ivec.s[3]
.Lctr32_4_blocks_process:
        cmp     $blocks,#4
        b.lt    1f
        dup     @data[0].4s,$word0
        dup     @data[1].4s,$word1
        dup     @data[2].4s,$word2
        mov     @data[3].s[0],$ctr
        add     $ctr,$ctr,#1
        mov     @data[3].s[1],$ctr
        add     $ctr,$ctr,#1
        mov     @data[3].s[2],$ctr
        add     $ctr,$ctr,#1
        mov     @data[3].s[3],$ctr
        add     $ctr,$ctr,#1
        cmp     $blocks,#8
        b.ge    .Lctr32_8_blocks_process
        bl      _vpsm4_enc_4blks
        ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
        eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
        st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        subs    $blocks,$blocks,#4
        b.ne    .Lctr32_4_blocks_process
        b       100f
.Lctr32_8_blocks_process:
        dup     @datax[0].4s,$word0
        dup     @datax[1].4s,$word1
        dup     @datax[2].4s,$word2
        mov     @datax[3].s[0],$ctr
        add     $ctr,$ctr,#1
        mov     @datax[3].s[1],$ctr
        add     $ctr,$ctr,#1
        mov     @datax[3].s[2],$ctr
        add     $ctr,$ctr,#1
        mov     @datax[3].s[3],$ctr
        add     $ctr,$ctr,#1
        bl      _vpsm4_enc_8blks
        ld4     {@vtmpx[0].4s,@vtmpx[1].4s,@vtmpx[2].4s,@vtmpx[3].4s},[$inp],#64
        ld4     {@datax[0].4s,@datax[1].4s,@datax[2].4s,@datax[3].4s},[$inp],#64
        eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
        eor     @data[0].16b,@data[0].16b,@datax[0].16b
        eor     @data[1].16b,@data[1].16b,@datax[1].16b
        eor     @data[2].16b,@data[2].16b,@datax[2].16b
        eor     @data[3].16b,@data[3].16b,@datax[3].16b
        st4     {@vtmp[0].4s,@vtmp[1].4s,@vtmp[2].4s,@vtmp[3].4s},[$outp],#64
        st4     {@data[0].4s,@data[1].4s,@data[2].4s,@data[3].4s},[$outp],#64
        subs    $blocks,$blocks,#8
        b.ne    .Lctr32_4_blocks_process
        b       100f
1:      // last block processing
        subs    $blocks,$blocks,#1
        b.lt    100f
        b.gt    1f
        mov     $ivec.s[0],$word0
        mov     $ivec.s[1],$word1
        mov     $ivec.s[2],$word2
        mov     $ivec.s[3],$ctr
___
        &encrypt_1blk($ivec);
$code.=<<___;
        ld1     {@data[0].16b},[$inp]
        eor     @data[0].16b,@data[0].16b,$ivec.16b
        st1     {@data[0].16b},[$outp]
        b       100f
1:      // last 2 blocks processing
        dup     @data[0].4s,$word0
        dup     @data[1].4s,$word1
        dup     @data[2].4s,$word2
        mov     @data[3].s[0],$ctr
        add     $ctr,$ctr,#1
        mov     @data[3].s[1],$ctr
        subs    $blocks,$blocks,#1
        b.ne    1f
        bl      _vpsm4_enc_4blks
        ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
        ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
        eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
        st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
        st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
        b       100f
1:      // last 3 blocks processing
        add     $ctr,$ctr,#1
        mov     @data[3].s[2],$ctr
        bl      _vpsm4_enc_4blks
        ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[0],[$inp],#16
        ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[1],[$inp],#16
        ld4     {@vtmpx[0].s,@vtmpx[1].s,@vtmpx[2].s,@vtmpx[3].s}[2],[$inp],#16
        eor     @vtmp[0].16b,@vtmp[0].16b,@vtmpx[0].16b
        eor     @vtmp[1].16b,@vtmp[1].16b,@vtmpx[1].16b
        eor     @vtmp[2].16b,@vtmp[2].16b,@vtmpx[2].16b
        eor     @vtmp[3].16b,@vtmp[3].16b,@vtmpx[3].16b
        st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[0],[$outp],#16
        st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[1],[$outp],#16
        st4     {@vtmp[0].s,@vtmp[1].s,@vtmp[2].s,@vtmp[3].s}[2],[$outp],#16
100:
        ldp     d10,d11,[sp,#16]
        ldp     d12,d13,[sp,#32]
        ldp     d14,d15,[sp,#48]
        ldp     x29,x30,[sp,#64]
        ldp     d8,d9,[sp],#80
        AARCH64_VALIDATE_LINK_REGISTER
        ret
.size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
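# Reproduce this file's leading comment block (license header) in the
# generated assembly, converting '#' comments to '//'.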
########################################
open SELF,$0;
while(<SELF>) {
        next if (/^#!/);
        last if (!s/^#/\/\// and !/^$/);
        print;
}
close SELF;

foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/ge;
        print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!";