06754090b1128e9d2a0fe700c4a9edfeecb4f681
[openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
12 # little-endian cases. As does it support both 32- and 64-bit modes
13 # of operation. Latter is achieved by limiting amount of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
22 #
23 # Performance in cycles per byte processed with 128-bit key:
24 #
25 #               CBC enc         CBC dec         CTR
26 # Apple A7      2.39            1.20            1.20
27 # Cortex-A53    2.45            1.87            1.94
28 # Cortex-A57    3.64            1.34            1.32
29
# Command line: flavour ("linux64", "ios32", ...) selects 32- vs 64-bit
# output; output is the destination assembly file.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl transliteration helper next to this script or
# in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything printed to STDOUT through the transliterator.  Check the
# open: previously a missing or non-executable helper failed silently and
# produced an empty output file.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

$prefix="aes_v8";
# Common header of the generated file; everything is guarded so the file
# compiles to nothing on pre-ARMv7 targets.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
# 64-bit: request the crypto extension explicitly.  32-bit: stay on the
# armv7-a baseline and emit AES instructions as raw .byte sequences later.
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
53
54 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
55 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
56 # maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
58 #
{{{
# Key-schedule setup: emits ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.  Encrypt-key expansion returns 0 on success,
# -1 for NULL input/output pointers and -2 for an unsupported key size;
# decrypt setup calls .Lenc_key and then reverses the schedule in place
# with AESIMC.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# Scratch vector registers: q0-q6 on 64-bit; on 32-bit q4-q7 (d8-d15) are
# avoided because the AAPCS makes them callee-saved.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Round-constant table and the rotate-n-splat byte-permutation mask used
# by the expansion loops below.
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Emit a single-block routine, ${prefix}_encrypt or ${prefix}_decrypt:
# args are (in, out, key); the round count is read from key[240].  The
# en/de direction only changes the instruction mnemonics (aese/aesmc vs
# aesd/aesimc).
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt(in, out, len, key, ivp, enc).  On 32-bit the 5th/6th
# args arrive on the stack (loaded via ip below).  Encryption runs one block
# at a time (with a dedicated 128-bit key path); decryption interleaves
# three blocks per iteration with a 1-2 block tail.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
# NOTE(review): $step1 is declared but not referenced in this section.
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	 add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	 mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# Decrypt path only: third lane of state/input plus one more temp.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	 veor	$tmp0,$ivec,$rndlast
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	 veor	$tmp2,$in1,$rndlast
	 subs	$len,$len,#0x30
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vorr	$ivec,$in2,$in2
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesd	$dat2,q12
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 mov	$key_,$key
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesd	$dat2,q13
	 vld1.8	{$in0},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesd	$dat2,q14
	 vld1.8	{$in2},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15

	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesd	$dat2,q8
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesd	$dat2,q9
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesd	$dat2,q12
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesd	$dat2,q13
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesd	$dat2,q14
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks(in, out, blocks, key, ivp).  The 32-bit
# big-endian counter lives in ivp[12]; three blocks are processed per
# iteration, with a 1-2 block tail path.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	aese		$dat2,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	 mov		$key_,$key
	aesmc		$tmp0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aesmc		$tmp1,$dat1
	aesmc		$dat2,$dat2
	 vorr		$dat0,$ivec,$ivec
	aese		$tmp0,q9
	 vld1.8		{$in1},[$inp],#16
	aese		$tmp1,q9
	aese		$dat2,q9
	 vorr		$dat1,$ivec,$ivec
	aesmc		$tmp0,$tmp0
	 vld1.8		{$in2},[$inp],#16
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aese		$tmp1,q12
	aese		$tmp2,q12
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aese		$tmp1,q13
	aese		$tmp2,q13
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aesmc		$tmp0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aese		$tmp1,q14
	aese		$tmp2,q14
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	 mov		$cnt,$rounds
	veor		$in0,$in0,$tmp0
	veor		$in1,$in1,$tmp1
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in0},[$out],#16
	vst1.8		{$in1},[$out],#16
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aese		$dat1,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aese		$dat1,q9
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aese		$dat1,q12
	 vld1.8		{$in1},[$inp]
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q13
	aese		$dat1,q13
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q14
	aese		$dat1,q14
	 veor		$in0,$in0,$rndlast
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: $code above is written in a mixed 32/64-bit dialect;
# the branches below transliterate it line by line to pure 64-bit or pure
# 32-bit assembly before printing.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an AES instruction as a raw .inst word (fallback for
    # assemblers without crypto-extension support; currently unused).
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Encode an AES instruction as raw .byte data,
    # since ARMv7 instructions are always encoded little-endian.
    # correct solution is to use .inst directive, but older
    # assemblers don't implement it:-(
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # q-register vtbl -> pair of d-register vtbl instructions.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # q-lane vdup.32 -> d-register lane form.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # q-lane vmov.32 -> d-register lane form.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}
969
# STDOUT is the pipe to arm-xlate.pl; buffered-write and child-process
# failures only surface at close, so the result must be checked.
close STDOUT or die "error closing STDOUT: $!";