#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases. It also supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional instructions.
# This has no effect on the mighty Apple A7, where results are
# literally equal to the theoretical estimates based on instruction
# latencies and issue rate. It remains to be seen how it affects
# other platforms...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec
# Apple A7	2.39		1.20
# Cortex-A5x	n/a		n/a

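# For reference, the entry points generated below are expected to be
# called from C roughly as sketched here. This is an illustration
# matching the register usage in the code (x0-x2 for set_*_key, x0-x4
# plus w5 for CBC, x0-x4 for CTR), with the round count stored as a
# 32-bit word at byte offset 240 of the key schedule, AES_KEY-style:
#
#   int  aes_v8_set_encrypt_key(const unsigned char *key, int bits,
#                               AES_KEY *schedule);
#   int  aes_v8_set_decrypt_key(const unsigned char *key, int bits,
#                               AES_KEY *schedule);
#   void aes_v8_encrypt(const unsigned char *in, unsigned char *out,
#                       const AES_KEY *schedule);
#   void aes_v8_decrypt(const unsigned char *in, unsigned char *out,
#                       const AES_KEY *schedule);
#   void aes_v8_cbc_encrypt(const unsigned char *in, unsigned char *out,
#                           size_t length, const AES_KEY *schedule,
#                           unsigned char ivec[16], int enc);
#   void aes_v8_ctr32_encrypt_blocks(const unsigned char *in,
#                                    unsigned char *out, size_t blocks,
#                                    const AES_KEY *schedule,
#                                    const unsigned char ivec[16]);
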
$flavour = shift;
open STDOUT,">".shift;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON is mostly in 32-bit mnemonics, integer instructions mostly in
# 64-bit ones. The goal is to maintain both 32- and 64-bit codes within
# a single module and transliterate common code to either flavour with
# regex voodoo.
#
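# For example, a line of shared code such as
#
#	vld1.32	{q8},[x7],#16
#
# is emitted as "ld1 {v16.4s},[x7],#16" in 64-bit mode (q8-q15 are
# remapped to v16-v23 so that the callee-saved v8-v15 stay untouched)
# and as "vld1.32 {q8},[r7]!" in 32-bit mode; see the post-processing
# loops at the end of this file.
#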
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

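// A note on the expansion scheme below: temp = SubWord(RotWord(w[i-1]))
// is computed by vtbl with the rotate-n-splat mask, which broadcasts
// the rotated last word to all four lanes, followed by aese against an
// all-zero round key; with all columns identical ShiftRows is a no-op,
// so the aese reduces to SubBytes. The vext/veor chains implement the
// running XOR with the preceding key words.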
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

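// 192-bit keys advance by 8 bytes per iteration, with the two spare
// key words kept in the low half of $in1; the splat indices were
// shifted down by 8 above so that vtbl still picks RotWord of the
// last key word.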
.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]

	eor	x0,x0,x0		// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

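// The decryption key schedule is the encryption schedule reversed,
// with InvMixColumns (aesimc) applied to every round key except the
// first and last, as the aesd/aesimc equivalent-inverse-cipher
// sequence expects.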
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
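# Single-block en-/decrypt. The loop consumes two rounds per iteration
# with the next round keys preloaded; the last round has no MixColumns
# step, so the final AddRoundKey is done with a plain veor against the
# last round key.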
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule

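# The first two round keys stay preloaded in q8/q9 and the last seven
# in q10-q15 and $rndlast; only 192/256-bit schedules loop over the
# middle keys. "cclr" is a virtual mnemonic, transliterated at the
# bottom of this file into "csel Xn,xzr,Xn,cond" (64-bit) or a
# conditional "mov Rn,#0" (32-bit); it zeroes the input step on the
# last block so that the speculative load of the next block never
# reads past the end of the input.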
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	 add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	 mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_dec128:
	vld1.32	{$tmp0-$tmp1},[$key_]
	veor	$ivec,$ivec,$rndlast
	veor	$in0,$dat0,$rndlast
	mov	$step1,$step

.Loop2x_cbc_dec128:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 subs	$len,$len,#32
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 cclr	$step,lo
	aesd	$dat0,$tmp0
	aesd	$dat1,$tmp0
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 cclr	$step1,ls
	aesd	$dat0,$tmp1
	aesd	$dat1,$tmp1
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q15
	aesd	$dat1,q15

	veor	$ivec,$ivec,$dat0
	vld1.8	{$dat0},[$inp],$step
	veor	$in0,$in0,$dat1
	vld1.8	{$dat1},[$inp],$step1
	vst1.8	{$ivec},[$out],#16
	veor	$ivec,$in1,$rndlast
	vst1.8	{$in0},[$out],#16
	veor	$in0,$dat0,$rndlast
	vorr	$in1,$dat1,$dat1
	b.hs	.Loop2x_cbc_dec128

	adds	$len,$len,#32
	veor	$ivec,$ivec,$rndlast
	b.eq	.Lcbc_done
	veor	$in0,$in0,$rndlast
	b	.Lcbc_dec_tail

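// CBC decryption is parallelizable, so the code below interleaves two
// blocks per iteration (with a dedicated .Lcbc_dec128 path for 128-bit
// keys) and keeps copies of the ciphertext blocks around to serve as
// the next chaining values.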
.align	5
.Lcbc_dec:
	subs	$len,$len,#16
	vorr	$in0,$dat,$dat
	b.lo	.Lcbc_dec_tail

	cclr	$step,eq
	cmp	$rounds,#2
	vld1.8	{$dat1},[$inp],$step
	vorr	$in1,$dat1,$dat1
	b.eq	.Lcbc_dec128

.Loop2x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	b.gt	.Loop2x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 veor	$tmp0,$ivec,$rndlast
	 veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vorr	$ivec,$in1,$in1
	 subs	$len,$len,#32
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	 cclr	$step,lo
	aesimc	$dat1,$dat1
	 mov	$key_,$key
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	 vld1.8	{$in0},[$inp],$step
	aesimc	$dat1,$dat1
	 cclr	$step,ls
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.8	{$in1},[$inp],$step
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aesd	$dat0,q15
	aesd	$dat1,q15

	 mov	$cnt,$rounds
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	b.hs	.Loop2x_cbc_dec

	adds	$len,$len,#32
	b.eq	.Lcbc_done

.Lcbc_dec_tail:
	aesd	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat,$dat
	subs	$cnt,$cnt,#2
	aesd	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat,$dat
	b.gt	.Lcbc_dec_tail

	aesd	$dat,q8
	aesimc	$dat,$dat
	aesd	$dat,q9
	aesimc	$dat,$dat
	 veor	$tmp,$ivec,$rndlast
	aesd	$dat,q10
	aesimc	$dat,$dat
	 vorr	$ivec,$in0,$in0
	aesd	$dat,q11
	aesimc	$dat,$dat
	aesd	$dat,q12
	aesimc	$dat,$dat
	aesd	$dat,q13
	aesimc	$dat,$dat
	aesd	$dat,q14
	aesimc	$dat,$dat
	aesd	$dat,q15

	veor	$tmp,$tmp,$dat
	vst1.8	{$tmp},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

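# The 32-bit counter lives in the last word of the IV block. It is kept
# byte-reversed on little-endian so that plain adds produce the
# big-endian counter the caller expects, and each increment is rev'd
# back before being inserted into lane 3 of the state registers; the
# main loop produces two blocks per iteration.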
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}		@ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#6
	add		$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q10-q11},[$key_],#32
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]

	add		$key_,$key,#32
	mov		$cnt,$rounds

	subs		$len,$len,#2
	b.lo		.Lctr32_tail

#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$ctr, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $ctr
	cmp		$rounds,#2
	vmov.32		${dat1}[3],$tctr1
	b.eq		.Lctr32_128

.Loop2x_ctr32:
	aese		$dat0,q8
	aese		$dat1,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	b.gt		.Loop2x_ctr32

	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$tmp0,$dat0
	 vorr		$dat0,$ivec,$ivec
	aesmc		$tmp1,$dat1
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aese		$tmp1,q9
	 vld1.8		{$in0},[$inp],#16
	aesmc		$tmp0,$tmp0
	 vld1.8		{$in1},[$inp],#16
	aesmc		$tmp1,$tmp1
	 add		$ctr,$ctr,#1
	aese		$tmp0,q10
	aese		$tmp1,q10
	 rev		$tctr,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 add		$ctr,$ctr,#1
	aese		$tmp0,q11
	aese		$tmp1,q11
	 veor		$in0,$in0,$rndlast
	 rev		$tctr1,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 veor		$in1,$in1,$rndlast
	 mov		$key_,$key
	aese		$tmp0,q12
	aese		$tmp1,q12
	 subs		$len,$len,#2
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aese		$tmp0,q13
	aese		$tmp1,q13
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aese		$tmp0,q14
	aese		$tmp1,q14
	 vmov.32	${dat0}[3], $tctr
	aesmc		$tmp0,$tmp0
	 vmov.32	${dat1}[3], $tctr1
	aesmc		$tmp1,$tmp1
	aese		$tmp0,q15
	aese		$tmp1,q15

	 mov		$cnt,$rounds
	veor		$in0,$in0,$tmp0
	veor		$in1,$in1,$tmp1
	vst1.8		{$in0},[$out],#16
	vst1.8		{$in1},[$out],#16
	b.hs		.Loop2x_ctr32

	adds		$len,$len,#2
	b.eq		.Lctr32_done
	b		.Lctr32_tail

.Lctr32_128:
	vld1.32		{$tmp0-$tmp1},[$key_]

.Loop2x_ctr32_128:
	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat0,q9
	aese		$dat1,q9
	 add		$ctr,$ctr,#1
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 rev		$tctr,$ctr
	aese		$dat0,$tmp0
	aese		$dat1,$tmp0
	 add		$ctr,$ctr,#1
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 rev		$tctr1,$ctr
	aese		$dat0,$tmp1
	aese		$dat1,$tmp1
	 subs		$len,$len,#2
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q10
	aese		$dat1,q10
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q11
	aese		$dat1,q11
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q12
	aese		$dat1,q12
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q13
	aese		$dat1,q13
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q14
	aese		$dat1,q14
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat1,q15

	veor		$in0,$in0,$dat0
	vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	vorr		$dat1,$ivec,$ivec
	vst1.8		{$in0},[$out],#16
	vmov.32		${dat0}[3], $tctr
	vst1.8		{$in1},[$out],#16
	vmov.32		${dat1}[3], $tctr1
	b.hs		.Loop2x_ctr32_128

	adds		$len,$len,#2
	b.eq		.Lctr32_done

.Lctr32_tail:
	aese		$dat,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat,$dat
	subs		$cnt,$cnt,#2
	aese		$dat,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat,$dat
	b.gt		.Lctr32_tail

	aese		$dat,q8
	aesmc		$dat,$dat
	aese		$dat,q9
	aesmc		$dat,$dat
	 vld1.8		{$in0},[$inp]
	aese		$dat,q10
	aesmc		$dat,$dat
	aese		$dat,q11
	aesmc		$dat,$dat
	aese		$dat,q12
	aesmc		$dat,$dat
	aese		$dat,q13
	aesmc		$dat,$dat
	aese		$dat,q14
	aesmc		$dat,$dat
	 veor		$in0,$in0,$rndlast
	aese		$dat,q15

	veor		$in0,$in0,$dat
	vst1.8		{$in0},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    sub unaes {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".long\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    }

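    # A sketch of what the substitutions below produce, for a few
    # representative lines of shared code:
    #
    #	cclr	x8,eq			->	csel	x8,xzr,x8,eq
    #	vld1.32	{q8},[x7],#16		->	ld1	{v16.4s},[x7],#16
    #	veor	q0,q0,q1		->	eor	v0.16b,v0.16b,v1.16b
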
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    sub unaes {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					  |(($2&7)<<1) |(($2&8)<<2),
			$mnemonic,$arg;
    }

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf "vtbl.8	d%d,{q%d},d%d\n\tvtbl.8	d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf "vdup.32	q%d,d%d[%d]",$1,2*$2+$3>>1,$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf "vmov.32	d%d[%d],%s",2*$1+$2>>1,$2&1,$3;
    }

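    # A sketch of the 32-bit transliteration: AES mnemonics are emitted
    # as raw .long opcodes by unaes() (assemblers may not know them in
    # 32-bit mode yet), and q-register vtbl/vdup/vmov lane forms are
    # split into their d-register halves, e.g.:
    #
    #	cclr	r8,eq		->	moveq	r8,#0
    #	vtbl.8	q3,{q0},q2	->	vtbl.8	d6,{q0},d4
    #					vtbl.8	d7,{q0},d5
    #	vdup.32	q3,q0[3]	->	vdup.32	q3,d1[1]
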
    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;