aesv8-armx.pl: increase interleave factor.
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it likewise supports both 32- and
# 64-bit modes of operation. The latter is achieved by limiting the
# number of utilized registers to 16, which implies additional NEON
# load and integer instructions. This has no effect on the mighty
# Apple A7, where results are literally equal to the theoretical
# estimates based on AES instruction latencies and issue rates. On
# Cortex-A53, an in-order execution core, this costs up to 10-15%,
# which is partially compensated by implementing a dedicated code
# path for the 128-bit CBC encrypt case. On Cortex-A57, performance
# of the parallelizable modes seems to be limited by the sheer
# amount of NEON instructions...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec		CTR
# Apple A7	2.39		1.20		1.20
# Cortex-A53	2.45		1.87		1.94
# Cortex-A57	3.64		1.34		1.32

$flavour = shift;
open STDOUT,">".shift;

$prefix="aes_v8";

$code=<<___;
#include "arm_arch.h"

#if __ARM_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON mnemonics are mostly 32-bit, integer ones mostly 64-bit. The
# goal is to maintain both 32- and 64-bit codes within a single
# module and transliterate common code to either flavour with regex
# voodoo.
#
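# As a rough illustration of what the post-processors at the bottom
# of this file do, a common-code line such as
#
#	vld1.32	{q8},[x7],#16
#
# is rewritten for the 64-bit flavour (q8 maps to v16, the .32
# suffix becomes a .4s arrangement, the v prefix is stripped) as
#
#	ld1	{v16.4s},[x7],#16
#
# while for the 32-bit flavour only the register names and the
# post-increment syntax change:
#
#	vld1.32	{q8},[r7]!
#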
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
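// rcon holds the initial round constant (0x01, splatted across all
// lanes), the rotate-n-splat table used by vtbl to rotate the last
// key word and broadcast it, and 0x1b, which the 128-bit schedule
// reloads once doubling 0x80 with vshl.u8 would overflow the byte.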

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

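// The schedule core: vtbl rotates and splats the last word of the
// previous round key, and "aese $key,$zero" then performs SubBytes
// on it -- with an all-zero round key the AddRoundKey step is a
// no-op, and because every column holds the same word the ShiftRows
// permutation has no visible effect either, leaving pure SubWord.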
.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

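// 192-bit keys: the 8-byte tail of the key lives in $in1, so the
// rotate-n-splat mask is shifted down by 8 to pick the last word
// from there; each iteration emits one and a half round keys.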
.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

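// 256-bit keys alternate two derivations: even steps use the usual
// RotWord+SubWord+rcon, while odd steps (the vdup.32 below) apply
// SubWord to the splatted last word without rotation or rcon, as
// the AES-256 schedule prescribes.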
.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]

	eor	x0,x0,x0		// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

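// The decrypt key schedule is the encrypt schedule reversed, with
// AESIMC (InvMixColumns) applied to every round key except the
// first and last, as the equivalent inverse cipher requires.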
.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
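# gen_block emits the single-block ${prefix}_encrypt/${prefix}_decrypt
# routines. The round loop consumes two round keys per iteration, and
# the final two rounds are peeled off because the last round has no
# MixColumns and ends with a plain XOR of the last round key.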
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule

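# q10-q15 and $rndlast are loaded with the last seven round keys up
# front; the encrypt path keeps them pinned and reloads only q8/q9
# inside its round loop, while the decrypt path below runs its round
# loop two keys further and reuses q10/q11 as data registers instead.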
$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq
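	// cclr (csel against xzr in 64-bit mode, a conditional mov #0
	// in 32-bit mode) zeroes the input step when exactly one block
	// remains, so the look-ahead loads below never read past the
	// end of the input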

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	 add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	 mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
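# CBC decryption is parallelizable, so three blocks are kept in
# flight per iteration to hide the aesd/aesimc latency; q10, q11 and
# q9 are recycled as the extra data registers.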
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
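	// the #32 bias makes $len go negative before the data is
	// exhausted; the tail code below recovers how many blocks
	// remain (one or two) by comparing against the bias with cmn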
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesd	$dat2,q8
	 veor	$tmp0,$ivec,$rndlast
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesd	$dat2,q9
	 veor	$tmp2,$in1,$rndlast
	 subs	$len,$len,#0x30
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vorr	$ivec,$in2,$in2
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesd	$dat2,q12
	 add	$inp,$inp,x6		// $inp is adjusted in such a way
					// that at exit from the loop
					// $dat1-$dat2 are loaded with the
					// last "words"
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 mov	$key_,$key
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesd	$dat2,q13
	 vld1.8	{$in0},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesd	$dat2,q14
	 vld1.8	{$in2},[$inp],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15

	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

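// the tail runs the remaining one or two blocks through both lanes;
// in the single-block case the extra $dat1 result is simply
// discarded at .Lcbc_dec_one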
.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesd	$dat2,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesd	$dat2,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesd	$dat2,q8
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesd	$dat2,q9
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesd	$dat2,q12
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesd	$dat2,q13
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesd	$dat2,q14
	aesimc	$dat1,$dat1
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

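# CTR mode is likewise processed three blocks at a time. Only the
# low 32 bits of the counter are incremented (as the ctr32 name
# implies); the three counter values are materialized in the top
# lane of $dat0-$dat2 each iteration.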
$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}		@ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
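	// the counter is big-endian in memory; it is kept byte-reversed
	// in $ctr on little-endian hosts so plain adds work, and each
	// incremented value is rev'ed back before being inserted into
	// the vector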
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	aese		$dat2,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aesmc		$dat2,$dat2
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aese		$dat1,q8
	aese		$dat2,q8
	 mov		$key_,$key
	aesmc		$tmp0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aesmc		$tmp1,$dat1
	aesmc		$dat2,$dat2
	 vorr		$dat0,$ivec,$ivec
	aese		$tmp0,q9
	 vld1.8		{$in1},[$inp],#16
	aese		$tmp1,q9
	aese		$dat2,q9
	 vorr		$dat1,$ivec,$ivec
	aesmc		$tmp0,$tmp0
	 vld1.8		{$in2},[$inp],#16
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aese		$tmp1,q12
	aese		$tmp2,q12
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aese		$tmp1,q13
	aese		$tmp2,q13
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aesmc		$tmp0,$tmp0
	 vld1.32	 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aese		$tmp1,q14
	aese		$tmp2,q14
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	 mov		$cnt,$rounds
	veor		$in0,$in0,$tmp0
	veor		$in1,$in1,$tmp1
	veor		$in2,$in2,$tmp2
	 vld1.32	 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in0},[$out],#16
	vst1.8		{$in1},[$out],#16
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aese		$dat1,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aese		$dat1,q9
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aese		$dat1,q12
	 vld1.8		{$in1},[$inp]
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q13
	aese		$dat1,q13
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q14
	aese		$dat1,q14
	 veor		$in0,$in0,$rndlast
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

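    # unaes() can encode the crypto instructions as raw .inst words
    # for toolchains without +crypto support; its call site in the
    # loop below is currently commented out because the generated
    # 64-bit code requests ".arch armv8-a+crypto" instead.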
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # ARMv7 instructions are always encoded little-endian, so
	    # the word can be emitted byte by byte; the correct solution
	    # is the .inst directive, but older assemblers don't
	    # implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

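    # Each AArch32 q register is the concatenation of two d registers,
    # so the helpers below split whole-q table lookups and lane moves
    # into their d-register equivalents: vtbl produces only 64 bits per
    # lookup, and vdup/vmov lane indices fold from a [0-3] word index
    # on a q register to a [0-1] index on the appropriate d half.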
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;