aesv8-armx.pl: add CTR implementation.
crypto/aes/asm/aesv8-armx.pl
#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for ARMv8 AES instructions. The
# module is endian-agnostic in the sense that it supports both big-
# and little-endian cases, and it supports both 32- and 64-bit modes
# of operation. The latter is achieved by limiting the number of
# utilized registers to 16, which implies additional instructions.
# This has no effect on the mighty Apple A7, where results are
# literally equal to the theoretical estimates based on instruction
# latencies and issue rate. It remains to be seen how it affects
# other platforms...
#
# Performance in cycles per byte processed with 128-bit key:
#
#		CBC enc		CBC dec
# Apple A7	2.39		1.20
# Cortex-A5x	n/a		n/a

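# The generated functions are meant to serve as drop-in implementations
# of the usual OpenSSL AES entry points. A C sketch of the assumed
# prototypes follows (the AES_KEY layout -- round keys followed by a
# "rounds" word at byte offset 240 -- is inferred from the code below
# and is an assumption, not a normative definition):
#
#	int  AES_set_encrypt_key(const unsigned char *userKey,
#				 const int bits, AES_KEY *key);
#	int  AES_set_decrypt_key(const unsigned char *userKey,
#				 const int bits, AES_KEY *key);
#	void AES_encrypt(const unsigned char *in, unsigned char *out,
#			 const AES_KEY *key);
#	void AES_decrypt(const unsigned char *in, unsigned char *out,
#			 const AES_KEY *key);
#	void AES_cbc_encrypt(const unsigned char *in, unsigned char *out,
#			     size_t length, const AES_KEY *key,
#			     unsigned char *ivec, const int enc);
#	void AES_ctr32_encrypt_blocks(const unsigned char *in,
#			     unsigned char *out, size_t blocks,
#			     const AES_KEY *key, const unsigned char ivec[16]);
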
$flavour = shift;
$prefix="AES";

$code=".text\n";
$code.=".arch	armv8-a+crypto\n"	if ($flavour =~ /64/);
$code.=".fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax:
# NEON uses mostly 32-bit mnemonics, integer code mostly 64-bit ones.
# The goal is to maintain both 32- and 64-bit code within a single
# module and transliterate the common code to either flavour with
# regex voodoo.
#
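# For example (an illustrative line traced by hand through the
# substitution rules at the bottom of this file, not taken verbatim
# from the code below), the common-code line
#
#	vld1.32	{q8},[x7],#16
#
# is transliterated to "vld1.32 {q8},[r7]!" for the 32-bit flavour and
# to "ld1 {v16.4s},[x7],#16" for the 64-bit one (q8-q15 are remapped
# to v16-v23, presumably to keep clear of the callee-saved v8-v15).
#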
{{{
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


$code.=<<___;
.align	5
rcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___;
	adr	$ptr,rcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
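//	One pass per round key: vtbl with the rotate-n-splat mask above
//	rotates and replicates the last word of the previous round key,
//	aese against an all-zero round key degenerates to SubWord (the
//	ShiftRows step is invisible on a splatted vector), and the
//	vext/veor chain implements the sliding XOR across the four words.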
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]

	eor	x0,x0,x0		// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
$code.=<<___;
	bl	.Lenc_key

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key],#16
	aes$mc	$inout,$inout
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	vld1.32	{$rndkey1},[$key],#16
	aes$mc	$inout,$inout
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	vld1.32	{$rndkey0},[$key]
	aes$mc	$inout,$inout
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
&gen_block("en");
&gen_block("de");
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);

### q8-q15	preloaded key schedule
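### q8-q9 hold rndkey[0-1] and are reloaded on the fly inside the
### round loops; q10-q15 and $rndlast are loaded once with the last
### seven round keys.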

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}		@ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

.Loop_cbc_enc:
	aese	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesmc	$dat,$dat
	subs	$cnt,$cnt,#2
	aese	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesmc	$dat,$dat
	b.gt	.Loop_cbc_enc

	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	 add	$key_,$key,#16
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15

	 mov	$cnt,$rounds
	veor	$ivec,$dat,$rndlast
	vst1.8	{$ivec},[$out],#16
	b.hs	.Loop_cbc_enc

	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_dec128:
	vld1.32	{$tmp0-$tmp1},[$key_]
	veor	$ivec,$ivec,$rndlast
	veor	$in0,$dat0,$rndlast
	mov	$step1,$step

.Loop2x_cbc_dec128:
	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 subs	$len,$len,#32
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 cclr	$step,lo
	aesd	$dat0,$tmp0
	aesd	$dat1,$tmp0
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 cclr	$step1,ls
	aesd	$dat0,$tmp1
	aesd	$dat1,$tmp1
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	aesd	$dat0,q15
	aesd	$dat1,q15

	veor	$ivec,$ivec,$dat0
	vld1.8	{$dat0},[$inp],$step
	veor	$in0,$in0,$dat1
	vld1.8	{$dat1},[$inp],$step1
	vst1.8	{$ivec},[$out],#16
	veor	$ivec,$in1,$rndlast
	vst1.8	{$in0},[$out],#16
	veor	$in0,$dat0,$rndlast
	vorr	$in1,$dat1,$dat1
	b.hs	.Loop2x_cbc_dec128

	adds	$len,$len,#32
	veor	$ivec,$ivec,$rndlast
	b.eq	.Lcbc_done
	veor	$in0,$in0,$rndlast
	b	.Lcbc_dec_tail

.align	5
.Lcbc_dec:
	subs	$len,$len,#16
	vorr	$in0,$dat,$dat
	b.lo	.Lcbc_dec_tail

	cclr	$step,eq
	cmp	$rounds,#2
	vld1.8	{$dat1},[$inp],$step
	vorr	$in1,$dat1,$dat1
	b.eq	.Lcbc_dec128

.Loop2x_cbc_dec:
	aesd	$dat0,q8
	aesd	$dat1,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesd	$dat1,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	b.gt	.Loop2x_cbc_dec

	aesd	$dat0,q8
	aesd	$dat1,q8
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 veor	$tmp0,$ivec,$rndlast
	 veor	$tmp1,$in0,$rndlast
	aesd	$dat0,q9
	aesd	$dat1,q9
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vorr	$ivec,$in1,$in1
	 subs	$len,$len,#32
	aesd	$dat0,q10
	aesd	$dat1,q10
	aesimc	$dat0,$dat0
	 cclr	$step,lo
	aesimc	$dat1,$dat1
	 mov	$key_,$key
	aesd	$dat0,q11
	aesd	$dat1,q11
	aesimc	$dat0,$dat0
	 vld1.8	{$in0},[$inp],$step
	aesimc	$dat1,$dat1
	 cclr	$step,ls
	aesd	$dat0,q12
	aesd	$dat1,q12
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.8	{$in1},[$inp],$step
	aesd	$dat0,q13
	aesd	$dat1,q13
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	aesd	$dat0,q14
	aesd	$dat1,q14
	aesimc	$dat0,$dat0
	aesimc	$dat1,$dat1
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	aesd	$dat0,q15
	aesd	$dat1,q15

	 mov	$cnt,$rounds
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$tmp1},[$out],#16
	b.hs	.Loop2x_cbc_dec

	adds	$len,$len,#32
	b.eq	.Lcbc_done

.Lcbc_dec_tail:
	aesd	$dat,q8
	vld1.32	{q8},[$key_],#16
	aesimc	$dat,$dat
	subs	$cnt,$cnt,#2
	aesd	$dat,q9
	vld1.32	{q9},[$key_],#16
	aesimc	$dat,$dat
	b.gt	.Lcbc_dec_tail

	aesd	$dat,q8
	aesimc	$dat,$dat
	aesd	$dat,q9
	aesimc	$dat,$dat
	 veor	$tmp,$ivec,$rndlast
	aesd	$dat,q10
	aesimc	$dat,$dat
	 vorr	$ivec,$in0,$in0
	aesd	$dat,q11
	aesimc	$dat,$dat
	aesd	$dat,q12
	aesimc	$dat,$dat
	aesd	$dat,q13
	aesimc	$dat,$dat
	aesd	$dat,q14
	aesimc	$dat,$dat
	aesd	$dat,q15

	veor	$tmp,$tmp,$dat
	vst1.8	{$tmp},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_,$ctr,$tctr,$tctr1)=("w5","w6","x7","w8","w9","w10");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule
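### The 32-bit counter lives in the last word of the IV: it is kept in
### $ctr as an integer, incremented there, and spliced back into lane 3
### of the counter blocks via $tctr/$tctr1, so only that word is
### recomputed per block.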

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}		@ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#6
	add		$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q10-q11},[$key_],#32
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]

	add		$key_,$key,#32
	mov		$cnt,$rounds

	subs		$len,$len,#2
	b.lo		.Lctr32_tail

#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$ctr, $ctr, #1
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $ctr
	cmp		$rounds,#2
	vmov.32		${dat1}[3],$tctr1
	b.eq		.Lctr32_128

.Loop2x_ctr32:
	aese		$dat0,q8
	aese		$dat1,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aese		$dat1,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	b.gt		.Loop2x_ctr32

	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$tmp0,$dat0
	 vorr		$dat0,$ivec,$ivec
	aesmc		$tmp1,$dat1
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aese		$tmp1,q9
	 vld1.8		{$in0},[$inp],#16
	aesmc		$tmp0,$tmp0
	 vld1.8		{$in1},[$inp],#16
	aesmc		$tmp1,$tmp1
	 add		$ctr,$ctr,#1
	aese		$tmp0,q10
	aese		$tmp1,q10
	 rev		$tctr,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 add		$ctr,$ctr,#1
	aese		$tmp0,q11
	aese		$tmp1,q11
	 veor		$in0,$in0,$rndlast
	 rev		$tctr1,$ctr
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 veor		$in1,$in1,$rndlast
	 mov		$key_,$key
	aese		$tmp0,q12
	aese		$tmp1,q12
	 subs		$len,$len,#2
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	aese		$tmp0,q13
	aese		$tmp1,q13
	aesmc		$tmp0,$tmp0
	aesmc		$tmp1,$tmp1
	 vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	aese		$tmp0,q14
	aese		$tmp1,q14
	 vmov.32	${dat0}[3], $tctr
	aesmc		$tmp0,$tmp0
	 vmov.32	${dat1}[3], $tctr1
	aesmc		$tmp1,$tmp1
	aese		$tmp0,q15
	aese		$tmp1,q15

	 mov		$cnt,$rounds
	veor		$in0,$in0,$tmp0
	veor		$in1,$in1,$tmp1
	vst1.8		{$in0},[$out],#16
	vst1.8		{$in1},[$out],#16
	b.hs		.Loop2x_ctr32

	adds		$len,$len,#2
	b.eq		.Lctr32_done
	b		.Lctr32_tail

.Lctr32_128:
	vld1.32		{$tmp0-$tmp1},[$key_]

.Loop2x_ctr32_128:
	aese		$dat0,q8
	aese		$dat1,q8
	aesmc		$dat0,$dat0
	 vld1.8		{$in0},[$inp],#16
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp],#16
	aese		$dat0,q9
	aese		$dat1,q9
	 add		$ctr,$ctr,#1
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 rev		$tctr,$ctr
	aese		$dat0,$tmp0
	aese		$dat1,$tmp0
	 add		$ctr,$ctr,#1
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 rev		$tctr1,$ctr
	aese		$dat0,$tmp1
	aese		$dat1,$tmp1
	 subs		$len,$len,#2
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q10
	aese		$dat1,q10
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q11
	aese		$dat1,q11
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q12
	aese		$dat1,q12
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q13
	aese		$dat1,q13
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	aese		$dat0,q14
	aese		$dat1,q14
	aesmc		$dat0,$dat0
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q15
	 veor		$in1,$in1,$rndlast
	aese		$dat1,q15

	veor		$in0,$in0,$dat0
	vorr		$dat0,$ivec,$ivec
	veor		$in1,$in1,$dat1
	vorr		$dat1,$ivec,$ivec
	vst1.8		{$in0},[$out],#16
	vmov.32		${dat0}[3], $tctr
	vst1.8		{$in1},[$out],#16
	vmov.32		${dat1}[3], $tctr1
	b.hs		.Loop2x_ctr32_128

	adds		$len,$len,#2
	b.eq		.Lctr32_done

.Lctr32_tail:
	aese		$dat,q8
	vld1.32		{q8},[$key_],#16
	aesmc		$dat,$dat
	subs		$cnt,$cnt,#2
	aese		$dat,q9
	vld1.32		{q9},[$key_],#16
	aesmc		$dat,$dat
	b.gt		.Lctr32_tail

	aese		$dat,q8
	aesmc		$dat,$dat
	aese		$dat,q9
	aesmc		$dat,$dat
	 vld1.8		{$in0},[$inp]
	aese		$dat,q10
	aesmc		$dat,$dat
	aese		$dat,q11
	aesmc		$dat,$dat
	aese		$dat,q12
	aesmc		$dat,$dat
	aese		$dat,q13
	aesmc		$dat,$dat
	aese		$dat,q14
	aesmc		$dat,$dat
	 veor		$in0,$in0,$rndlast
	aese		$dat,q15

	veor		$in0,$in0,$dat
	vst1.8		{$in0},[$out]

.Lctr32_done:
___
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
########################################
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    sub unaes {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".long\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    }
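    # Hedged example: were the (currently disabled) substitution below
    # applied to "aese v0.16b,v1.16b", it would emit
    #	.long	0x4e284820	//aese v0.16b,v1.16b
    # i.e. opcode | Rd | (Rn<<5); the assembler mnemonics are used
    # instead, courtesy of ".arch armv8-a+crypto" above.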

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    sub unaes {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".long\t0x%08x\t@ %s %s",
			$opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					  |(($2&7)<<1) |(($2&8)<<2),
			$mnemonic,$arg;
    }
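    # e.g. "aese q0,q1" is hand-encoded as
    #	.long	0xf3b00302	@ aese q0,q1
    # with the q-register numbers split across the Vd/D and Vm/M fields.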

    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\tvtbl.8	d%d,{q%d},d%d",2*$1,$2,2*$3,2*$1+1,$2,2*$3+1;
    }
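    # e.g. "vtbl.8 q3,{q0},q2" expands to the d-register pair form:
    #	vtbl.8	d6,{q0},d4
    #	vtbl.8	d7,{q0},d5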

    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }
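    # e.g. "vdup.32 q9,q3[3]" -> "vdup.32 q9,d7[1]" and
    # "vmov.32 q0[3],r9" -> "vmov.32 d1[1],r9": 32-bit lane n of qN
    # lives in lane n&1 of d(2*N+(n>>1)).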

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

close STDOUT;