GH601: Various spelling fixes.
[openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements support for ARMv8 AES instructions. The
11 # module is endian-agnostic in sense that it supports both big- and
12 # little-endian cases. As does it support both 32- and 64-bit modes
13 # of operation. Latter is achieved by limiting amount of utilized
14 # registers to 16, which implies additional NEON load and integer
15 # instructions. This has no effect on mighty Apple A7, where results
16 # are literally equal to the theoretical estimates based on AES
17 # instruction latencies and issue rates. On Cortex-A53, an in-order
18 # execution core, this costs up to 10-15%, which is partially
19 # compensated by implementing dedicated code path for 128-bit
20 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
21 # seems to be limited by sheer amount of NEON instructions...
22 #
23 # Performance in cycles per byte processed with 128-bit key:
24 #
25 #               CBC enc         CBC dec         CTR
26 # Apple A7      2.39            1.20            1.20
27 # Cortex-A53    1.32            1.29            1.46
28 # Cortex-A57(*) 1.95            0.85            0.93
29 # Denver        1.96            0.86            0.80
30 #
31 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
32 #       and are still same even for updated module;
33
# Command-line arguments: the assembly "flavour" (e.g. linux64, linux32,
# ios64) and the output file name; both are forwarded to arm-xlate.pl.
$flavour = shift;
$output  = shift;

# Locate the arm-xlate.pl transliteration helper: first alongside this
# script, then in the shared perlasm directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through arm-xlate.pl; aliasing STDOUT to the
# pipe means all "print" calls below end up in $output after translation.
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

# Symbol prefix for every exported function (aes_v8_set_encrypt_key etc.).
$prefix="aes_v8";
# Common file header: guarded by __ARM_MAX_ARCH__ so the module compiles
# to nothing on pre-ARMv7 targets.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch   armv8-a+crypto\n"                       if ($flavour =~ /64/);
$code.=".arch   armv7-a\n.fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
                #^^^^^^ this is done to simplify adoption by not depending
                #       on latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key expands a 128/192/256-bit
# user key into round keys; ${prefix}_set_decrypt_key reuses it and then
# InvMixColumns-transforms/reverses the schedule for decryption.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# In 32-bit mode q4-q7 are callee-saved, so the register set is remapped
# to q0-q3,q8-q10 to avoid having to preserve them.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
        $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# Constants: round-constant words, a tbl mask implementing RotWord-and-splat,
# and the 0x1b rcon tail used once the 128-bit schedule overflows.
$code.=<<___;
.align  5
.Lrcon:
.long   0x01,0x01,0x01,0x01
.long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
.long   0x1b,0x1b,0x1b,0x1b

.globl  ${prefix}_set_encrypt_key
.type   ${prefix}_set_encrypt_key,%function
.align  5
${prefix}_set_encrypt_key:
.Lenc_key:
___
$code.=<<___    if ($flavour =~ /64/);
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
# Argument validation: NULL pointers return -1, unsupported bit lengths
# return -2 (matching the C AES_set_encrypt_key convention).
$code.=<<___;
        mov     $ptr,#-1
        cmp     $inp,#0
        b.eq    .Lenc_key_abort
        cmp     $out,#0
        b.eq    .Lenc_key_abort
        mov     $ptr,#-2
        cmp     $bits,#128
        b.lt    .Lenc_key_abort
        cmp     $bits,#256
        b.gt    .Lenc_key_abort
        tst     $bits,#0x3f
        b.ne    .Lenc_key_abort

        adr     $ptr,.Lrcon
        cmp     $bits,#192

        veor    $zero,$zero,$zero
        vld1.8  {$in0},[$inp],#16
        mov     $bits,#8                // reuse $bits
        vld1.32 {$rcon,$mask},[$ptr],#32

        b.lt    .Loop128
        b.eq    .L192
        b       .L256

.align  4
.Loop128:
        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key
        b.ne    .Loop128

        vld1.32 {$rcon},[$ptr]

        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key

        vtbl.8  $key,{$in0},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in0},[$out],#16
        aese    $key,$zero

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        veor    $in0,$in0,$key
        vst1.32 {$in0},[$out]
        add     $out,$out,#0x50

        mov     $rounds,#10
        b       .Ldone

.align  4
.L192:
        vld1.8  {$in1},[$inp],#8
        vmov.i8 $key,#8                 // borrow $key
        vst1.32 {$in0},[$out],#16
        vsub.i8 $mask,$mask,$key        // adjust the mask

.Loop192:
        vtbl.8  $key,{$in1},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in1},[$out],#8
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp

        vdup.32 $tmp,${in0}[3]
        veor    $tmp,$tmp,$in1
         veor   $key,$key,$rcon
        vext.8  $in1,$zero,$in1,#12
        vshl.u8 $rcon,$rcon,#1
        veor    $in1,$in1,$tmp
        veor    $in0,$in0,$key
        veor    $in1,$in1,$key
        vst1.32 {$in0},[$out],#16
        b.ne    .Loop192

        mov     $rounds,#12
        add     $out,$out,#0x20
        b       .Ldone

.align  4
.L256:
        vld1.8  {$in1},[$inp]
        mov     $bits,#7
        mov     $rounds,#14
        vst1.32 {$in0},[$out],#16

.Loop256:
        vtbl.8  $key,{$in1},$mask
        vext.8  $tmp,$zero,$in0,#12
        vst1.32 {$in1},[$out],#16
        aese    $key,$zero
        subs    $bits,$bits,#1

        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in0,$in0,$tmp
        vext.8  $tmp,$zero,$tmp,#12
         veor   $key,$key,$rcon
        veor    $in0,$in0,$tmp
        vshl.u8 $rcon,$rcon,#1
        veor    $in0,$in0,$key
        vst1.32 {$in0},[$out],#16
        b.eq    .Ldone

        vdup.32 $key,${in0}[3]          // just splat
        vext.8  $tmp,$zero,$in1,#12
        aese    $key,$zero

        veor    $in1,$in1,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in1,$in1,$tmp
        vext.8  $tmp,$zero,$tmp,#12
        veor    $in1,$in1,$tmp

        veor    $in1,$in1,$key
        b       .Loop256

.Ldone:
        str     $rounds,[$out]
        mov     $ptr,#0

.Lenc_key_abort:
        mov     x0,$ptr                 // return value
        `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
        ret
.size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl  ${prefix}_set_decrypt_key
.type   ${prefix}_set_decrypt_key,%function
.align  5
${prefix}_set_decrypt_key:
___
$code.=<<___    if ($flavour =~ /64/);
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
$code.=<<___    if ($flavour !~ /64/);
        stmdb   sp!,{r4,lr}
___
# Decrypt schedule: call .Lenc_key, then walk the schedule from both ends,
# swapping round keys and applying AESIMC to the interior ones.
$code.=<<___;
        bl      .Lenc_key

        cmp     x0,#0
        b.ne    .Ldec_key_abort

        sub     $out,$out,#240          // restore original $out
        mov     x4,#-16
        add     $inp,$out,x12,lsl#4     // end of key schedule

        vld1.32 {v0.16b},[$out]
        vld1.32 {v1.16b},[$inp]
        vst1.32 {v0.16b},[$inp],x4
        vst1.32 {v1.16b},[$out],#16

.Loop_imc:
        vld1.32 {v0.16b},[$out]
        vld1.32 {v1.16b},[$inp]
        aesimc  v0.16b,v0.16b
        aesimc  v1.16b,v1.16b
        vst1.32 {v0.16b},[$inp],x4
        vst1.32 {v1.16b},[$out],#16
        cmp     $inp,$out
        b.hi    .Loop_imc

        vld1.32 {v0.16b},[$out]
        aesimc  v0.16b,v0.16b
        vst1.32 {v0.16b},[$inp]

        eor     x0,x0,x0                // return value
.Ldec_key_abort:
___
$code.=<<___    if ($flavour !~ /64/);
        ldmia   sp!,{r4,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldp     x29,x30,[sp],#16
        ret
___
$code.=<<___;
.size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# Single-block primitives: emits ${prefix}_encrypt and ${prefix}_decrypt,
# one 16-byte block per call, parameterized by direction ($e/$mc select
# aese/aesmc vs aesd/aesimc).
sub gen_block () {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

$code.=<<___;
.globl  ${prefix}_${dir}crypt
.type   ${prefix}_${dir}crypt,%function
.align  5
${prefix}_${dir}crypt:
        ldr     $rounds,[$key,#240]
        vld1.32 {$rndkey0},[$key],#16
        vld1.8  {$inout},[$inp]
        sub     $rounds,$rounds,#2
        vld1.32 {$rndkey1},[$key],#16

.Loop_${dir}c:
        aes$e   $inout,$rndkey0
        aes$mc  $inout,$inout
        vld1.32 {$rndkey0},[$key],#16
        subs    $rounds,$rounds,#2
        aes$e   $inout,$rndkey1
        aes$mc  $inout,$inout
        vld1.32 {$rndkey1},[$key],#16
        b.gt    .Loop_${dir}c

        aes$e   $inout,$rndkey0
        aes$mc  $inout,$inout
        vld1.32 {$rndkey0},[$key]
        aes$e   $inout,$rndkey1
        veor    $inout,$inout,$rndkey0

        vst1.8  {$inout},[$out]
        ret
.size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
# &-call deliberately bypasses the empty () prototype above.
&gen_block("en");
&gen_block("de");
}}}
{{{
# CBC mode: ${prefix}_cbc_encrypt(in, out, len, key, ivec, enc).
# Encryption is inherently serial (one block at a time, with a dedicated
# fast path for 128-bit keys); decryption is parallelized 3 blocks wide.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15      preloaded key schedule

$code.=<<___;
.globl  ${prefix}_cbc_encrypt
.type   ${prefix}_cbc_encrypt,%function
.align  5
${prefix}_cbc_encrypt:
___
$code.=<<___    if ($flavour =~ /64/);
        stp     x29,x30,[sp,#-16]!
        add     x29,sp,#0
___
$code.=<<___    if ($flavour !~ /64/);
        mov     ip,sp
        stmdb   sp!,{r4-r8,lr}
        vstmdb  sp!,{d8-d15}            @ ABI specification says so
        ldmia   ip,{r4-r5}              @ load remaining args
___
# Shared prologue: load IV, first data block, and the whole key schedule
# into q8-q15 (last 7 round keys) plus $rndlast, then dispatch on $enc.
$code.=<<___;
        subs    $len,$len,#16
        mov     $step,#16
        b.lo    .Lcbc_abort
        cclr    $step,eq

        cmp     $enc,#0                 // en- or decrypting?
        ldr     $rounds,[$key,#240]
        and     $len,$len,#-16
        vld1.8  {$ivec},[$ivp]
        vld1.8  {$dat},[$inp],$step

        vld1.32 {q8-q9},[$key]          // load key schedule...
        sub     $rounds,$rounds,#6
        add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
        sub     $rounds,$rounds,#2
        vld1.32 {q10-q11},[$key_],#32
        vld1.32 {q12-q13},[$key_],#32
        vld1.32 {q14-q15},[$key_],#32
        vld1.32 {$rndlast},[$key_]

        add     $key_,$key,#32
        mov     $cnt,$rounds
        b.eq    .Lcbc_dec

        cmp     $rounds,#2
        veor    $dat,$dat,$ivec
        veor    $rndzero_n_last,q8,$rndlast
        b.eq    .Lcbc_enc128

        vld1.32 {$in0-$in1},[$key_]
        add     $key_,$key,#16
        add     $key4,$key,#16*4
        add     $key5,$key,#16*5
        aese    $dat,q8
        aesmc   $dat,$dat
        add     $key6,$key,#16*6
        add     $key7,$key,#16*7
        b       .Lenter_cbc_enc

.align  4
.Loop_cbc_enc:
        aese    $dat,q8
        aesmc   $dat,$dat
         vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc:
        aese    $dat,q9
        aesmc   $dat,$dat
        aese    $dat,$in0
        aesmc   $dat,$dat
        vld1.32 {q8},[$key4]
        cmp     $rounds,#4
        aese    $dat,$in1
        aesmc   $dat,$dat
        vld1.32 {q9},[$key5]
        b.eq    .Lcbc_enc192

        aese    $dat,q8
        aesmc   $dat,$dat
        vld1.32 {q8},[$key6]
        aese    $dat,q9
        aesmc   $dat,$dat
        vld1.32 {q9},[$key7]
        nop

.Lcbc_enc192:
        aese    $dat,q8
        aesmc   $dat,$dat
         subs   $len,$len,#16
        aese    $dat,q9
        aesmc   $dat,$dat
         cclr   $step,eq
        aese    $dat,q10
        aesmc   $dat,$dat
        aese    $dat,q11
        aesmc   $dat,$dat
         vld1.8 {q8},[$inp],$step
        aese    $dat,q12
        aesmc   $dat,$dat
         veor   q8,q8,$rndzero_n_last
        aese    $dat,q13
        aesmc   $dat,$dat
         vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
        aese    $dat,q14
        aesmc   $dat,$dat
        aese    $dat,q15
        veor    $ivec,$dat,$rndlast
        b.hs    .Loop_cbc_enc

        vst1.8  {$ivec},[$out],#16
        b       .Lcbc_done

.align  5
.Lcbc_enc128:
        vld1.32 {$in0-$in1},[$key_]
        aese    $dat,q8
        aesmc   $dat,$dat
        b       .Lenter_cbc_enc128
.Loop_cbc_enc128:
        aese    $dat,q8
        aesmc   $dat,$dat
         vst1.8 {$ivec},[$out],#16
.Lenter_cbc_enc128:
        aese    $dat,q9
        aesmc   $dat,$dat
         subs   $len,$len,#16
        aese    $dat,$in0
        aesmc   $dat,$dat
         cclr   $step,eq
        aese    $dat,$in1
        aesmc   $dat,$dat
        aese    $dat,q10
        aesmc   $dat,$dat
        aese    $dat,q11
        aesmc   $dat,$dat
         vld1.8 {q8},[$inp],$step
        aese    $dat,q12
        aesmc   $dat,$dat
        aese    $dat,q13
        aesmc   $dat,$dat
        aese    $dat,q14
        aesmc   $dat,$dat
         veor   q8,q8,$rndzero_n_last
        aese    $dat,q15
        veor    $ivec,$dat,$rndlast
        b.hs    .Loop_cbc_enc128

        vst1.8  {$ivec},[$out],#16
        b       .Lcbc_done
___
{
# Decrypt path: three blocks are decrypted in flight per iteration,
# with a 1-2 block tail handled separately.
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align  5
.Lcbc_dec:
        vld1.8  {$dat2},[$inp],#16
        subs    $len,$len,#32           // bias
        add     $cnt,$rounds,#2
        vorr    $in1,$dat,$dat
        vorr    $dat1,$dat,$dat
        vorr    $in2,$dat2,$dat2
        b.lo    .Lcbc_dec_tail

        vorr    $dat1,$dat2,$dat2
        vld1.8  {$dat2},[$inp],#16
        vorr    $in0,$dat,$dat
        vorr    $in1,$dat1,$dat1
        vorr    $in2,$dat2,$dat2

.Loop3x_cbc_dec:
        aesd    $dat0,q8
        aesimc  $dat0,$dat0
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        vld1.32 {q8},[$key_],#16
        subs    $cnt,$cnt,#2
        aesd    $dat0,q9
        aesimc  $dat0,$dat0
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        vld1.32 {q9},[$key_],#16
        b.gt    .Loop3x_cbc_dec

        aesd    $dat0,q8
        aesimc  $dat0,$dat0
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
         veor   $tmp0,$ivec,$rndlast
         subs   $len,$len,#0x30
         veor   $tmp1,$in0,$rndlast
         mov.lo x6,$len                 // x6, $cnt, is zero at this point
        aesd    $dat0,q9
        aesimc  $dat0,$dat0
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
         veor   $tmp2,$in1,$rndlast
         add    $inp,$inp,x6            // $inp is adjusted in such way that
                                        // at exit from the loop $dat1-$dat2
                                        // are loaded with last "words"
         vorr   $ivec,$in2,$in2
         mov    $key_,$key
        aesd    $dat0,q12
        aesimc  $dat0,$dat0
        aesd    $dat1,q12
        aesimc  $dat1,$dat1
        aesd    $dat2,q12
        aesimc  $dat2,$dat2
         vld1.8 {$in0},[$inp],#16
        aesd    $dat0,q13
        aesimc  $dat0,$dat0
        aesd    $dat1,q13
        aesimc  $dat1,$dat1
        aesd    $dat2,q13
        aesimc  $dat2,$dat2
         vld1.8 {$in1},[$inp],#16
        aesd    $dat0,q14
        aesimc  $dat0,$dat0
        aesd    $dat1,q14
        aesimc  $dat1,$dat1
        aesd    $dat2,q14
        aesimc  $dat2,$dat2
         vld1.8 {$in2},[$inp],#16
        aesd    $dat0,q15
        aesd    $dat1,q15
        aesd    $dat2,q15
         vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
         add    $cnt,$rounds,#2
        veor    $tmp0,$tmp0,$dat0
        veor    $tmp1,$tmp1,$dat1
        veor    $dat2,$dat2,$tmp2
         vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
        vst1.8  {$tmp0},[$out],#16
         vorr   $dat0,$in0,$in0
        vst1.8  {$tmp1},[$out],#16
         vorr   $dat1,$in1,$in1
        vst1.8  {$dat2},[$out],#16
         vorr   $dat2,$in2,$in2
        b.hs    .Loop3x_cbc_dec

        cmn     $len,#0x30
        b.eq    .Lcbc_done
        nop

.Lcbc_dec_tail:
        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        vld1.32 {q8},[$key_],#16
        subs    $cnt,$cnt,#2
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        vld1.32 {q9},[$key_],#16
        b.gt    .Lcbc_dec_tail

        aesd    $dat1,q8
        aesimc  $dat1,$dat1
        aesd    $dat2,q8
        aesimc  $dat2,$dat2
        aesd    $dat1,q9
        aesimc  $dat1,$dat1
        aesd    $dat2,q9
        aesimc  $dat2,$dat2
        aesd    $dat1,q12
        aesimc  $dat1,$dat1
        aesd    $dat2,q12
        aesimc  $dat2,$dat2
         cmn    $len,#0x20
        aesd    $dat1,q13
        aesimc  $dat1,$dat1
        aesd    $dat2,q13
        aesimc  $dat2,$dat2
         veor   $tmp1,$ivec,$rndlast
        aesd    $dat1,q14
        aesimc  $dat1,$dat1
        aesd    $dat2,q14
        aesimc  $dat2,$dat2
         veor   $tmp2,$in1,$rndlast
        aesd    $dat1,q15
        aesd    $dat2,q15
        b.eq    .Lcbc_dec_one
        veor    $tmp1,$tmp1,$dat1
        veor    $tmp2,$tmp2,$dat2
         vorr   $ivec,$in2,$in2
        vst1.8  {$tmp1},[$out],#16
        vst1.8  {$tmp2},[$out],#16
        b       .Lcbc_done

.Lcbc_dec_one:
        veor    $tmp1,$tmp1,$dat2
         vorr   $ivec,$in2,$in2
        vst1.8  {$tmp1},[$out],#16

.Lcbc_done:
        vst1.8  {$ivec},[$ivp]
.Lcbc_abort:
___
}
$code.=<<___    if ($flavour !~ /64/);
        vldmia  sp!,{d8-d15}
        ldmia   sp!,{r4-r8,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldr     x29,[sp],#16
        ret
___
$code.=<<___;
.size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# CTR mode: ${prefix}_ctr32_encrypt_blocks(in, out, blocks, key, ivec).
# Keystream blocks are generated three at a time; only the low 32 bits of
# the counter (big-endian word at ivec[12..15]) are incremented.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";         # aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15      preloaded key schedule

$code.=<<___;
.globl  ${prefix}_ctr32_encrypt_blocks
.type   ${prefix}_ctr32_encrypt_blocks,%function
.align  5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___    if ($flavour =~ /64/);
        stp             x29,x30,[sp,#-16]!
        add             x29,sp,#0
___
$code.=<<___    if ($flavour !~ /64/);
        mov             ip,sp
        stmdb           sp!,{r4-r10,lr}
        vstmdb          sp!,{d8-d15}            @ ABI specification says so
        ldr             r4, [ip]                @ load remaining arg
___
# Prologue: load counter block and key schedule, pre-compute counter+1 and
# counter+2 lanes, then fall into the 3-wide main loop (or the tail for
# fewer than 3 blocks).
$code.=<<___;
        ldr             $rounds,[$key,#240]

        ldr             $ctr, [$ivp, #12]
        vld1.32         {$dat0},[$ivp]

        vld1.32         {q8-q9},[$key]          // load key schedule...
        sub             $rounds,$rounds,#4
        mov             $step,#16
        cmp             $len,#2
        add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
        sub             $rounds,$rounds,#2
        vld1.32         {q12-q13},[$key_],#32
        vld1.32         {q14-q15},[$key_],#32
        vld1.32         {$rndlast},[$key_]
        add             $key_,$key,#32
        mov             $cnt,$rounds
        cclr            $step,lo
#ifndef __ARMEB__
        rev             $ctr, $ctr
#endif
        vorr            $dat1,$dat0,$dat0
        add             $tctr1, $ctr, #1
        vorr            $dat2,$dat0,$dat0
        add             $ctr, $ctr, #2
        vorr            $ivec,$dat0,$dat0
        rev             $tctr1, $tctr1
        vmov.32         ${dat1}[3],$tctr1
        b.ls            .Lctr32_tail
        rev             $tctr2, $ctr
        sub             $len,$len,#3            // bias
        vmov.32         ${dat2}[3],$tctr2
        b               .Loop3x_ctr32

.align  4
.Loop3x_ctr32:
        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        aese            $dat2,q8
        aesmc           $dat2,$dat2
        vld1.32         {q8},[$key_],#16
        subs            $cnt,$cnt,#2
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
        aese            $dat2,q9
        aesmc           $dat2,$dat2
        vld1.32         {q9},[$key_],#16
        b.gt            .Loop3x_ctr32

        aese            $dat0,q8
        aesmc           $tmp0,$dat0
        aese            $dat1,q8
        aesmc           $tmp1,$dat1
         vld1.8         {$in0},[$inp],#16
         vorr           $dat0,$ivec,$ivec
        aese            $dat2,q8
        aesmc           $dat2,$dat2
         vld1.8         {$in1},[$inp],#16
         vorr           $dat1,$ivec,$ivec
        aese            $tmp0,q9
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q9
        aesmc           $tmp1,$tmp1
         vld1.8         {$in2},[$inp],#16
         mov            $key_,$key
        aese            $dat2,q9
        aesmc           $tmp2,$dat2
         vorr           $dat2,$ivec,$ivec
         add            $tctr0,$ctr,#1
        aese            $tmp0,q12
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q12
        aesmc           $tmp1,$tmp1
         veor           $in0,$in0,$rndlast
         add            $tctr1,$ctr,#2
        aese            $tmp2,q12
        aesmc           $tmp2,$tmp2
         veor           $in1,$in1,$rndlast
         add            $ctr,$ctr,#3
        aese            $tmp0,q13
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q13
        aesmc           $tmp1,$tmp1
         veor           $in2,$in2,$rndlast
         rev            $tctr0,$tctr0
        aese            $tmp2,q13
        aesmc           $tmp2,$tmp2
         vmov.32        ${dat0}[3], $tctr0
         rev            $tctr1,$tctr1
        aese            $tmp0,q14
        aesmc           $tmp0,$tmp0
        aese            $tmp1,q14
        aesmc           $tmp1,$tmp1
         vmov.32        ${dat1}[3], $tctr1
         rev            $tctr2,$ctr
        aese            $tmp2,q14
        aesmc           $tmp2,$tmp2
         vmov.32        ${dat2}[3], $tctr2
         subs           $len,$len,#3
        aese            $tmp0,q15
        aese            $tmp1,q15
        aese            $tmp2,q15

        veor            $in0,$in0,$tmp0
         vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
        vst1.8          {$in0},[$out],#16
        veor            $in1,$in1,$tmp1
         mov            $cnt,$rounds
        vst1.8          {$in1},[$out],#16
        veor            $in2,$in2,$tmp2
         vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
        vst1.8          {$in2},[$out],#16
        b.hs            .Loop3x_ctr32

        adds            $len,$len,#3
        b.eq            .Lctr32_done
        cmp             $len,#1
        mov             $step,#16
        cclr            $step,eq

.Lctr32_tail:
        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        vld1.32         {q8},[$key_],#16
        subs            $cnt,$cnt,#2
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
        vld1.32         {q9},[$key_],#16
        b.gt            .Lctr32_tail

        aese            $dat0,q8
        aesmc           $dat0,$dat0
        aese            $dat1,q8
        aesmc           $dat1,$dat1
        aese            $dat0,q9
        aesmc           $dat0,$dat0
        aese            $dat1,q9
        aesmc           $dat1,$dat1
         vld1.8         {$in0},[$inp],$step
        aese            $dat0,q12
        aesmc           $dat0,$dat0
        aese            $dat1,q12
        aesmc           $dat1,$dat1
         vld1.8         {$in1},[$inp]
        aese            $dat0,q13
        aesmc           $dat0,$dat0
        aese            $dat1,q13
        aesmc           $dat1,$dat1
         veor           $in0,$in0,$rndlast
        aese            $dat0,q14
        aesmc           $dat0,$dat0
        aese            $dat1,q14
        aesmc           $dat1,$dat1
         veor           $in1,$in1,$rndlast
        aese            $dat0,q15
        aese            $dat1,q15

        cmp             $len,#1
        veor            $in0,$in0,$dat0
        veor            $in1,$in1,$dat1
        vst1.8          {$in0},[$out],#16
        b.eq            .Lctr32_done
        vst1.8          {$in1},[$out]

.Lctr32_done:
___
$code.=<<___    if ($flavour !~ /64/);
        vldmia          sp!,{d8-d15}
        ldmia           sp!,{r4-r10,pc}
___
$code.=<<___    if ($flavour =~ /64/);
        ldr             x29,[sp],#16
        ret
___
$code.=<<___;
.size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
$code.=<<___;
#endif
___
########################################
# Post-processing: the $code accumulated above is written in a hybrid
# 32/64-bit syntax; here each line is transliterated into the requested
# flavour and printed (and thus piped through arm-xlate.pl to $output).
if ($flavour =~ /64/) {                 ######## 64-bit code
    my %opcode = (
        "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
        "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );

    # Encode an AES instruction as a raw .inst word, for assemblers that
    # don't know the Crypto Extension mnemonics (currently disabled below).
    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
        sprintf ".inst\t0x%08x\t//%s %s",
                        $opcode{$mnemonic}|$1|($2<<5),
                        $mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
        s/@\s/\/\//o;                   # old->new style commentary

        #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
        s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
        s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
        s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
        s/vext\.8/ext/o         or
        s/vrev32\.8/rev32/o     or
        s/vtst\.8/cmtst/o       or
        s/vshr/ushr/o           or
        s/^(\s+)v/$1/o          or      # strip off v prefix
        s/\bbx\s+lr\b/ret/o;

        # fix up remaining legacy suffixes
        s/\.[ui]?8//o;
        m/\],#8/o and s/\.16b/\.8b/go;
        s/\.[ui]?32//o and s/\.16b/\.4s/go;
        s/\.[ui]?64//o and s/\.16b/\.2d/go;
        s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

        print $_,"\n";
    }
} else {                                ######## 32-bit code
    my %opcode = (
        "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
        "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );

    # Emit AES instructions as raw .byte sequences,
    # since ARMv7 instructions are always encoded little-endian.
    # correct solution is to use .inst directive, but older
    # assemblers don't implement it:-(
    local *unaes = sub {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
            my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
                                         |(($2&7)<<1) |(($2&8)<<2);
            sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    };

    # Split a q-register vtbl into the pair of d-register vtbl.8 ops
    # that 32-bit NEON actually provides.
    sub unvtbl {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
        sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
                "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map a 64-bit "dup lane from q-reg" onto the 32-bit d-register form.
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map a q-register lane move onto the corresponding d-register lane.
    sub unvmov32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
        sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
        s/\`([^\`]*)\`/eval($1)/geo;

        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
        s/\],#[0-9]+/]!/o;

        s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
        s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
        s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
        s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
        s/^(\s+)b\./$1b/o                               or
        s/^(\s+)mov\./$1mov/o                           or
        s/^(\s+)ret/$1bx\tlr/o;

        print $_,"\n";
    }
}

close STDOUT;