# aa36ed272405ff172c50cef012dc5d1bcf6a1d2a
# [openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. As does it support both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
29 #
30 # Performance in cycles per byte processed with 128-bit key:
31 #
32 #               CBC enc         CBC dec         CTR
33 # Apple A7      2.39            1.20            1.20
34 # Cortex-A53    1.32            1.29            1.46
35 # Cortex-A57(*) 1.95            0.85            0.93
36 # Denver        1.96            0.86            0.80
37 #
38 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
39 #       and are still same even for updated module;
40
# Command line: <flavour> <output-file>.  $flavour selects the target
# ("linux64", "linux32", "ios64", ...) and is forwarded to the
# arm-xlate.pl translator together with the output path.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl next to this script or in ../../perlasm.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Pipe everything we print through the translator; fail loudly if the
# pipe cannot be set up instead of silently producing no output.
open OUT,"| \"$^X\" $xlate $flavour $output" or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbol prefix for all generated functions (aes_v8_set_encrypt_key etc.).
$prefix="aes_v8";
53
# Common file header; the whole module is guarded by __ARM_MAX_ARCH__>=7
# since the AES instructions require the ARMv8 Crypto Extension.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.text
___
$code.=".arch	armv8-a+crypto\n"			if ($flavour =~ /64/);
$code.=".arch	armv7-a\n.fpu	neon\n.code	32\n"	if ($flavour !~ /64/);
		#^^^^^^ this is done to simplify adoption by not depending
		#	on latest binutils.

# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
# NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
# maintain both 32- and 64-bit codes within single module and
# transliterate common code to either flavour with regex voodoo.
#
{{{
# Key-schedule generation: ${prefix}_set_encrypt_key and
# ${prefix}_set_decrypt_key.
#
# Integer argument registers (AArch64 names; mapped to r0.. for 32-bit
# by the post-processing regexes at the bottom of the file):
#   $inp=x0 user key, $bits=w1 key length in bits, $out=x2 key schedule.
my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON scratch registers.  The 64-bit flavour may use q0-q6 freely;
# the 32-bit flavour avoids q4-q7 (callee-saved d8-d15 per AAPCS),
# hence the (0..3,8..10) mapping.
my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
	$flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));


# .Lrcon holds the AES round constants plus a byte-index table used by
# vtbl.8 to rotate-and-splat the last word of the previous round key.
#
# set_encrypt_key returns 0 on success, -1 on NULL inp/out, -2 on an
# unsupported bit length (only 128/192/256 pass the checks below).
$code.=<<___;
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b

.globl	${prefix}_set_encrypt_key
.type	${prefix}_set_encrypt_key,%function
.align	5
${prefix}_set_encrypt_key:
.Lenc_key:
___
# 64-bit flavour keeps a minimal frame (x29/x30) across the call.
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# Argument validation, then three dedicated expansion loops: .Loop128,
# .L192 and .Loop256, each storing one round key per iteration and
# finishing by storing the round count at the end of the schedule.
$code.=<<___;
	mov	$ptr,#-1
	cmp	$inp,#0
	b.eq	.Lenc_key_abort
	cmp	$out,#0
	b.eq	.Lenc_key_abort
	mov	$ptr,#-2
	cmp	$bits,#128
	b.lt	.Lenc_key_abort
	cmp	$bits,#256
	b.gt	.Lenc_key_abort
	tst	$bits,#0x3f
	b.ne	.Lenc_key_abort

	adr	$ptr,.Lrcon
	cmp	$bits,#192

	veor	$zero,$zero,$zero
	vld1.8	{$in0},[$inp],#16
	mov	$bits,#8		// reuse $bits
	vld1.32	{$rcon,$mask},[$ptr],#32

	b.lt	.Loop128
	b.eq	.L192
	b	.L256

.align	4
.Loop128:
	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	b.ne	.Loop128

	vld1.32	{$rcon},[$ptr]

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key

	vtbl.8	$key,{$in0},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in0},[$out],#16
	aese	$key,$zero

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out]
	add	$out,$out,#0x50

	mov	$rounds,#10
	b	.Ldone

.align	4
.L192:
	vld1.8	{$in1},[$inp],#8
	vmov.i8	$key,#8			// borrow $key
	vst1.32	{$in0},[$out],#16
	vsub.i8	$mask,$mask,$key	// adjust the mask

.Loop192:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#8
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp

	vdup.32	$tmp,${in0}[3]
	veor	$tmp,$tmp,$in1
	 veor	$key,$key,$rcon
	vext.8	$in1,$zero,$in1,#12
	vshl.u8	$rcon,$rcon,#1
	veor	$in1,$in1,$tmp
	veor	$in0,$in0,$key
	veor	$in1,$in1,$key
	vst1.32	{$in0},[$out],#16
	b.ne	.Loop192

	mov	$rounds,#12
	add	$out,$out,#0x20
	b	.Ldone

.align	4
.L256:
	vld1.8	{$in1},[$inp]
	mov	$bits,#7
	mov	$rounds,#14
	vst1.32	{$in0},[$out],#16

.Loop256:
	vtbl.8	$key,{$in1},$mask
	vext.8	$tmp,$zero,$in0,#12
	vst1.32	{$in1},[$out],#16
	aese	$key,$zero
	subs	$bits,$bits,#1

	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in0,$in0,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	 veor	$key,$key,$rcon
	veor	$in0,$in0,$tmp
	vshl.u8	$rcon,$rcon,#1
	veor	$in0,$in0,$key
	vst1.32	{$in0},[$out],#16
	b.eq	.Ldone

	vdup.32	$key,${in0}[3]		// just splat
	vext.8	$tmp,$zero,$in1,#12
	aese	$key,$zero

	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp
	vext.8	$tmp,$zero,$tmp,#12
	veor	$in1,$in1,$tmp

	veor	$in1,$in1,$key
	b	.Loop256

.Ldone:
	str	$rounds,[$out]
	mov	$ptr,#0

.Lenc_key_abort:
	mov	x0,$ptr			// return value
	`"ldr	x29,[sp],#16"		if ($flavour =~ /64/)`
	ret
.size	${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key

.globl	${prefix}_set_decrypt_key
.type	${prefix}_set_decrypt_key,%function
.align	5
${prefix}_set_decrypt_key:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
$code.=<<___	if ($flavour !~ /64/);
	stmdb	sp!,{r4,lr}
___
# set_decrypt_key: first build the encryption schedule via .Lenc_key,
# then reverse the round-key order in place (swapping from both ends)
# while applying aesimc/AESIMC (InvMixColumns) to every key except the
# first and last, as required by the equivalent-inverse-cipher layout.
$code.=<<___;
	bl	.Lenc_key

	cmp	x0,#0
	b.ne	.Ldec_key_abort

	sub	$out,$out,#240		// restore original $out
	mov	x4,#-16
	add	$inp,$out,x12,lsl#4	// end of key schedule

	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16

.Loop_imc:
	vld1.32	{v0.16b},[$out]
	vld1.32	{v1.16b},[$inp]
	aesimc	v0.16b,v0.16b
	aesimc	v1.16b,v1.16b
	vst1.32	{v0.16b},[$inp],x4
	vst1.32	{v1.16b},[$out],#16
	cmp	$inp,$out
	b.hi	.Loop_imc

	vld1.32	{v0.16b},[$out]
	aesimc	v0.16b,v0.16b
	vst1.32	{v0.16b},[$inp]

	eor	x0,x0,x0		// return value
.Ldec_key_abort:
___
$code.=<<___	if ($flavour !~ /64/);
	ldmia	sp!,{r4,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldp	x29,x30,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
___
}}}
{{{
# gen_block($dir): emit a single-block ECB routine, either
# ${prefix}_encrypt ($dir eq "en") or ${prefix}_decrypt ($dir eq "de").
# The two bodies are identical modulo the instruction pair used:
# aese/aesmc for encryption, aesd/aesimc for decryption.
#
# Fix: the original declared `sub gen_block ()` with an empty prototype
# and then sidestepped it with &-call syntax (&gen_block("en")); the
# prototype served no purpose and only forced the prototype-bypassing
# calls.  Both are dropped; behavior is unchanged.
sub gen_block {
my $dir = shift;
my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
# x0 = input block, x1 = output block, x2 = key schedule, w3 = rounds
# (read from key[240]).
my ($inp,$out,$key)=map("x$_",(0..2));
my $rounds="w3";
my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));

# Round keys are streamed two at a time; the loop runs rounds-2 times,
# then the final two rounds are unrolled (last round has no MixColumns,
# so the last key is applied with a plain veor).
$code.=<<___;
.globl	${prefix}_${dir}crypt
.type	${prefix}_${dir}crypt,%function
.align	5
${prefix}_${dir}crypt:
	ldr	$rounds,[$key,#240]
	vld1.32	{$rndkey0},[$key],#16
	vld1.8	{$inout},[$inp]
	sub	$rounds,$rounds,#2
	vld1.32	{$rndkey1},[$key],#16

.Loop_${dir}c:
	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key],#16
	subs	$rounds,$rounds,#2
	aes$e	$inout,$rndkey1
	aes$mc	$inout,$inout
	vld1.32	{$rndkey1},[$key],#16
	b.gt	.Loop_${dir}c

	aes$e	$inout,$rndkey0
	aes$mc	$inout,$inout
	vld1.32	{$rndkey0},[$key]
	aes$e	$inout,$rndkey1
	veor	$inout,$inout,$rndkey0

	vst1.8	{$inout},[$out]
	ret
.size	${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
___
}
gen_block("en");
gen_block("de");
}}}
{{{
# ${prefix}_cbc_encrypt(inp, out, len, key, ivp, enc):
# CBC en-/decryption.  w5 doubles as the enc flag on entry and as the
# round counter afterwards.  $step is the input advance (16, or 0 for
# the last block so the final load re-reads it harmlessly).
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));

my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_cbc_encrypt
.type	${prefix}_cbc_encrypt,%function
.align	5
${prefix}_cbc_encrypt:
___
$code.=<<___	if ($flavour =~ /64/);
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
___
# 32-bit flavour: ivp/enc arrive on the stack; save r4-r8 and the
# callee-saved NEON registers d8-d15 per the AAPCS.
$code.=<<___	if ($flavour !~ /64/);
	mov	ip,sp
	stmdb	sp!,{r4-r8,lr}
	vstmdb	sp!,{d8-d15}            @ ABI specification says so
	ldmia	ip,{r4-r5}		@ load remaining args
___
# Load IV, first block and the whole key schedule, then dispatch:
# decryption -> .Lcbc_dec; 128-bit encryption gets a dedicated path
# (.Lcbc_enc128); 192/256-bit share .Loop_cbc_enc.  Encryption is
# inherently serial (each block depends on the previous ciphertext),
# hence single-block pipelines with loads/stores interleaved (the
# extra-indented instructions) into the aese/aesmc chain.
$code.=<<___;
	subs	$len,$len,#16
	mov	$step,#16
	b.lo	.Lcbc_abort
	cclr	$step,eq

	cmp	$enc,#0			// en- or decrypting?
	ldr	$rounds,[$key,#240]
	and	$len,$len,#-16
	vld1.8	{$ivec},[$ivp]
	vld1.8	{$dat},[$inp],$step

	vld1.32	{q8-q9},[$key]		// load key schedule...
	sub	$rounds,$rounds,#6
	add	$key_,$key,x5,lsl#4	// pointer to last 7 round keys
	sub	$rounds,$rounds,#2
	vld1.32	{q10-q11},[$key_],#32
	vld1.32	{q12-q13},[$key_],#32
	vld1.32	{q14-q15},[$key_],#32
	vld1.32	{$rndlast},[$key_]

	add	$key_,$key,#32
	mov	$cnt,$rounds
	b.eq	.Lcbc_dec

	cmp	$rounds,#2
	veor	$dat,$dat,$ivec
	veor	$rndzero_n_last,q8,$rndlast
	b.eq	.Lcbc_enc128

	vld1.32	{$in0-$in1},[$key_]
	add	$key_,$key,#16
	add	$key4,$key,#16*4
	add	$key5,$key,#16*5
	aese	$dat,q8
	aesmc	$dat,$dat
	add	$key6,$key,#16*6
	add	$key7,$key,#16*7
	b	.Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc:
	aese	$dat,q9
	aesmc	$dat,$dat
	aese	$dat,$in0
	aesmc	$dat,$dat
	vld1.32	{q8},[$key4]
	cmp	$rounds,#4
	aese	$dat,$in1
	aesmc	$dat,$dat
	vld1.32	{q9},[$key5]
	b.eq	.Lcbc_enc192

	aese	$dat,q8
	aesmc	$dat,$dat
	vld1.32	{q8},[$key6]
	aese	$dat,q9
	aesmc	$dat,$dat
	vld1.32	{q9},[$key7]
	nop

.Lcbc_enc192:
	aese	$dat,q8
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,q9
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q13
	aesmc	$dat,$dat
	 vld1.32 {q9},[$key_]		// re-pre-load rndkey[1]
	aese	$dat,q14
	aesmc	$dat,$dat
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done

.align	5
.Lcbc_enc128:
	vld1.32	{$in0-$in1},[$key_]
	aese	$dat,q8
	aesmc	$dat,$dat
	b	.Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese	$dat,q8
	aesmc	$dat,$dat
	 vst1.8	{$ivec},[$out],#16
.Lenter_cbc_enc128:
	aese	$dat,q9
	aesmc	$dat,$dat
	 subs	$len,$len,#16
	aese	$dat,$in0
	aesmc	$dat,$dat
	 cclr	$step,eq
	aese	$dat,$in1
	aesmc	$dat,$dat
	aese	$dat,q10
	aesmc	$dat,$dat
	aese	$dat,q11
	aesmc	$dat,$dat
	 vld1.8	{q8},[$inp],$step
	aese	$dat,q12
	aesmc	$dat,$dat
	aese	$dat,q13
	aesmc	$dat,$dat
	aese	$dat,q14
	aesmc	$dat,$dat
	 veor	q8,q8,$rndzero_n_last
	aese	$dat,q15
	veor	$ivec,$dat,$rndlast
	b.hs	.Loop_cbc_enc128

	vst1.8	{$ivec},[$out],#16
	b	.Lcbc_done
___
{
# CBC decryption is parallelizable: process three blocks at a time in
# .Loop3x_cbc_dec, with a 1-2 block tail (.Lcbc_dec_tail/.Lcbc_dec_one).
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
$code.=<<___;
.align	5
.Lcbc_dec:
	vld1.8	{$dat2},[$inp],#16
	subs	$len,$len,#32		// bias
	add	$cnt,$rounds,#2
	vorr	$in1,$dat,$dat
	vorr	$dat1,$dat,$dat
	vorr	$in2,$dat2,$dat2
	b.lo	.Lcbc_dec_tail

	vorr	$dat1,$dat2,$dat2
	vld1.8	{$dat2},[$inp],#16
	vorr	$in0,$dat,$dat
	vorr	$in1,$dat1,$dat1
	vorr	$in2,$dat2,$dat2

.Loop3x_cbc_dec:
	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Loop3x_cbc_dec

	aesd	$dat0,q8
	aesimc	$dat0,$dat0
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	 veor	$tmp0,$ivec,$rndlast
	 subs	$len,$len,#0x30
	 veor	$tmp1,$in0,$rndlast
	 mov.lo	x6,$len			// x6, $cnt, is zero at this point
	aesd	$dat0,q9
	aesimc	$dat0,$dat0
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	 add	$inp,$inp,x6		// $inp is adjusted in such way that
					// at exit from the loop $dat1-$dat2
					// are loaded with last "words"
	 vorr	$ivec,$in2,$in2
	 mov	$key_,$key
	aesd	$dat0,q12
	aesimc	$dat0,$dat0
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 vld1.8	{$in0},[$inp],#16
	aesd	$dat0,q13
	aesimc	$dat0,$dat0
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 vld1.8	{$in1},[$inp],#16
	aesd	$dat0,q14
	aesimc	$dat0,$dat0
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 vld1.8	{$in2},[$inp],#16
	aesd	$dat0,q15
	aesd	$dat1,q15
	aesd	$dat2,q15
	 vld1.32 {q8},[$key_],#16	// re-pre-load rndkey[0]
	 add	$cnt,$rounds,#2
	veor	$tmp0,$tmp0,$dat0
	veor	$tmp1,$tmp1,$dat1
	veor	$dat2,$dat2,$tmp2
	 vld1.32 {q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8	{$tmp0},[$out],#16
	 vorr	$dat0,$in0,$in0
	vst1.8	{$tmp1},[$out],#16
	 vorr	$dat1,$in1,$in1
	vst1.8	{$dat2},[$out],#16
	 vorr	$dat2,$in2,$in2
	b.hs	.Loop3x_cbc_dec

	cmn	$len,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	vld1.32	{q8},[$key_],#16
	subs	$cnt,$cnt,#2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	vld1.32	{q9},[$key_],#16
	b.gt	.Lcbc_dec_tail

	aesd	$dat1,q8
	aesimc	$dat1,$dat1
	aesd	$dat2,q8
	aesimc	$dat2,$dat2
	aesd	$dat1,q9
	aesimc	$dat1,$dat1
	aesd	$dat2,q9
	aesimc	$dat2,$dat2
	aesd	$dat1,q12
	aesimc	$dat1,$dat1
	aesd	$dat2,q12
	aesimc	$dat2,$dat2
	 cmn	$len,#0x20
	aesd	$dat1,q13
	aesimc	$dat1,$dat1
	aesd	$dat2,q13
	aesimc	$dat2,$dat2
	 veor	$tmp1,$ivec,$rndlast
	aesd	$dat1,q14
	aesimc	$dat1,$dat1
	aesd	$dat2,q14
	aesimc	$dat2,$dat2
	 veor	$tmp2,$in1,$rndlast
	aesd	$dat1,q15
	aesd	$dat2,q15
	b.eq	.Lcbc_dec_one
	veor	$tmp1,$tmp1,$dat1
	veor	$tmp2,$tmp2,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16
	vst1.8	{$tmp2},[$out],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	veor	$tmp1,$tmp1,$dat2
	 vorr	$ivec,$in2,$in2
	vst1.8	{$tmp1},[$out],#16

.Lcbc_done:
	vst1.8	{$ivec},[$ivp]
.Lcbc_abort:
___
}
# Epilogues: restore callee-saved registers for each flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia	sp!,{d8-d15}
	ldmia	sp!,{r4-r8,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr	x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
___
}}}
{{{
# ${prefix}_ctr32_encrypt_blocks(inp, out, len, key, ivp):
# CTR mode with a 32-bit big-endian counter in the last word of the IV.
# len is in 16-byte blocks.  Keystream generation is parallelizable, so
# the main loop (.Loop3x_ctr32) processes three blocks per iteration;
# the 1-2 block remainder is handled by .Lctr32_tail.
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
my ($rounds,$cnt,$key_)=("w5","w6","x7");
my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
my $step="x12";		# aliases with $tctr2

my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));

my ($dat,$tmp)=($dat0,$tmp0);

### q8-q15	preloaded key schedule

$code.=<<___;
.globl	${prefix}_ctr32_encrypt_blocks
.type	${prefix}_ctr32_encrypt_blocks,%function
.align	5
${prefix}_ctr32_encrypt_blocks:
___
$code.=<<___	if ($flavour =~ /64/);
	stp		x29,x30,[sp,#-16]!
	add		x29,sp,#0
___
# 32-bit flavour: ivp arrives on the stack; save r4-r10 and d8-d15.
$code.=<<___	if ($flavour !~ /64/);
	mov		ip,sp
	stmdb		sp!,{r4-r10,lr}
	vstmdb		sp!,{d8-d15}            @ ABI specification says so
	ldr		r4, [ip]		@ load remaining arg
___
# The counter word is kept in $ctr in host order (rev on little-endian,
# guarded by __ARMEB__) and re-reversed with rev before being inserted
# into lane 3 of each counter block.
$code.=<<___;
	ldr		$rounds,[$key,#240]

	ldr		$ctr, [$ivp, #12]
	vld1.32		{$dat0},[$ivp]

	vld1.32		{q8-q9},[$key]		// load key schedule...
	sub		$rounds,$rounds,#4
	mov		$step,#16
	cmp		$len,#2
	add		$key_,$key,x5,lsl#4	// pointer to last 5 round keys
	sub		$rounds,$rounds,#2
	vld1.32		{q12-q13},[$key_],#32
	vld1.32		{q14-q15},[$key_],#32
	vld1.32		{$rndlast},[$key_]
	add		$key_,$key,#32
	mov		$cnt,$rounds
	cclr		$step,lo
#ifndef __ARMEB__
	rev		$ctr, $ctr
#endif
	vorr		$dat1,$dat0,$dat0
	add		$tctr1, $ctr, #1
	vorr		$dat2,$dat0,$dat0
	add		$ctr, $ctr, #2
	vorr		$ivec,$dat0,$dat0
	rev		$tctr1, $tctr1
	vmov.32		${dat1}[3],$tctr1
	b.ls		.Lctr32_tail
	rev		$tctr2, $ctr
	sub		$len,$len,#3		// bias
	vmov.32		${dat2}[3],$tctr2
	b		.Loop3x_ctr32

.align	4
.Loop3x_ctr32:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	aese		$dat2,q9
	aesmc		$dat2,$dat2
	vld1.32		{q9},[$key_],#16
	b.gt		.Loop3x_ctr32

	aese		$dat0,q8
	aesmc		$tmp0,$dat0
	aese		$dat1,q8
	aesmc		$tmp1,$dat1
	 vld1.8		{$in0},[$inp],#16
	 vorr		$dat0,$ivec,$ivec
	aese		$dat2,q8
	aesmc		$dat2,$dat2
	 vld1.8		{$in1},[$inp],#16
	 vorr		$dat1,$ivec,$ivec
	aese		$tmp0,q9
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q9
	aesmc		$tmp1,$tmp1
	 vld1.8		{$in2},[$inp],#16
	 mov		$key_,$key
	aese		$dat2,q9
	aesmc		$tmp2,$dat2
	 vorr		$dat2,$ivec,$ivec
	 add		$tctr0,$ctr,#1
	aese		$tmp0,q12
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q12
	aesmc		$tmp1,$tmp1
	 veor		$in0,$in0,$rndlast
	 add		$tctr1,$ctr,#2
	aese		$tmp2,q12
	aesmc		$tmp2,$tmp2
	 veor		$in1,$in1,$rndlast
	 add		$ctr,$ctr,#3
	aese		$tmp0,q13
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q13
	aesmc		$tmp1,$tmp1
	 veor		$in2,$in2,$rndlast
	 rev		$tctr0,$tctr0
	aese		$tmp2,q13
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat0}[3], $tctr0
	 rev		$tctr1,$tctr1
	aese		$tmp0,q14
	aesmc		$tmp0,$tmp0
	aese		$tmp1,q14
	aesmc		$tmp1,$tmp1
	 vmov.32	${dat1}[3], $tctr1
	 rev		$tctr2,$ctr
	aese		$tmp2,q14
	aesmc		$tmp2,$tmp2
	 vmov.32	${dat2}[3], $tctr2
	 subs		$len,$len,#3
	aese		$tmp0,q15
	aese		$tmp1,q15
	aese		$tmp2,q15

	veor		$in0,$in0,$tmp0
	 vld1.32	{q8},[$key_],#16	// re-pre-load rndkey[0]
	vst1.8		{$in0},[$out],#16
	veor		$in1,$in1,$tmp1
	 mov		$cnt,$rounds
	vst1.8		{$in1},[$out],#16
	veor		$in2,$in2,$tmp2
	 vld1.32	{q9},[$key_],#16	// re-pre-load rndkey[1]
	vst1.8		{$in2},[$out],#16
	b.hs		.Loop3x_ctr32

	adds		$len,$len,#3
	b.eq		.Lctr32_done
	cmp		$len,#1
	mov		$step,#16
	cclr		$step,eq

.Lctr32_tail:
	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	vld1.32		{q8},[$key_],#16
	subs		$cnt,$cnt,#2
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	vld1.32		{q9},[$key_],#16
	b.gt		.Lctr32_tail

	aese		$dat0,q8
	aesmc		$dat0,$dat0
	aese		$dat1,q8
	aesmc		$dat1,$dat1
	aese		$dat0,q9
	aesmc		$dat0,$dat0
	aese		$dat1,q9
	aesmc		$dat1,$dat1
	 vld1.8		{$in0},[$inp],$step
	aese		$dat0,q12
	aesmc		$dat0,$dat0
	aese		$dat1,q12
	aesmc		$dat1,$dat1
	 vld1.8		{$in1},[$inp]
	aese		$dat0,q13
	aesmc		$dat0,$dat0
	aese		$dat1,q13
	aesmc		$dat1,$dat1
	 veor		$in0,$in0,$rndlast
	aese		$dat0,q14
	aesmc		$dat0,$dat0
	aese		$dat1,q14
	aesmc		$dat1,$dat1
	 veor		$in1,$in1,$rndlast
	aese		$dat0,q15
	aese		$dat1,q15

	cmp		$len,#1
	veor		$in0,$in0,$dat0
	veor		$in1,$in1,$dat1
	vst1.8		{$in0},[$out],#16
	b.eq		.Lctr32_done
	vst1.8		{$in1},[$out]

.Lctr32_done:
___
# Epilogues for each flavour.
$code.=<<___	if ($flavour !~ /64/);
	vldmia		sp!,{d8-d15}
	ldmia		sp!,{r4-r10,pc}
___
$code.=<<___	if ($flavour =~ /64/);
	ldr		x29,[sp],#16
	ret
___
$code.=<<___;
.size	${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
# Close the __ARM_MAX_ARCH__ guard opened in the file header.
$code.=<<___;
#endif
___
########################################
# Post-processing: the assembly above is written in a mixed dialect.
# Depending on $flavour, transliterate it line by line into pure
# 64-bit (AArch64) or pure 32-bit (ARMv7 NEON) syntax and print it.
if ($flavour =~ /64/) {			######## 64-bit code
    my %opcode = (
	"aesd"	=>	0x4e285800,	"aese"	=>	0x4e284800,
	"aesimc"=>	0x4e287800,	"aesmc"	=>	0x4e286800	);

    # Encode an aes* instruction as a raw .inst word for assemblers
    # without Crypto Extension support.  NOTE(review): unused here —
    # the substitution that would call it is commented out below.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5),
			$mnemonic,$arg;
    };

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` escapes

	s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;	# old->new registers
	s/@\s/\/\//o;			# old->new style commentary

	#s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel	$1$2,$1zr,$1$2,$3/o	or
	s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel	$2,$3,$2,$1/o	or
	s/vmov\.i8/movi/o	or	# fix up legacy mnemonics
	s/vext\.8/ext/o		or
	s/vrev32\.8/rev32/o	or
	s/vtst\.8/cmtst/o	or
	s/vshr/ushr/o		or
	s/^(\s+)v/$1/o		or	# strip off v prefix
	s/\bbx\s+lr\b/ret/o;

	# fix up remaining legacy suffixes
	s/\.[ui]?8//o;
	m/\],#8/o and s/\.16b/\.8b/go;
	s/\.[ui]?32//o and s/\.16b/\.4s/go;
	s/\.[ui]?64//o and s/\.16b/\.2d/go;
	s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

	print $_,"\n";
    }
} else {				######## 32-bit code
    my %opcode = (
	"aesd"	=>	0xf3b00340,	"aese"	=>	0xf3b00300,
	"aesimc"=>	0xf3b003c0,	"aesmc"	=>	0xf3b00380	);

    # Emit aes* instructions as raw .byte sequences so the module
    # assembles even without Crypto Extension-aware tools.
    local *unaes = sub {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<1) |(($2&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    };

    # Split a whole-q vtbl.8 into the two d-register vtbl.8 ops ARMv7 has.
    sub unvtbl {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
	sprintf	"vtbl.8	d%d,{q%d},d%d\n\t".
		"vtbl.8	d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
    }

    # Map a q-register lane splat onto the corresponding d-register lane.
    sub unvdup32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
	sprintf	"vdup.32	q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }

    # Map a q-register lane move onto the corresponding d-register lane.
    sub unvmov32 {
	my $arg=shift;

	$arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
	sprintf	"vmov.32	d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
    }

    foreach(split("\n",$code)) {
	s/\`([^\`]*)\`/eval($1)/geo;	# expand `...` escapes

	s/\b[wx]([0-9]+)\b/r$1/go;		# new->old registers
	s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;	# new->old registers
	s/\/\/\s?/@ /o;				# new->old style commentary

	# fix up remaining new-style suffixes
	s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo	or
	s/\],#[0-9]+/]!/o;

	s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo	or
	s/cclr\s+([^,]+),\s*([a-z]+)/mov$2	$1,#0/o	or
	s/vtbl\.8\s+(.*)/unvtbl($1)/geo			or
	s/vdup\.32\s+(.*)/unvdup32($1)/geo		or
	s/vmov\.32\s+(.*)/unvmov32($1)/geo		or
	s/^(\s+)b\./$1b/o				or
	s/^(\s+)mov\./$1mov/o				or
	s/^(\s+)ret/$1bx\tlr/o;

	print $_,"\n";
    }
}

# STDOUT is a pipe into the translator (see the preamble); an unchecked
# close would silently swallow any failure of that downstream process
# and of buffered writes, so check it.
close STDOUT or die "error closing STDOUT: $!";