ARMv8 assembly pack: add Samsung Mongoose results.
[openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
29 #
30 # Performance in cycles per byte processed with 128-bit key:
31 #
32 #               CBC enc         CBC dec         CTR
33 # Apple A7      2.39            1.20            1.20
34 # Cortex-A53    1.32            1.29            1.46
35 # Cortex-A57(*) 1.95            0.85            0.93
36 # Denver        1.96            0.86            0.80
37 # Mongoose      1.33            1.20            1.20
38 #
39 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
40 #       and are still the same even for the updated module;
41
# Command-line arguments: the assembly "flavour" (e.g. linux64, linux32,
# ios64) and the output file name; both are forwarded to arm-xlate.pl below.
42 $flavour = shift;
43 $output  = shift;
44
# Locate the arm-xlate.pl transliteration helper relative to this script:
# first alongside it, then in the shared perlasm directory.
45 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
47 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
48 die "can't locate arm-xlate.pl";
49
# Pipe everything we print through arm-xlate.pl, which adapts the mixed
# 32-/64-bit syntax used below to the requested flavour; STDOUT is aliased
# to the pipe so plain print statements reach it.
50 open OUT,"| \"$^X\" $xlate $flavour $output";
51 *STDOUT=*OUT;
52
# All exported symbols are prefixed with this string.
53 $prefix="aes_v8";
54
# Emitted file prologue: the whole module is guarded by __ARM_MAX_ARCH__>=7
# so it compiles to nothing where the crypto extensions cannot exist.
55 $code=<<___;
56 #include "arm_arch.h"
57
58 #if __ARM_MAX_ARCH__>=7
59 .text
60 ___
# Architecture directives differ per flavour; see the note just below.
61 $code.=".arch   armv8-a+crypto\n"                       if ($flavour =~ /64/);
62 $code.=".arch   armv7-a\n.fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
63                 #^^^^^^ this is done to simplify adoption by not depending
64                 #       on latest binutils.
65
66 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
67 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
68 # maintain both 32- and 64-bit codes within single module and
69 # transliterate common code to either flavour with regex voodoo.
70 #
# Key-schedule generation: ${prefix}_set_encrypt_key / ${prefix}_set_decrypt_key.
71 {{{
# Integer argument/scratch registers and the NEON registers used below.
# Note the 32-bit flavour maps onto q0-q3,q8-q10, skipping q4-q7 —
# presumably to stay clear of the callee-saved d8-d15 range (see the
# "ABI specification says so" saves in later routines) — TODO confirm.
72 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
73 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
74         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
75
76
# Round-constant table and entry point; arguments are checked for NULL
# pointers and for a key size of exactly 128/192/256 bits, with negative
# return codes (-1/-2) signalled through $ptr on the abort paths.
77 $code.=<<___;
78 .align  5
79 .Lrcon:
80 .long   0x01,0x01,0x01,0x01
81 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
82 .long   0x1b,0x1b,0x1b,0x1b
83
84 .globl  ${prefix}_set_encrypt_key
85 .type   ${prefix}_set_encrypt_key,%function
86 .align  5
87 ${prefix}_set_encrypt_key:
88 .Lenc_key:
89 ___
# AArch64-only frame setup.
90 $code.=<<___    if ($flavour =~ /64/);
91         stp     x29,x30,[sp,#-16]!
92         add     x29,sp,#0
93 ___
# Main body, shared between flavours: dedicated expansion loops for
# 128- (.Loop128), 192- (.L192/.Loop192) and 256-bit (.L256/.Loop256) keys.
94 $code.=<<___;
95         mov     $ptr,#-1
96         cmp     $inp,#0
97         b.eq    .Lenc_key_abort
98         cmp     $out,#0
99         b.eq    .Lenc_key_abort
100         mov     $ptr,#-2
101         cmp     $bits,#128
102         b.lt    .Lenc_key_abort
103         cmp     $bits,#256
104         b.gt    .Lenc_key_abort
105         tst     $bits,#0x3f
106         b.ne    .Lenc_key_abort
107
108         adr     $ptr,.Lrcon
109         cmp     $bits,#192
110
111         veor    $zero,$zero,$zero
112         vld1.8  {$in0},[$inp],#16
113         mov     $bits,#8                // reuse $bits
114         vld1.32 {$rcon,$mask},[$ptr],#32
115
116         b.lt    .Loop128
117         b.eq    .L192
118         b       .L256
119
120 .align  4
121 .Loop128:
122         vtbl.8  $key,{$in0},$mask
123         vext.8  $tmp,$zero,$in0,#12
124         vst1.32 {$in0},[$out],#16
125         aese    $key,$zero
126         subs    $bits,$bits,#1
127
128         veor    $in0,$in0,$tmp
129         vext.8  $tmp,$zero,$tmp,#12
130         veor    $in0,$in0,$tmp
131         vext.8  $tmp,$zero,$tmp,#12
132          veor   $key,$key,$rcon
133         veor    $in0,$in0,$tmp
134         vshl.u8 $rcon,$rcon,#1
135         veor    $in0,$in0,$key
136         b.ne    .Loop128
137
138         vld1.32 {$rcon},[$ptr]
139
140         vtbl.8  $key,{$in0},$mask
141         vext.8  $tmp,$zero,$in0,#12
142         vst1.32 {$in0},[$out],#16
143         aese    $key,$zero
144
145         veor    $in0,$in0,$tmp
146         vext.8  $tmp,$zero,$tmp,#12
147         veor    $in0,$in0,$tmp
148         vext.8  $tmp,$zero,$tmp,#12
149          veor   $key,$key,$rcon
150         veor    $in0,$in0,$tmp
151         vshl.u8 $rcon,$rcon,#1
152         veor    $in0,$in0,$key
153
154         vtbl.8  $key,{$in0},$mask
155         vext.8  $tmp,$zero,$in0,#12
156         vst1.32 {$in0},[$out],#16
157         aese    $key,$zero
158
159         veor    $in0,$in0,$tmp
160         vext.8  $tmp,$zero,$tmp,#12
161         veor    $in0,$in0,$tmp
162         vext.8  $tmp,$zero,$tmp,#12
163          veor   $key,$key,$rcon
164         veor    $in0,$in0,$tmp
165         veor    $in0,$in0,$key
166         vst1.32 {$in0},[$out]
167         add     $out,$out,#0x50
168
169         mov     $rounds,#10
170         b       .Ldone
171
172 .align  4
173 .L192:
174         vld1.8  {$in1},[$inp],#8
175         vmov.i8 $key,#8                 // borrow $key
176         vst1.32 {$in0},[$out],#16
177         vsub.i8 $mask,$mask,$key        // adjust the mask
178
179 .Loop192:
180         vtbl.8  $key,{$in1},$mask
181         vext.8  $tmp,$zero,$in0,#12
182         vst1.32 {$in1},[$out],#8
183         aese    $key,$zero
184         subs    $bits,$bits,#1
185
186         veor    $in0,$in0,$tmp
187         vext.8  $tmp,$zero,$tmp,#12
188         veor    $in0,$in0,$tmp
189         vext.8  $tmp,$zero,$tmp,#12
190         veor    $in0,$in0,$tmp
191
192         vdup.32 $tmp,${in0}[3]
193         veor    $tmp,$tmp,$in1
194          veor   $key,$key,$rcon
195         vext.8  $in1,$zero,$in1,#12
196         vshl.u8 $rcon,$rcon,#1
197         veor    $in1,$in1,$tmp
198         veor    $in0,$in0,$key
199         veor    $in1,$in1,$key
200         vst1.32 {$in0},[$out],#16
201         b.ne    .Loop192
202
203         mov     $rounds,#12
204         add     $out,$out,#0x20
205         b       .Ldone
206
207 .align  4
208 .L256:
209         vld1.8  {$in1},[$inp]
210         mov     $bits,#7
211         mov     $rounds,#14
212         vst1.32 {$in0},[$out],#16
213
214 .Loop256:
215         vtbl.8  $key,{$in1},$mask
216         vext.8  $tmp,$zero,$in0,#12
217         vst1.32 {$in1},[$out],#16
218         aese    $key,$zero
219         subs    $bits,$bits,#1
220
221         veor    $in0,$in0,$tmp
222         vext.8  $tmp,$zero,$tmp,#12
223         veor    $in0,$in0,$tmp
224         vext.8  $tmp,$zero,$tmp,#12
225          veor   $key,$key,$rcon
226         veor    $in0,$in0,$tmp
227         vshl.u8 $rcon,$rcon,#1
228         veor    $in0,$in0,$key
229         vst1.32 {$in0},[$out],#16
230         b.eq    .Ldone
231
232         vdup.32 $key,${in0}[3]          // just splat
233         vext.8  $tmp,$zero,$in1,#12
234         aese    $key,$zero
235
236         veor    $in1,$in1,$tmp
237         vext.8  $tmp,$zero,$tmp,#12
238         veor    $in1,$in1,$tmp
239         vext.8  $tmp,$zero,$tmp,#12
240         veor    $in1,$in1,$tmp
241
242         veor    $in1,$in1,$key
243         b       .Loop256
244
245 .Ldone:
246         str     $rounds,[$out]
247         mov     $ptr,#0
248
249 .Lenc_key_abort:
250         mov     x0,$ptr                 // return value
251         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
252         ret
253 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
254
255 .globl  ${prefix}_set_decrypt_key
256 .type   ${prefix}_set_decrypt_key,%function
257 .align  5
258 ${prefix}_set_decrypt_key:
259 ___
# set_decrypt_key: per-flavour prologue, then build the encrypt schedule
# via .Lenc_key and convert it in place (reverse round-key order, apply
# aesimc to the inner keys).
260 $code.=<<___    if ($flavour =~ /64/);
261         stp     x29,x30,[sp,#-16]!
262         add     x29,sp,#0
263 ___
264 $code.=<<___    if ($flavour !~ /64/);
265         stmdb   sp!,{r4,lr}
266 ___
267 $code.=<<___;
268         bl      .Lenc_key
269
270         cmp     x0,#0
271         b.ne    .Ldec_key_abort
272
273         sub     $out,$out,#240          // restore original $out
274         mov     x4,#-16
275         add     $inp,$out,x12,lsl#4     // end of key schedule
276
277         vld1.32 {v0.16b},[$out]
278         vld1.32 {v1.16b},[$inp]
279         vst1.32 {v0.16b},[$inp],x4
280         vst1.32 {v1.16b},[$out],#16
281
282 .Loop_imc:
283         vld1.32 {v0.16b},[$out]
284         vld1.32 {v1.16b},[$inp]
285         aesimc  v0.16b,v0.16b
286         aesimc  v1.16b,v1.16b
287         vst1.32 {v0.16b},[$inp],x4
288         vst1.32 {v1.16b},[$out],#16
289         cmp     $inp,$out
290         b.hi    .Loop_imc
291
292         vld1.32 {v0.16b},[$out]
293         aesimc  v0.16b,v0.16b
294         vst1.32 {v0.16b},[$inp]
295
296         eor     x0,x0,x0                // return value
297 .Ldec_key_abort:
298 ___
# Per-flavour epilogues.
299 $code.=<<___    if ($flavour !~ /64/);
300         ldmia   sp!,{r4,pc}
301 ___
302 $code.=<<___    if ($flavour =~ /64/);
303         ldp     x29,x30,[sp],#16
304         ret
305 ___
306 $code.=<<___;
307 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
308 ___
309 }}}
310 {{{
# Emit a single-block routine: ${prefix}_encrypt or ${prefix}_decrypt.
# $dir is "en" or "de" and selects the aese/aesmc vs aesd/aesimc mnemonic
# pair; the two emitted routines are otherwise identical.
#
# NOTE(review): this sub was declared with an empty prototype,
# "sub gen_block ()", and then invoked as &gen_block("en") — the "&"
# call form exists solely to bypass the prototype check.  The prototype
# served no purpose and was a latent trap for any future plain call, so
# it is dropped and the calls are made in plain style; emitted code is
# unchanged.
311 sub gen_block {
312 my $dir = shift;
313 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
314 my ($inp,$out,$key)=map("x$_",(0..2));
315 my $rounds="w3";
316 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
317
# Rounds are consumed two at a time in .Loop_${dir}c, with the final two
# handled after the loop (last round has no MixColumns / InvMixColumns).
318 $code.=<<___;
319 .globl  ${prefix}_${dir}crypt
320 .type   ${prefix}_${dir}crypt,%function
321 .align  5
322 ${prefix}_${dir}crypt:
323         ldr     $rounds,[$key,#240]
324         vld1.32 {$rndkey0},[$key],#16
325         vld1.8  {$inout},[$inp]
326         sub     $rounds,$rounds,#2
327         vld1.32 {$rndkey1},[$key],#16

328
329 .Loop_${dir}c:
330         aes$e   $inout,$rndkey0
331         aes$mc  $inout,$inout
332         vld1.32 {$rndkey0},[$key],#16
333         subs    $rounds,$rounds,#2
334         aes$e   $inout,$rndkey1
335         aes$mc  $inout,$inout
336         vld1.32 {$rndkey1},[$key],#16
337         b.gt    .Loop_${dir}c

338
339         aes$e   $inout,$rndkey0
340         aes$mc  $inout,$inout
341         vld1.32 {$rndkey0},[$key]
342         aes$e   $inout,$rndkey1
343         veor    $inout,$inout,$rndkey0

344
345         vst1.8  {$inout},[$out]
346         ret
347 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
348 ___
349 }
350 gen_block("en");
351 gen_block("de");
352 }}}
# CBC mode: ${prefix}_cbc_encrypt handles both directions, selected by the
# fifth argument; decryption is parallelized three blocks at a time.
353 {{{
# Register assignments.  Note $rounds aliases $enc (w5) and $step1 aliases
# x12; the aliasing is deliberate, the values are live at different times.
354 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
355 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
356 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
357
358 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
359 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
360
361 ### q8-q15      preloaded key schedule
362
363 $code.=<<___;
364 .globl  ${prefix}_cbc_encrypt
365 .type   ${prefix}_cbc_encrypt,%function
366 .align  5
367 ${prefix}_cbc_encrypt:
368 ___
# Per-flavour prologues; the 32-bit one must save d8-d15 and fetch the
# stack-passed arguments.
369 $code.=<<___    if ($flavour =~ /64/);
370         stp     x29,x30,[sp,#-16]!
371         add     x29,sp,#0
372 ___
373 $code.=<<___    if ($flavour !~ /64/);
374         mov     ip,sp
375         stmdb   sp!,{r4-r8,lr}
376         vstmdb  sp!,{d8-d15}            @ ABI specification says so
377         ldmia   ip,{r4-r5}              @ load remaining args
378 ___
# Common setup: preload the last 7 round keys into q10-q15/$rndlast, then
# dispatch to .Lcbc_dec, the dedicated 128-bit encrypt path .Lcbc_enc128,
# or the generic encrypt loop.
379 $code.=<<___;
380         subs    $len,$len,#16
381         mov     $step,#16
382         b.lo    .Lcbc_abort
383         cclr    $step,eq
384
385         cmp     $enc,#0                 // en- or decrypting?
386         ldr     $rounds,[$key,#240]
387         and     $len,$len,#-16
388         vld1.8  {$ivec},[$ivp]
389         vld1.8  {$dat},[$inp],$step
390
391         vld1.32 {q8-q9},[$key]          // load key schedule...
392         sub     $rounds,$rounds,#6
393         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
394         sub     $rounds,$rounds,#2
395         vld1.32 {q10-q11},[$key_],#32
396         vld1.32 {q12-q13},[$key_],#32
397         vld1.32 {q14-q15},[$key_],#32
398         vld1.32 {$rndlast},[$key_]
399
400         add     $key_,$key,#32
401         mov     $cnt,$rounds
402         b.eq    .Lcbc_dec
403
404         cmp     $rounds,#2
405         veor    $dat,$dat,$ivec
406         veor    $rndzero_n_last,q8,$rndlast
407         b.eq    .Lcbc_enc128
408
409         vld1.32 {$in0-$in1},[$key_]
410         add     $key_,$key,#16
411         add     $key4,$key,#16*4
412         add     $key5,$key,#16*5
413         aese    $dat,q8
414         aesmc   $dat,$dat
415         add     $key6,$key,#16*6
416         add     $key7,$key,#16*7
417         b       .Lenter_cbc_enc
418
419 .align  4
420 .Loop_cbc_enc:
421         aese    $dat,q8
422         aesmc   $dat,$dat
423          vst1.8 {$ivec},[$out],#16
424 .Lenter_cbc_enc:
425         aese    $dat,q9
426         aesmc   $dat,$dat
427         aese    $dat,$in0
428         aesmc   $dat,$dat
429         vld1.32 {q8},[$key4]
430         cmp     $rounds,#4
431         aese    $dat,$in1
432         aesmc   $dat,$dat
433         vld1.32 {q9},[$key5]
434         b.eq    .Lcbc_enc192
435
436         aese    $dat,q8
437         aesmc   $dat,$dat
438         vld1.32 {q8},[$key6]
439         aese    $dat,q9
440         aesmc   $dat,$dat
441         vld1.32 {q9},[$key7]
442         nop
443
444 .Lcbc_enc192:
445         aese    $dat,q8
446         aesmc   $dat,$dat
447          subs   $len,$len,#16
448         aese    $dat,q9
449         aesmc   $dat,$dat
450          cclr   $step,eq
451         aese    $dat,q10
452         aesmc   $dat,$dat
453         aese    $dat,q11
454         aesmc   $dat,$dat
455          vld1.8 {q8},[$inp],$step
456         aese    $dat,q12
457         aesmc   $dat,$dat
458          veor   q8,q8,$rndzero_n_last
459         aese    $dat,q13
460         aesmc   $dat,$dat
461          vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
462         aese    $dat,q14
463         aesmc   $dat,$dat
464         aese    $dat,q15
465         veor    $ivec,$dat,$rndlast
466         b.hs    .Loop_cbc_enc
467
468         vst1.8  {$ivec},[$out],#16
469         b       .Lcbc_done
470
471 .align  5
472 .Lcbc_enc128:
473         vld1.32 {$in0-$in1},[$key_]
474         aese    $dat,q8
475         aesmc   $dat,$dat
476         b       .Lenter_cbc_enc128
477 .Loop_cbc_enc128:
478         aese    $dat,q8
479         aesmc   $dat,$dat
480          vst1.8 {$ivec},[$out],#16
481 .Lenter_cbc_enc128:
482         aese    $dat,q9
483         aesmc   $dat,$dat
484          subs   $len,$len,#16
485         aese    $dat,$in0
486         aesmc   $dat,$dat
487          cclr   $step,eq
488         aese    $dat,$in1
489         aesmc   $dat,$dat
490         aese    $dat,q10
491         aesmc   $dat,$dat
492         aese    $dat,q11
493         aesmc   $dat,$dat
494          vld1.8 {q8},[$inp],$step
495         aese    $dat,q12
496         aesmc   $dat,$dat
497         aese    $dat,q13
498         aesmc   $dat,$dat
499         aese    $dat,q14
500         aesmc   $dat,$dat
501          veor   q8,q8,$rndzero_n_last
502         aese    $dat,q15
503         veor    $ivec,$dat,$rndlast
504         b.hs    .Loop_cbc_enc128
505
506         vst1.8  {$ivec},[$out],#16
507         b       .Lcbc_done
508 ___
# Decrypt path: processes three blocks per iteration in .Loop3x_cbc_dec,
# with a tail for the final one or two blocks.
509 {
510 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
511 $code.=<<___;
512 .align  5
513 .Lcbc_dec:
514         vld1.8  {$dat2},[$inp],#16
515         subs    $len,$len,#32           // bias
516         add     $cnt,$rounds,#2
517         vorr    $in1,$dat,$dat
518         vorr    $dat1,$dat,$dat
519         vorr    $in2,$dat2,$dat2
520         b.lo    .Lcbc_dec_tail
521
522         vorr    $dat1,$dat2,$dat2
523         vld1.8  {$dat2},[$inp],#16
524         vorr    $in0,$dat,$dat
525         vorr    $in1,$dat1,$dat1
526         vorr    $in2,$dat2,$dat2
527
528 .Loop3x_cbc_dec:
529         aesd    $dat0,q8
530         aesimc  $dat0,$dat0
531         aesd    $dat1,q8
532         aesimc  $dat1,$dat1
533         aesd    $dat2,q8
534         aesimc  $dat2,$dat2
535         vld1.32 {q8},[$key_],#16
536         subs    $cnt,$cnt,#2
537         aesd    $dat0,q9
538         aesimc  $dat0,$dat0
539         aesd    $dat1,q9
540         aesimc  $dat1,$dat1
541         aesd    $dat2,q9
542         aesimc  $dat2,$dat2
543         vld1.32 {q9},[$key_],#16
544         b.gt    .Loop3x_cbc_dec
545
546         aesd    $dat0,q8
547         aesimc  $dat0,$dat0
548         aesd    $dat1,q8
549         aesimc  $dat1,$dat1
550         aesd    $dat2,q8
551         aesimc  $dat2,$dat2
552          veor   $tmp0,$ivec,$rndlast
553          subs   $len,$len,#0x30
554          veor   $tmp1,$in0,$rndlast
555          mov.lo x6,$len                 // x6, $cnt, is zero at this point
556         aesd    $dat0,q9
557         aesimc  $dat0,$dat0
558         aesd    $dat1,q9
559         aesimc  $dat1,$dat1
560         aesd    $dat2,q9
561         aesimc  $dat2,$dat2
562          veor   $tmp2,$in1,$rndlast
563          add    $inp,$inp,x6            // $inp is adjusted in such way that
564                                         // at exit from the loop $dat1-$dat2
565                                         // are loaded with last "words"
566          vorr   $ivec,$in2,$in2
567          mov    $key_,$key
568         aesd    $dat0,q12
569         aesimc  $dat0,$dat0
570         aesd    $dat1,q12
571         aesimc  $dat1,$dat1
572         aesd    $dat2,q12
573         aesimc  $dat2,$dat2
574          vld1.8 {$in0},[$inp],#16
575         aesd    $dat0,q13
576         aesimc  $dat0,$dat0
577         aesd    $dat1,q13
578         aesimc  $dat1,$dat1
579         aesd    $dat2,q13
580         aesimc  $dat2,$dat2
581          vld1.8 {$in1},[$inp],#16
582         aesd    $dat0,q14
583         aesimc  $dat0,$dat0
584         aesd    $dat1,q14
585         aesimc  $dat1,$dat1
586         aesd    $dat2,q14
587         aesimc  $dat2,$dat2
588          vld1.8 {$in2},[$inp],#16
589         aesd    $dat0,q15
590         aesd    $dat1,q15
591         aesd    $dat2,q15
592          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
593          add    $cnt,$rounds,#2
594         veor    $tmp0,$tmp0,$dat0
595         veor    $tmp1,$tmp1,$dat1
596         veor    $dat2,$dat2,$tmp2
597          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
598         vst1.8  {$tmp0},[$out],#16
599          vorr   $dat0,$in0,$in0
600         vst1.8  {$tmp1},[$out],#16
601          vorr   $dat1,$in1,$in1
602         vst1.8  {$dat2},[$out],#16
603          vorr   $dat2,$in2,$in2
604         b.hs    .Loop3x_cbc_dec
605
606         cmn     $len,#0x30
607         b.eq    .Lcbc_done
608         nop
609
610 .Lcbc_dec_tail:
611         aesd    $dat1,q8
612         aesimc  $dat1,$dat1
613         aesd    $dat2,q8
614         aesimc  $dat2,$dat2
615         vld1.32 {q8},[$key_],#16
616         subs    $cnt,$cnt,#2
617         aesd    $dat1,q9
618         aesimc  $dat1,$dat1
619         aesd    $dat2,q9
620         aesimc  $dat2,$dat2
621         vld1.32 {q9},[$key_],#16
622         b.gt    .Lcbc_dec_tail
623
624         aesd    $dat1,q8
625         aesimc  $dat1,$dat1
626         aesd    $dat2,q8
627         aesimc  $dat2,$dat2
628         aesd    $dat1,q9
629         aesimc  $dat1,$dat1
630         aesd    $dat2,q9
631         aesimc  $dat2,$dat2
632         aesd    $dat1,q12
633         aesimc  $dat1,$dat1
634         aesd    $dat2,q12
635         aesimc  $dat2,$dat2
636          cmn    $len,#0x20
637         aesd    $dat1,q13
638         aesimc  $dat1,$dat1
639         aesd    $dat2,q13
640         aesimc  $dat2,$dat2
641          veor   $tmp1,$ivec,$rndlast
642         aesd    $dat1,q14
643         aesimc  $dat1,$dat1
644         aesd    $dat2,q14
645         aesimc  $dat2,$dat2
646          veor   $tmp2,$in1,$rndlast
647         aesd    $dat1,q15
648         aesd    $dat2,q15
649         b.eq    .Lcbc_dec_one
650         veor    $tmp1,$tmp1,$dat1
651         veor    $tmp2,$tmp2,$dat2
652          vorr   $ivec,$in2,$in2
653         vst1.8  {$tmp1},[$out],#16
654         vst1.8  {$tmp2},[$out],#16
655         b       .Lcbc_done
656
657 .Lcbc_dec_one:
658         veor    $tmp1,$tmp1,$dat2
659          vorr   $ivec,$in2,$in2
660         vst1.8  {$tmp1},[$out],#16
661
662 .Lcbc_done:
663         vst1.8  {$ivec},[$ivp]
664 .Lcbc_abort:
665 ___
666 }
# Per-flavour epilogues.
667 $code.=<<___    if ($flavour !~ /64/);
668         vldmia  sp!,{d8-d15}
669         ldmia   sp!,{r4-r8,pc}
670 ___
671 $code.=<<___    if ($flavour =~ /64/);
672         ldr     x29,[sp],#16
673         ret
674 ___
675 $code.=<<___;
676 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
677 ___
678 }}}
# CTR mode: ${prefix}_ctr32_encrypt_blocks, three blocks per iteration with
# a one/two-block tail; only the low 32 bits of the counter are incremented.
679 {{{
680 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
681 my ($rounds,$cnt,$key_)=("w5","w6","x7");
682 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
683 my $step="x12";         # aliases with $tctr2
684
685 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
686 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
687
688 my ($dat,$tmp)=($dat0,$tmp0);
689
690 ### q8-q15      preloaded key schedule
691
692 $code.=<<___;
693 .globl  ${prefix}_ctr32_encrypt_blocks
694 .type   ${prefix}_ctr32_encrypt_blocks,%function
695 .align  5
696 ${prefix}_ctr32_encrypt_blocks:
697 ___
# Per-flavour prologues; 32-bit flavour saves d8-d15 and fetches the
# stack-passed fifth argument.
698 $code.=<<___    if ($flavour =~ /64/);
699         stp             x29,x30,[sp,#-16]!
700         add             x29,sp,#0
701 ___
702 $code.=<<___    if ($flavour !~ /64/);
703         mov             ip,sp
704         stmdb           sp!,{r4-r10,lr}
705         vstmdb          sp!,{d8-d15}            @ ABI specification says so
706         ldr             r4, [ip]                @ load remaining arg
707 ___
# Common body: preload last 5 round keys, keep the counter block in $ivec,
# and maintain three counter copies ($dat0-$dat2) through .Loop3x_ctr32.
708 $code.=<<___;
709         ldr             $rounds,[$key,#240]
710
711         ldr             $ctr, [$ivp, #12]
712         vld1.32         {$dat0},[$ivp]
713
714         vld1.32         {q8-q9},[$key]          // load key schedule...
715         sub             $rounds,$rounds,#4
716         mov             $step,#16
717         cmp             $len,#2
718         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
719         sub             $rounds,$rounds,#2
720         vld1.32         {q12-q13},[$key_],#32
721         vld1.32         {q14-q15},[$key_],#32
722         vld1.32         {$rndlast},[$key_]
723         add             $key_,$key,#32
724         mov             $cnt,$rounds
725         cclr            $step,lo
726 #ifndef __ARMEB__
727         rev             $ctr, $ctr
728 #endif
729         vorr            $dat1,$dat0,$dat0
730         add             $tctr1, $ctr, #1
731         vorr            $dat2,$dat0,$dat0
732         add             $ctr, $ctr, #2
733         vorr            $ivec,$dat0,$dat0
734         rev             $tctr1, $tctr1
735         vmov.32         ${dat1}[3],$tctr1
736         b.ls            .Lctr32_tail
737         rev             $tctr2, $ctr
738         sub             $len,$len,#3            // bias
739         vmov.32         ${dat2}[3],$tctr2
740         b               .Loop3x_ctr32
741
742 .align  4
743 .Loop3x_ctr32:
744         aese            $dat0,q8
745         aesmc           $dat0,$dat0
746         aese            $dat1,q8
747         aesmc           $dat1,$dat1
748         aese            $dat2,q8
749         aesmc           $dat2,$dat2
750         vld1.32         {q8},[$key_],#16
751         subs            $cnt,$cnt,#2
752         aese            $dat0,q9
753         aesmc           $dat0,$dat0
754         aese            $dat1,q9
755         aesmc           $dat1,$dat1
756         aese            $dat2,q9
757         aesmc           $dat2,$dat2
758         vld1.32         {q9},[$key_],#16
759         b.gt            .Loop3x_ctr32
760
761         aese            $dat0,q8
762         aesmc           $tmp0,$dat0
763         aese            $dat1,q8
764         aesmc           $tmp1,$dat1
765          vld1.8         {$in0},[$inp],#16
766          vorr           $dat0,$ivec,$ivec
767         aese            $dat2,q8
768         aesmc           $dat2,$dat2
769          vld1.8         {$in1},[$inp],#16
770          vorr           $dat1,$ivec,$ivec
771         aese            $tmp0,q9
772         aesmc           $tmp0,$tmp0
773         aese            $tmp1,q9
774         aesmc           $tmp1,$tmp1
775          vld1.8         {$in2},[$inp],#16
776          mov            $key_,$key
777         aese            $dat2,q9
778         aesmc           $tmp2,$dat2
779          vorr           $dat2,$ivec,$ivec
780          add            $tctr0,$ctr,#1
781         aese            $tmp0,q12
782         aesmc           $tmp0,$tmp0
783         aese            $tmp1,q12
784         aesmc           $tmp1,$tmp1
785          veor           $in0,$in0,$rndlast
786          add            $tctr1,$ctr,#2
787         aese            $tmp2,q12
788         aesmc           $tmp2,$tmp2
789          veor           $in1,$in1,$rndlast
790          add            $ctr,$ctr,#3
791         aese            $tmp0,q13
792         aesmc           $tmp0,$tmp0
793         aese            $tmp1,q13
794         aesmc           $tmp1,$tmp1
795          veor           $in2,$in2,$rndlast
796          rev            $tctr0,$tctr0
797         aese            $tmp2,q13
798         aesmc           $tmp2,$tmp2
799          vmov.32        ${dat0}[3], $tctr0
800          rev            $tctr1,$tctr1
801         aese            $tmp0,q14
802         aesmc           $tmp0,$tmp0
803         aese            $tmp1,q14
804         aesmc           $tmp1,$tmp1
805          vmov.32        ${dat1}[3], $tctr1
806          rev            $tctr2,$ctr
807         aese            $tmp2,q14
808         aesmc           $tmp2,$tmp2
809          vmov.32        ${dat2}[3], $tctr2
810          subs           $len,$len,#3
811         aese            $tmp0,q15
812         aese            $tmp1,q15
813         aese            $tmp2,q15
814
815         veor            $in0,$in0,$tmp0
816          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
817         vst1.8          {$in0},[$out],#16
818         veor            $in1,$in1,$tmp1
819          mov            $cnt,$rounds
820         vst1.8          {$in1},[$out],#16
821         veor            $in2,$in2,$tmp2
822          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
823         vst1.8          {$in2},[$out],#16
824         b.hs            .Loop3x_ctr32
825
826         adds            $len,$len,#3
827         b.eq            .Lctr32_done
828         cmp             $len,#1
829         mov             $step,#16
830         cclr            $step,eq
831
832 .Lctr32_tail:
833         aese            $dat0,q8
834         aesmc           $dat0,$dat0
835         aese            $dat1,q8
836         aesmc           $dat1,$dat1
837         vld1.32         {q8},[$key_],#16
838         subs            $cnt,$cnt,#2
839         aese            $dat0,q9
840         aesmc           $dat0,$dat0
841         aese            $dat1,q9
842         aesmc           $dat1,$dat1
843         vld1.32         {q9},[$key_],#16
844         b.gt            .Lctr32_tail
845
846         aese            $dat0,q8
847         aesmc           $dat0,$dat0
848         aese            $dat1,q8
849         aesmc           $dat1,$dat1
850         aese            $dat0,q9
851         aesmc           $dat0,$dat0
852         aese            $dat1,q9
853         aesmc           $dat1,$dat1
854          vld1.8         {$in0},[$inp],$step
855         aese            $dat0,q12
856         aesmc           $dat0,$dat0
857         aese            $dat1,q12
858         aesmc           $dat1,$dat1
859          vld1.8         {$in1},[$inp]
860         aese            $dat0,q13
861         aesmc           $dat0,$dat0
862         aese            $dat1,q13
863         aesmc           $dat1,$dat1
864          veor           $in0,$in0,$rndlast
865         aese            $dat0,q14
866         aesmc           $dat0,$dat0
867         aese            $dat1,q14
868         aesmc           $dat1,$dat1
869          veor           $in1,$in1,$rndlast
870         aese            $dat0,q15
871         aese            $dat1,q15
872
873         cmp             $len,#1
874         veor            $in0,$in0,$dat0
875         veor            $in1,$in1,$dat1
876         vst1.8          {$in0},[$out],#16
877         b.eq            .Lctr32_done
878         vst1.8          {$in1},[$out]
879
880 .Lctr32_done:
881 ___
# Per-flavour epilogues.
882 $code.=<<___    if ($flavour !~ /64/);
883         vldmia          sp!,{d8-d15}
884         ldmia           sp!,{r4-r10,pc}
885 ___
886 $code.=<<___    if ($flavour =~ /64/);
887         ldr             x29,[sp],#16
888         ret
889 ___
890 $code.=<<___;
891 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
892 ___
893 }}}
# Close the __ARM_MAX_ARCH__>=7 guard opened in the prologue.
894 $code.=<<___;
895 #endif
896 ___
# Post-process the accumulated $code and print it: the mixed 32-/64-bit
# mnemonics above are transliterated to pure 64-bit (first branch) or pure
# 32-bit (second branch) syntax.  The "or"-chained substitutions apply at
# most one rewrite per line — their order is significant.
897 ########################################
898 if ($flavour =~ /64/) {                 ######## 64-bit code
899     my %opcode = (
900         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
901         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
902
    # Encode an AES instruction as a raw .inst word for assemblers that
    # lack the crypto mnemonics (currently disabled below).
903     local *unaes = sub {
904         my ($mnemonic,$arg)=@_;
905
906         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
907         sprintf ".inst\t0x%08x\t//%s %s",
908                         $opcode{$mnemonic}|$1|($2<<5),
909                         $mnemonic,$arg;
910     };
911
912     foreach(split("\n",$code)) {
913         s/\`([^\`]*)\`/eval($1)/geo;
914
915         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
916         s/@\s/\/\//o;                   # old->new style commentary
917
918         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
919         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
920         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
921         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
922         s/vext\.8/ext/o         or
923         s/vrev32\.8/rev32/o     or
924         s/vtst\.8/cmtst/o       or
925         s/vshr/ushr/o           or
926         s/^(\s+)v/$1/o          or      # strip off v prefix
927         s/\bbx\s+lr\b/ret/o;

928
929         # fix up remaining legacy suffixes
930         s/\.[ui]?8//o;
931         m/\],#8/o and s/\.16b/\.8b/go;
932         s/\.[ui]?32//o and s/\.16b/\.4s/go;
933         s/\.[ui]?64//o and s/\.16b/\.2d/go;
934         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

935
936         print $_,"\n";
937     }
938 } else {                                ######## 32-bit code
939     my %opcode = (
940         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
941         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
942
    # Encode an AES instruction as raw .byte data for old 32-bit assemblers.
943     local *unaes = sub {
944         my ($mnemonic,$arg)=@_;
945
946         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
947             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
948                                          |(($2&7)<<1) |(($2&8)<<2);
949             # since ARMv7 instructions are always encoded little-endian.
950             # correct solution is to use .inst directive, but older
951             # assemblers don't implement it:-(
952             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
953                         $word&0xff,($word>>8)&0xff,
954                         ($word>>16)&0xff,($word>>24)&0xff,
955                         $mnemonic,$arg;
956         }
957     };
958
    # vtbl with a q-register index becomes two d-register vtbl.8 ops.
959     sub unvtbl {
960         my $arg=shift;
961
962         $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
963         sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
964                 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 
965     }
966
    # 64-bit lane splat -> 32-bit vdup from the corresponding d-register lane.
967     sub unvdup32 {
968         my $arg=shift;
969
970         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
971         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;      
972     }
973
    # 64-bit lane move -> 32-bit vmov into the corresponding d-register lane.
974     sub unvmov32 {
975         my $arg=shift;
976
977         $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
978         sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;       
979     }
980
981     foreach(split("\n",$code)) {
982         s/\`([^\`]*)\`/eval($1)/geo;
983
984         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
985         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
986         s/\/\/\s?/@ /o;                         # new->old style commentary
987
988         # fix up remaining new-style suffixes
989         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
990         s/\],#[0-9]+/]!/o;

991
992         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
993         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
994         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
995         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
996         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
997         s/^(\s+)b\./$1b/o                               or
998         s/^(\s+)mov\./$1mov/o                           or
999         s/^(\s+)ret/$1bx\tlr/o;

1000
1001         print $_,"\n";
1002     }
1003 }
1004
# STDOUT is a pipe into arm-xlate.pl (see the open above); an unchecked
# close would silently discard buffered-write errors and a non-zero exit
# of the child, producing a truncated .S file without any diagnostic.
1005 close STDOUT or die "error closing STDOUT: $!";