2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
32 # Key to performance of parallelize-able modes is round instruction
33 # interleaving. But which factor to use? There is optimal one for
34 # each combination of instruction latency and issue rate, beyond
35 # which increasing interleave factor doesn't pay off. While on cons
36 # side we have code size increase and resource waste on platforms for
37 # which interleave factor is too high. In other words you want it to
38 # be just right. So far interleave factor of 3x was serving well all
39 # platforms. But for ThunderX2 optimal interleave factor was measured
42 # Performance in cycles per byte processed with 128-bit key:
45 # Apple A7 2.39 1.20 1.20
46 # Cortex-A53 1.32 1.17/1.29(**) 1.36/1.46
47 # Cortex-A57(*) 1.95 0.82/0.85 0.89/0.93
48 # Cortex-A72 1.33 0.85/0.88 0.92/0.96
49 # Denver 1.96 0.65/0.86 0.76/0.80
50 # Mongoose 1.33 1.23/1.20 1.30/1.20
51 # Kryo 1.26 0.87/0.94 1.00/1.00
52 # ThunderX2 5.95 1.25 1.30
54 # (*) original 3.64/1.34/1.32 results were for r0p0 revision
55 # and are still same even for updated module;
56 # (**) numbers after slash are for 32-bit code, which is 3x-
59 # $output is the last argument if it looks like a file (it has an extension)
60 # $flavour is the first argument if it doesn't look like a file
61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
# Locate the perlasm translator arm-xlate.pl: first next to this script,
# then in the shared ../../perlasm directory; fail hard if neither exists.
64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
65 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
67 die "can't locate arm-xlate.pl";
# Pipe everything printed to OUT through arm-xlate.pl, passing $flavour
# (target assembler dialect) and $output along on its command line.
69 open OUT,"| \"$^X\" $xlate $flavour \"$output\""
70 or die "can't call $xlate: $!";
75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
80 #if __ARM_MAX_ARCH__>=7
82 $code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
83 $code.=<<___ if ($flavour !~ /64/);
84 .arch armv7-a // don't confuse not-so-latest binutils with argv8 :-)
89 # define INST(a,b,c,d) $_byte c,d|0xc,a,b
92 # define INST(a,b,c,d) $_byte a,b,c,d
98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
100 # maintain both 32- and 64-bit codes within single module and
101 # transliterate common code to either flavour with regex voodoo.
# Integer argument/scratch registers for the key-setup code (AArch64 names;
# they are rewritten to r-registers for the 32-bit flavour in post-processing).
104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
# NEON scratch registers: q0-q6 in 64-bit mode, q0-q3/q8-q10 in 32-bit mode
# (different sets so each flavour's register constraints are respected).
105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
106 $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
112 .long 0x01,0x01,0x01,0x01
113 .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat
114 .long 0x1b,0x1b,0x1b,0x1b
116 .globl ${prefix}_set_encrypt_key
117 .type ${prefix}_set_encrypt_key,%function
119 ${prefix}_set_encrypt_key:
122 $code.=<<___ if ($flavour =~ /64/);
123 stp x29,x30,[sp,#-16]!
143 veor $zero,$zero,$zero
144 vld1.8 {$in0},[$inp],#16
145 mov $bits,#8 // reuse $bits
146 vld1.32 {$rcon,$mask},[$ptr],#32
154 vtbl.8 $key,{$in0},$mask
155 vext.8 $tmp,$zero,$in0,#12
156 vst1.32 {$in0},[$out],#16
161 vext.8 $tmp,$zero,$tmp,#12
163 vext.8 $tmp,$zero,$tmp,#12
166 vshl.u8 $rcon,$rcon,#1
170 vld1.32 {$rcon},[$ptr]
172 vtbl.8 $key,{$in0},$mask
173 vext.8 $tmp,$zero,$in0,#12
174 vst1.32 {$in0},[$out],#16
178 vext.8 $tmp,$zero,$tmp,#12
180 vext.8 $tmp,$zero,$tmp,#12
183 vshl.u8 $rcon,$rcon,#1
186 vtbl.8 $key,{$in0},$mask
187 vext.8 $tmp,$zero,$in0,#12
188 vst1.32 {$in0},[$out],#16
192 vext.8 $tmp,$zero,$tmp,#12
194 vext.8 $tmp,$zero,$tmp,#12
198 vst1.32 {$in0},[$out]
206 vld1.8 {$in1},[$inp],#8
207 vmov.i8 $key,#8 // borrow $key
208 vst1.32 {$in0},[$out],#16
209 vsub.i8 $mask,$mask,$key // adjust the mask
212 vtbl.8 $key,{$in1},$mask
213 vext.8 $tmp,$zero,$in0,#12
214 vst1.32 {$in1},[$out],#8
219 vext.8 $tmp,$zero,$tmp,#12
221 vext.8 $tmp,$zero,$tmp,#12
224 vdup.32 $tmp,${in0}[3]
227 vext.8 $in1,$zero,$in1,#12
228 vshl.u8 $rcon,$rcon,#1
232 vst1.32 {$in0},[$out],#16
244 vst1.32 {$in0},[$out],#16
247 vtbl.8 $key,{$in1},$mask
248 vext.8 $tmp,$zero,$in0,#12
249 vst1.32 {$in1},[$out],#16
254 vext.8 $tmp,$zero,$tmp,#12
256 vext.8 $tmp,$zero,$tmp,#12
259 vshl.u8 $rcon,$rcon,#1
261 vst1.32 {$in0},[$out],#16
264 vdup.32 $key,${in0}[3] // just splat
265 vext.8 $tmp,$zero,$in1,#12
269 vext.8 $tmp,$zero,$tmp,#12
271 vext.8 $tmp,$zero,$tmp,#12
282 mov x0,$ptr // return value
283 `"ldr x29,[sp],#16" if ($flavour =~ /64/)`
285 .size ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
287 .globl ${prefix}_set_decrypt_key
288 .type ${prefix}_set_decrypt_key,%function
290 ${prefix}_set_decrypt_key:
292 $code.=<<___ if ($flavour =~ /64/);
293 .inst 0xd503233f // paciasp
294 stp x29,x30,[sp,#-16]!
297 $code.=<<___ if ($flavour !~ /64/);
306 sub $out,$out,#240 // restore original $out
308 add $inp,$out,x12,lsl#4 // end of key schedule
310 vld1.32 {v0.16b},[$out]
311 vld1.32 {v1.16b},[$inp]
312 vst1.32 {v0.16b},[$inp],x4
313 vst1.32 {v1.16b},[$out],#16
316 vld1.32 {v0.16b},[$out]
317 vld1.32 {v1.16b},[$inp]
320 vst1.32 {v0.16b},[$inp],x4
321 vst1.32 {v1.16b},[$out],#16
325 vld1.32 {v0.16b},[$out]
327 vst1.32 {v0.16b},[$inp]
329 eor x0,x0,x0 // return value
332 $code.=<<___ if ($flavour !~ /64/);
335 $code.=<<___ if ($flavour =~ /64/);
337 .inst 0xd50323bf // autiasp
341 .size ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
# Per-direction instruction suffixes and registers for the one-shot
# en/decrypt routine: "aese"/"aesmc" when encrypting, "aesd"/"aesimc" when
# decrypting.  NOTE(review): $dir is presumably set by an enclosing loop
# over "en"/"de" that is not visible in this chunk -- confirm in full file.
347 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
348 my ($inp,$out,$key)=map("x$_",(0..2));
350 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
353 .globl ${prefix}_${dir}crypt
354 .type ${prefix}_${dir}crypt,%function
356 ${prefix}_${dir}crypt:
357 ldr $rounds,[$key,#240]
358 vld1.32 {$rndkey0},[$key],#16
359 vld1.8 {$inout},[$inp]
360 sub $rounds,$rounds,#2
361 vld1.32 {$rndkey1},[$key],#16
364 aes$e $inout,$rndkey0
366 vld1.32 {$rndkey0},[$key],#16
367 subs $rounds,$rounds,#2
368 aes$e $inout,$rndkey1
370 vld1.32 {$rndkey1},[$key],#16
373 aes$e $inout,$rndkey0
375 vld1.32 {$rndkey0},[$key]
376 aes$e $inout,$rndkey1
377 veor $inout,$inout,$rndkey0
379 vst1.8 {$inout},[$out]
381 .size ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
388 # Performance in cycles per byte.
389 # Processed with AES-ECB different key size.
390 # It shows the value before and after optimization as below:
393 # AES-128-ECB AES-192-ECB AES-256-ECB
394 # Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10
395 # Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14
397 # Optimization is implemented by loop unrolling and interleaving.
398 # Commonly, we choose the unrolling factor as 5, if the input
399 # data size smaller than 5 blocks, but not smaller than 3 blocks,
400 # choose 3 as the unrolling factor.
401 # If the input data size dsize >= 5*16 bytes, then take 5 blocks
402 # as one iteration, every loop the left size lsize -= 5*16.
403 # If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration,
404 # every loop lsize -=3*16.
405 # If lsize < 3*16 bytes, treat them as the tail, interleave the
406 # two blocks AES instructions.
407 # There is one special case, if the original input data size dsize
408 # = 16 bytes, we will treat it separately to improve the
409 # performance: one independent code block without LR, FP load and
410 # store, just looks like what the original ECB implementation does.
# Register aliases for the ECB code path: integer arguments/counters first,
# then the NEON data/temporary registers used by the unrolled round loops.
413 my ($inp,$out,$len,$key)=map("x$_",(0..3));
414 my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
415 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
# Convenience aliases reusing the registers above for the single-block path.
417 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
419 ### q7 last round key
420 ### q10-q15 q7 Last 7 round keys
421 ### q8-q9 preloaded round keys except last 7 keys for big size
422 ### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
425 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
427 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
428 my ($dat4,$in4,$tmp4);
429 if ($flavour =~ /64/) {
430 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
434 .globl ${prefix}_ecb_encrypt
435 .type ${prefix}_ecb_encrypt,%function
437 ${prefix}_ecb_encrypt:
439 $code.=<<___ if ($flavour =~ /64/);
441 // Original input data size bigger than 16, jump to big size processing.
443 vld1.8 {$dat0},[$inp]
444 cmp $enc,#0 // en- or decrypting?
445 ldr $rounds,[$key,#240]
446 vld1.32 {q5-q6},[$key],#32 // load key schedule...
451 vld1.32 {q8-q9},[$key],#32 // load key schedule...
454 subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
459 vld1.32 {q8},[$key],#16 // load key schedule...
462 vld1.32 {q9},[$key],#16 // load key schedule...
463 subs $rounds,$rounds,#2 // bias
464 b.gt .Lecb_round_loop
466 vld1.32 {q10-q11},[$key],#32 // load key schedule...
471 vld1.32 {q12-q13},[$key],#32 // load key schedule...
476 vld1.32 {q14-q15},[$key],#32 // load key schedule...
481 vld1.32 {$rndlast},[$key]
485 veor $dat0,$dat0,$rndlast
486 vst1.8 {$dat0},[$out]
491 vld1.32 {q8-q9},[$key],#32 // load key schedule...
494 subs $rounds,$rounds,#10 // bias
496 .Lecb_dec_round_loop:
499 vld1.32 {q8},[$key],#16 // load key schedule...
502 vld1.32 {q9},[$key],#16 // load key schedule...
503 subs $rounds,$rounds,#2 // bias
504 b.gt .Lecb_dec_round_loop
506 vld1.32 {q10-q11},[$key],#32 // load key schedule...
511 vld1.32 {q12-q13},[$key],#32 // load key schedule...
516 vld1.32 {q14-q15},[$key],#32 // load key schedule...
521 vld1.32 {$rndlast},[$key]
525 veor $dat0,$dat0,$rndlast
526 vst1.8 {$dat0},[$out]
530 $code.=<<___ if ($flavour =~ /64/);
531 stp x29,x30,[sp,#-16]!
534 $code.=<<___ if ($flavour !~ /64/);
537 vstmdb sp!,{d8-d15} @ ABI specification says so
538 ldmia ip,{r4-r5} @ load remaining args
546 cmp $enc,#0 // en- or decrypting?
547 ldr $rounds,[$key,#240]
549 vld1.8 {$dat},[$inp],$step
551 vld1.32 {q8-q9},[$key] // load key schedule...
552 sub $rounds,$rounds,#6
553 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
554 sub $rounds,$rounds,#2
555 vld1.32 {q10-q11},[$key_],#32
556 vld1.32 {q12-q13},[$key_],#32
557 vld1.32 {q14-q15},[$key_],#32
558 vld1.32 {$rndlast},[$key_]
564 vld1.8 {$dat1},[$inp],#16
565 subs $len,$len,#32 // bias
567 vorr $in1,$dat1,$dat1
568 vorr $dat2,$dat1,$dat1
573 vld1.8 {$dat2},[$inp],#16
575 $code.=<<___ if ($flavour =~ /64/);
579 vld1.8 {$dat3},[$inp],#16
580 vld1.8 {$dat4},[$inp],#16
581 sub $len,$len,#32 // bias
595 vld1.32 {q8},[$key_],#16
607 vld1.32 {q9},[$key_],#16
620 cmp $len,#0x40 // because .Lecb_enc_tail4x
633 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
646 add $inp,$inp,x6 // $inp is adjusted in such way that
647 // at exit from the loop $dat1-$dat4
648 // are loaded with last "words"
649 add x6,$len,#0x60 // because .Lecb_enc_tail4x
696 vld1.8 {$in0},[$inp],#16
698 vld1.8 {$in1},[$inp],#16
700 vld1.8 {$in2},[$inp],#16
702 vld1.8 {$in3},[$inp],#16
704 vld1.8 {$in4},[$inp],#16
705 cbz x6,.Lecb_enc_tail4x
706 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
707 veor $tmp0,$rndlast,$dat0
709 veor $tmp1,$rndlast,$dat1
711 veor $tmp2,$rndlast,$dat2
713 veor $tmp3,$rndlast,$dat3
715 veor $tmp4,$rndlast,$dat4
716 vst1.8 {$tmp0},[$out],#16
718 vst1.8 {$tmp1},[$out],#16
720 vst1.8 {$tmp2},[$out],#16
721 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
722 vst1.8 {$tmp3},[$out],#16
723 vst1.8 {$tmp4},[$out],#16
740 veor $tmp1,$rndlast,$dat1
741 veor $tmp2,$rndlast,$dat2
742 veor $tmp3,$rndlast,$dat3
743 veor $tmp4,$rndlast,$dat4
744 vst1.8 {$tmp1},[$out],#16
745 vst1.8 {$tmp2},[$out],#16
746 vst1.8 {$tmp3},[$out],#16
747 vst1.8 {$tmp4},[$out],#16
760 vld1.32 {q8},[$key_],#16
768 vld1.32 {q9},[$key_],#16
778 mov.lo x6,$len // x6, $cnt, is zero at this point
785 add $inp,$inp,x6 // $inp is adjusted in such way that
786 // at exit from the loop $dat1-$dat2
787 // are loaded with last "words"
795 vld1.8 {$in0},[$inp],#16
802 vld1.8 {$in1},[$inp],#16
809 vld1.8 {$in2},[$inp],#16
813 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
815 veor $tmp0,$rndlast,$dat0
816 veor $tmp1,$rndlast,$dat1
817 veor $dat2,$dat2,$rndlast
818 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
819 vst1.8 {$tmp0},[$out],#16
821 vst1.8 {$tmp1},[$out],#16
823 vst1.8 {$dat2},[$out],#16
836 vld1.32 {q8},[$key_],#16
842 vld1.32 {q9},[$key_],#16
869 veor $tmp1,$rndlast,$dat1
870 veor $tmp2,$rndlast,$dat2
871 vst1.8 {$tmp1},[$out],#16
872 vst1.8 {$tmp2},[$out],#16
876 veor $tmp1,$rndlast,$dat2
877 vst1.8 {$tmp1},[$out],#16
884 vld1.8 {$dat1},[$inp],#16
885 subs $len,$len,#32 // bias
887 vorr $in1,$dat1,$dat1
888 vorr $dat2,$dat1,$dat1
893 vld1.8 {$dat2},[$inp],#16
895 $code.=<<___ if ($flavour =~ /64/);
899 vld1.8 {$dat3},[$inp],#16
900 vld1.8 {$dat4},[$inp],#16
901 sub $len,$len,#32 // bias
915 vld1.32 {q8},[$key_],#16
927 vld1.32 {q9},[$key_],#16
940 cmp $len,#0x40 // because .Lecb_tail4x
953 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
966 add $inp,$inp,x6 // $inp is adjusted in such way that
967 // at exit from the loop $dat1-$dat4
968 // are loaded with last "words"
969 add x6,$len,#0x60 // because .Lecb_tail4x
1016 vld1.8 {$in0},[$inp],#16
1018 vld1.8 {$in1},[$inp],#16
1020 vld1.8 {$in2},[$inp],#16
1022 vld1.8 {$in3},[$inp],#16
1024 vld1.8 {$in4},[$inp],#16
1026 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1027 veor $tmp0,$rndlast,$dat0
1028 vorr $dat0,$in0,$in0
1029 veor $tmp1,$rndlast,$dat1
1030 vorr $dat1,$in1,$in1
1031 veor $tmp2,$rndlast,$dat2
1032 vorr $dat2,$in2,$in2
1033 veor $tmp3,$rndlast,$dat3
1034 vorr $dat3,$in3,$in3
1035 veor $tmp4,$rndlast,$dat4
1036 vst1.8 {$tmp0},[$out],#16
1037 vorr $dat4,$in4,$in4
1038 vst1.8 {$tmp1},[$out],#16
1040 vst1.8 {$tmp2},[$out],#16
1041 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1042 vst1.8 {$tmp3},[$out],#16
1043 vst1.8 {$tmp4},[$out],#16
1044 b.hs .Loop5x_ecb_dec
1050 subs $len,$len,#0x30
1051 vorr $dat0,$in2,$in2
1052 vorr $dat1,$in3,$in3
1053 vorr $dat2,$in4,$in4
1060 veor $tmp1,$rndlast,$dat1
1061 veor $tmp2,$rndlast,$dat2
1062 veor $tmp3,$rndlast,$dat3
1063 veor $tmp4,$rndlast,$dat4
1064 vst1.8 {$tmp1},[$out],#16
1065 vst1.8 {$tmp2},[$out],#16
1066 vst1.8 {$tmp3},[$out],#16
1067 vst1.8 {$tmp4},[$out],#16
1080 vld1.32 {q8},[$key_],#16
1088 vld1.32 {q9},[$key_],#16
1089 b.gt .Loop3x_ecb_dec
1097 subs $len,$len,#0x30
1098 mov.lo x6,$len // x6, $cnt, is zero at this point
1105 add $inp,$inp,x6 // $inp is adjusted in such way that
1106 // at exit from the loop $dat1-$dat2
1107 // are loaded with last "words"
1115 vld1.8 {$in0},[$inp],#16
1122 vld1.8 {$in1},[$inp],#16
1129 vld1.8 {$in2},[$inp],#16
1133 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1135 veor $tmp0,$rndlast,$dat0
1136 veor $tmp1,$rndlast,$dat1
1137 veor $dat2,$dat2,$rndlast
1138 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1139 vst1.8 {$tmp0},[$out],#16
1140 vorr $dat0,$in0,$in0
1141 vst1.8 {$tmp1},[$out],#16
1142 vorr $dat1,$in1,$in1
1143 vst1.8 {$dat2},[$out],#16
1144 vorr $dat2,$in2,$in2
1145 b.hs .Loop3x_ecb_dec
1156 vld1.32 {q8},[$key_],#16
1162 vld1.32 {q9},[$key_],#16
1189 veor $tmp1,$rndlast,$dat1
1190 veor $tmp2,$rndlast,$dat2
1191 vst1.8 {$tmp1},[$out],#16
1192 vst1.8 {$tmp2},[$out],#16
1196 veor $tmp1,$rndlast,$dat2
1197 vst1.8 {$tmp1},[$out],#16
1202 $code.=<<___ if ($flavour !~ /64/);
1204 ldmia sp!,{r4-r8,pc}
1206 $code.=<<___ if ($flavour =~ /64/);
1209 $code.=<<___ if ($flavour =~ /64/);
1214 .size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
# Register aliases for the CBC code path.  Note $rounds shares w5 with $enc
# and $step1 shares x12 with $key5 -- their live ranges must not overlap.
1218 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
1219 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
1220 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
# Convenience aliases reusing the registers above for the single-block path.
1222 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
# Integer pointers to round keys 4..7 ($key7 aliases the $key argument).
1223 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
1225 ### q8-q15 preloaded key schedule
1228 .globl ${prefix}_cbc_encrypt
1229 .type ${prefix}_cbc_encrypt,%function
1231 ${prefix}_cbc_encrypt:
1233 $code.=<<___ if ($flavour =~ /64/);
1234 stp x29,x30,[sp,#-16]!
1237 $code.=<<___ if ($flavour !~ /64/);
1239 stmdb sp!,{r4-r8,lr}
1240 vstmdb sp!,{d8-d15} @ ABI specification says so
1241 ldmia ip,{r4-r5} @ load remaining args
1249 cmp $enc,#0 // en- or decrypting?
1250 ldr $rounds,[$key,#240]
1252 vld1.8 {$ivec},[$ivp]
1253 vld1.8 {$dat},[$inp],$step
1255 vld1.32 {q8-q9},[$key] // load key schedule...
1256 sub $rounds,$rounds,#6
1257 add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
1258 sub $rounds,$rounds,#2
1259 vld1.32 {q10-q11},[$key_],#32
1260 vld1.32 {q12-q13},[$key_],#32
1261 vld1.32 {q14-q15},[$key_],#32
1262 vld1.32 {$rndlast},[$key_]
1269 veor $dat,$dat,$ivec
1270 veor $rndzero_n_last,q8,$rndlast
1273 vld1.32 {$in0-$in1},[$key_]
1275 add $key4,$key,#16*4
1276 add $key5,$key,#16*5
1279 add $key6,$key,#16*6
1280 add $key7,$key,#16*7
1287 vst1.8 {$ivec},[$out],#16
1293 vld1.32 {q8},[$key4]
1297 vld1.32 {q9},[$key5]
1302 vld1.32 {q8},[$key6]
1305 vld1.32 {q9},[$key7]
1319 vld1.8 {q8},[$inp],$step
1322 veor q8,q8,$rndzero_n_last
1325 vld1.32 {q9},[$key_] // re-pre-load rndkey[1]
1329 veor $ivec,$dat,$rndlast
1332 vst1.8 {$ivec},[$out],#16
1337 vld1.32 {$in0-$in1},[$key_]
1340 b .Lenter_cbc_enc128
1344 vst1.8 {$ivec},[$out],#16
1358 vld1.8 {q8},[$inp],$step
1365 veor q8,q8,$rndzero_n_last
1367 veor $ivec,$dat,$rndlast
1368 b.hs .Loop_cbc_enc128
1370 vst1.8 {$ivec},[$out],#16
1374 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1376 my ($dat3,$in3,$tmp3); # used only in 64-bit mode
1377 my ($dat4,$in4,$tmp4);
1378 if ($flavour =~ /64/) {
1379 ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
1385 vld1.8 {$dat2},[$inp],#16
1386 subs $len,$len,#32 // bias
1389 vorr $dat1,$dat,$dat
1390 vorr $in2,$dat2,$dat2
1393 vorr $dat1,$dat2,$dat2
1394 vld1.8 {$dat2},[$inp],#16
1396 vorr $in1,$dat1,$dat1
1397 vorr $in2,$dat2,$dat2
1399 $code.=<<___ if ($flavour =~ /64/);
1401 b.lo .Loop3x_cbc_dec
1403 vld1.8 {$dat3},[$inp],#16
1404 vld1.8 {$dat4},[$inp],#16
1405 sub $len,$len,#32 // bias
1407 vorr $in3,$dat3,$dat3
1408 vorr $in4,$dat4,$dat4
1421 vld1.32 {q8},[$key_],#16
1433 vld1.32 {q9},[$key_],#16
1434 b.gt .Loop5x_cbc_dec
1446 cmp $len,#0x40 // because .Lcbc_tail4x
1459 csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo
1472 add $inp,$inp,x6 // $inp is adjusted in such way that
1473 // at exit from the loop $dat1-$dat4
1474 // are loaded with last "words"
1475 add x6,$len,#0x60 // because .Lcbc_tail4x
1521 veor $tmp0,$ivec,$rndlast
1523 veor $tmp1,$in0,$rndlast
1524 vld1.8 {$in0},[$inp],#16
1526 veor $tmp2,$in1,$rndlast
1527 vld1.8 {$in1},[$inp],#16
1529 veor $tmp3,$in2,$rndlast
1530 vld1.8 {$in2},[$inp],#16
1532 veor $tmp4,$in3,$rndlast
1533 vld1.8 {$in3},[$inp],#16
1535 vorr $ivec,$in4,$in4
1536 vld1.8 {$in4},[$inp],#16
1538 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1539 veor $tmp0,$tmp0,$dat0
1540 vorr $dat0,$in0,$in0
1541 veor $tmp1,$tmp1,$dat1
1542 vorr $dat1,$in1,$in1
1543 veor $tmp2,$tmp2,$dat2
1544 vorr $dat2,$in2,$in2
1545 veor $tmp3,$tmp3,$dat3
1546 vorr $dat3,$in3,$in3
1547 veor $tmp4,$tmp4,$dat4
1548 vst1.8 {$tmp0},[$out],#16
1549 vorr $dat4,$in4,$in4
1550 vst1.8 {$tmp1},[$out],#16
1552 vst1.8 {$tmp2},[$out],#16
1553 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1554 vst1.8 {$tmp3},[$out],#16
1555 vst1.8 {$tmp4},[$out],#16
1556 b.hs .Loop5x_cbc_dec
1562 subs $len,$len,#0x30
1563 vorr $dat0,$in2,$in2
1565 vorr $dat1,$in3,$in3
1567 vorr $dat2,$in4,$in4
1575 veor $tmp1,$tmp0,$dat1
1576 veor $tmp2,$tmp2,$dat2
1577 veor $tmp3,$tmp3,$dat3
1578 veor $tmp4,$tmp4,$dat4
1579 vst1.8 {$tmp1},[$out],#16
1580 vst1.8 {$tmp2},[$out],#16
1581 vst1.8 {$tmp3},[$out],#16
1582 vst1.8 {$tmp4},[$out],#16
1595 vld1.32 {q8},[$key_],#16
1603 vld1.32 {q9},[$key_],#16
1604 b.gt .Loop3x_cbc_dec
1612 veor $tmp0,$ivec,$rndlast
1613 subs $len,$len,#0x30
1614 veor $tmp1,$in0,$rndlast
1615 mov.lo x6,$len // x6, $cnt, is zero at this point
1622 veor $tmp2,$in1,$rndlast
1623 add $inp,$inp,x6 // $inp is adjusted in such way that
1624 // at exit from the loop $dat1-$dat2
1625 // are loaded with last "words"
1626 vorr $ivec,$in2,$in2
1634 vld1.8 {$in0},[$inp],#16
1641 vld1.8 {$in1},[$inp],#16
1648 vld1.8 {$in2},[$inp],#16
1652 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1654 veor $tmp0,$tmp0,$dat0
1655 veor $tmp1,$tmp1,$dat1
1656 veor $dat2,$dat2,$tmp2
1657 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1658 vst1.8 {$tmp0},[$out],#16
1659 vorr $dat0,$in0,$in0
1660 vst1.8 {$tmp1},[$out],#16
1661 vorr $dat1,$in1,$in1
1662 vst1.8 {$dat2},[$out],#16
1663 vorr $dat2,$in2,$in2
1664 b.hs .Loop3x_cbc_dec
1675 vld1.32 {q8},[$key_],#16
1681 vld1.32 {q9},[$key_],#16
1701 veor $tmp1,$ivec,$rndlast
1706 veor $tmp2,$in1,$rndlast
1710 veor $tmp1,$tmp1,$dat1
1711 veor $tmp2,$tmp2,$dat2
1712 vorr $ivec,$in2,$in2
1713 vst1.8 {$tmp1},[$out],#16
1714 vst1.8 {$tmp2},[$out],#16
1718 veor $tmp1,$tmp1,$dat2
1719 vorr $ivec,$in2,$in2
1720 vst1.8 {$tmp1},[$out],#16
1723 vst1.8 {$ivec},[$ivp]
1727 $code.=<<___ if ($flavour !~ /64/);
1729 ldmia sp!,{r4-r8,pc}
1731 $code.=<<___ if ($flavour =~ /64/);
1736 .size ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
# Register aliases for the CTR32 code path: integer arguments, counter
# scratch registers, then the NEON data registers.
1740 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
1741 my ($rounds,$cnt,$key_)=("w5","w6","x7");
1742 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
# $step shares x12 with $tctr2 -- live ranges must not overlap.
1743 my $step="x12"; # aliases with $tctr2
1745 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
1746 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
1748 # used only in 64-bit mode...
1749 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
# Convenience aliases reusing the registers above.
1751 my ($dat,$tmp)=($dat0,$tmp0);
1753 ### q8-q15 preloaded key schedule
1756 .globl ${prefix}_ctr32_encrypt_blocks
1757 .type ${prefix}_ctr32_encrypt_blocks,%function
1759 ${prefix}_ctr32_encrypt_blocks:
1761 $code.=<<___ if ($flavour =~ /64/);
1762 stp x29,x30,[sp,#-16]!
1765 $code.=<<___ if ($flavour !~ /64/);
1767 stmdb sp!,{r4-r10,lr}
1768 vstmdb sp!,{d8-d15} @ ABI specification says so
1769 ldr r4, [ip] @ load remaining arg
1772 ldr $rounds,[$key,#240]
1774 ldr $ctr, [$ivp, #12]
1775 vld1.32 {$dat0},[$ivp]
1777 vld1.32 {q8-q9},[$key] // load key schedule...
1778 sub $rounds,$rounds,#4
1781 add $key_,$key,x5,lsl#4 // pointer to last 5 round keys
1782 sub $rounds,$rounds,#2
1783 vld1.32 {q12-q13},[$key_],#32
1784 vld1.32 {q14-q15},[$key_],#32
1785 vld1.32 {$rndlast},[$key_]
1792 vorr $dat1,$dat0,$dat0
1793 add $tctr1, $ctr, #1
1794 vorr $dat2,$dat0,$dat0
1796 vorr $ivec,$dat0,$dat0
1798 vmov.32 ${dat1}[3],$tctr1
1801 sub $len,$len,#3 // bias
1802 vmov.32 ${dat2}[3],$tctr2
1804 $code.=<<___ if ($flavour =~ /64/);
1810 vorr $dat3,$dat0,$dat0
1812 vorr $dat4,$dat0,$dat0
1814 vmov.32 ${dat3}[3],w13
1815 sub $len,$len,#2 // bias
1816 vmov.32 ${dat4}[3],w14
1832 vld1.32 {q8},[$key_],#16
1844 vld1.32 {q9},[$key_],#16
1858 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
1870 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
1906 vld1.8 {$in0},[$inp],#16
1909 vld1.8 {$in1},[$inp],#16
1912 vld1.8 {$in2},[$inp],#16
1915 vld1.8 {$in3},[$inp],#16
1918 vld1.8 {$in4},[$inp],#16
1921 veor $in0,$in0,$rndlast
1923 veor $in1,$in1,$rndlast
1925 veor $in2,$in2,$rndlast
1927 veor $in3,$in3,$rndlast
1929 veor $in4,$in4,$rndlast
1931 veor $in0,$in0,$dat0
1932 vorr $dat0,$ivec,$ivec
1933 veor $in1,$in1,$dat1
1934 vorr $dat1,$ivec,$ivec
1935 veor $in2,$in2,$dat2
1936 vorr $dat2,$ivec,$ivec
1937 veor $in3,$in3,$dat3
1938 vorr $dat3,$ivec,$ivec
1939 veor $in4,$in4,$dat4
1940 vorr $dat4,$ivec,$ivec
1942 vst1.8 {$in0},[$out],#16
1943 vmov.32 ${dat0}[3],$tctr0
1944 vst1.8 {$in1},[$out],#16
1945 vmov.32 ${dat1}[3],$tctr1
1946 vst1.8 {$in2},[$out],#16
1947 vmov.32 ${dat2}[3],$tctr2
1948 vst1.8 {$in3},[$out],#16
1949 vmov.32 ${dat3}[3],w13
1950 vst1.8 {$in4},[$out],#16
1951 vmov.32 ${dat4}[3],w14
1954 cbz $len,.Lctr32_done
1968 sub $len,$len,#3 // bias
1982 vld1.32 {q8},[$key_],#16
1990 vld1.32 {q9},[$key_],#16
1997 vld1.8 {$in0},[$inp],#16
1998 vorr $dat0,$ivec,$ivec
2001 vld1.8 {$in1},[$inp],#16
2002 vorr $dat1,$ivec,$ivec
2007 vld1.8 {$in2},[$inp],#16
2011 vorr $dat2,$ivec,$ivec
2017 veor $in0,$in0,$rndlast
2021 veor $in1,$in1,$rndlast
2027 veor $in2,$in2,$rndlast
2031 vmov.32 ${dat0}[3], $tctr0
2037 vmov.32 ${dat1}[3], $tctr1
2041 vmov.32 ${dat2}[3], $tctr2
2047 veor $in0,$in0,$tmp0
2048 vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
2049 vst1.8 {$in0},[$out],#16
2050 veor $in1,$in1,$tmp1
2052 vst1.8 {$in1},[$out],#16
2053 veor $in2,$in2,$tmp2
2054 vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
2055 vst1.8 {$in2},[$out],#16
2069 vld1.32 {q8},[$key_],#16
2075 vld1.32 {q9},[$key_],#16
2086 vld1.8 {$in0},[$inp],$step
2091 vld1.8 {$in1},[$inp]
2096 veor $in0,$in0,$rndlast
2101 veor $in1,$in1,$rndlast
2106 veor $in0,$in0,$dat0
2107 veor $in1,$in1,$dat1
2108 vst1.8 {$in0},[$out],#16
2110 vst1.8 {$in1},[$out]
2114 $code.=<<___ if ($flavour !~ /64/);
2116 ldmia sp!,{r4-r10,pc}
2118 $code.=<<___ if ($flavour =~ /64/);
2123 .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
2129 ########################################
2130 if ($flavour =~ /64/) { ######## 64-bit code
2132 "aesd" => 0x4e285800, "aese" => 0x4e284800,
2133 "aesimc"=> 0x4e287800, "aesmc" => 0x4e286800 );
2135 local *unaes = sub {
2136 my ($mnemonic,$arg)=@_;
2138 $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o &&
2139 sprintf ".inst\t0x%08x\t//%s %s",
2140 $opcode{$mnemonic}|$1|($2<<5),
2144 foreach(split("\n",$code)) {
2145 s/\`([^\`]*)\`/eval($1)/geo;
2147 s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo; # old->new registers
2148 s/@\s/\/\//o; # old->new style commentary
2150 #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2151 s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o or
2152 s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel $2,$3,$2,$1/o or
2153 s/vmov\.i8/movi/o or # fix up legacy mnemonics
2155 s/vrev32\.8/rev32/o or
2156 s/vtst\.8/cmtst/o or
2158 s/^(\s+)v/$1/o or # strip off v prefix
2159 s/\bbx\s+lr\b/ret/o;
2161 # fix up remaining legacy suffixes
2163 m/\],#8/o and s/\.16b/\.8b/go;
2164 s/\.[ui]?32//o and s/\.16b/\.4s/go;
2165 s/\.[ui]?64//o and s/\.16b/\.2d/go;
2166 s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
2170 } else { ######## 32-bit code
2172 "aesd" => 0xf3b00340, "aese" => 0xf3b00300,
2173 "aesimc"=> 0xf3b003c0, "aesmc" => 0xf3b00380 );
2175 local *unaes = sub {
2176 my ($mnemonic,$arg)=@_;
2178 if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
2179 my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
2180 |(($2&7)<<1) |(($2&8)<<2);
2181 # since ARMv7 instructions are always encoded little-endian.
2182 # correct solution is to use .inst directive, but older
2183 # assemblers don't implement it:-(
2184 sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
2185 $word&0xff,($word>>8)&0xff,
2186 ($word>>16)&0xff,($word>>24)&0xff,
2194 $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
2195 sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
2196 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1;
2202 $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
2203 sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
2209 $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
2210 sprintf "vmov.32 d%d[%d],%s",2*$1+($2>>1),$2&1,$3;
2213 foreach(split("\n",$code)) {
2214 s/\`([^\`]*)\`/eval($1)/geo;
2216 s/\b[wx]([0-9]+)\b/r$1/go; # new->old registers
2217 s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go; # new->old registers
2218 s/\/\/\s?/@ /o; # new->old style commentary
2220 # fix up remaining new-style suffixes
2221 s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo or
2224 s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
2225 s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
2226 s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
2227 s/vdup\.32\s+(.*)/unvdup32($1)/geo or
2228 s/vmov\.32\s+(.*)/unvmov32($1)/geo or
2229 s/^(\s+)b\./$1b/o or
2230 s/^(\s+)ret/$1bx\tlr/o;
2232 if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {