crypto/aes/asm/aesv8-armx.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # This module implements support for ARMv8 AES instructions. The
  18 # module is endian-agnostic in the sense that it supports both big- and
  19 # little-endian cases. It also supports both 32- and 64-bit modes
  20 # of operation. The latter is achieved by limiting the number of utilized
  21 # registers to 16, which implies additional NEON load and integer
  22 # instructions. This has no effect on mighty Apple A7, where results
  23 # are literally equal to the theoretical estimates based on AES
  24 # instruction latencies and issue rates. On Cortex-A53, an in-order
  25 # execution core, this costs up to 10-15%, which is partially
  26 # compensated by implementing dedicated code path for 128-bit
  27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
  28 # seems to be limited by sheer amount of NEON instructions...
  29 #
  30 # April 2019
  31 #
  32 # Key to performance of parallelizable modes is round instruction
  33 # interleaving. But which factor to use? There is an optimal one for
  34 # each combination of instruction latency and issue rate, beyond
  35 # which increasing the interleave factor doesn't pay off. On the
  36 # downside, a higher factor increases code size and wastes resources on
  37 # platforms for which it is too high. In other words you want it to
  38 # be just right. So far interleave factor of 3x was serving well all
  39 # platforms. But for ThunderX2 optimal interleave factor was measured
  40 # to be 5x...
  41 #
  42 # Performance in cycles per byte processed with 128-bit key:
  43 #
  44 #               CBC enc         CBC dec         CTR
  45 # Apple A7      2.39            1.20            1.20
  46 # Cortex-A53    1.32            1.17/1.29(**)   1.36/1.46
  47 # Cortex-A57(*) 1.95            0.82/0.85       0.89/0.93
  48 # Cortex-A72    1.33            0.85/0.88       0.92/0.96
  49 # Denver        1.96            0.65/0.86       0.76/0.80
  50 # Mongoose      1.33            1.23/1.20       1.30/1.20
  51 # Kryo          1.26            0.87/0.94       1.00/1.00
  52 # ThunderX2     5.95            1.25            1.30
  53 #
  54 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
  55 #       and are still same even for updated module;
  56 # (**)  numbers after slash are for 32-bit code, which is 3x-
  57 #       interleaved;
  58
  59 # $output is the last argument if it looks like a file (it has an extension)
  60 # $flavour is the first argument if it doesn't look like a file
  61 $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
  62 $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
  63
     # Locate the arm-xlate.pl translator: first in the directory of this
     # script, then in ../../perlasm relative to it.
  64 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  65 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
  66 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
  67 die "can't locate arm-xlate.pl";
  68
     # Everything written to STDOUT from here on is piped through the
     # translator.  The translator path is quoted in the same way as the
     # interpreter and output paths, so that a source tree located under a
     # directory whose name contains spaces does not break the command line.
  69 open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
  70     or die "can't call $xlate: $!";
  71 *STDOUT=*OUT;
  72
     # Common prefix of every symbol generated below.
  73 $prefix="aes_v8";
  74
     # Data directive used by the INST() macro in the 32-bit header:
     # DCB for flavours matching /win/, .byte otherwise.
  75 $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
  76
  77 $code=<<___;           # start of the generated text: header shared by all flavours
  78 #include "arm_arch.h"
  79
  80 #if __ARM_MAX_ARCH__>=7
  81 ___
     # 64-bit flavours: select armv8-a with the crypto extension and open .text.
  82 $code.=".arch   armv8-a+crypto\n.text\n"                if ($flavour =~ /64/);
     # Other flavours: declare armv7-a with NEON and define INST(a,b,c,d),
     # which emits its four arguments through $_byte.  The Thumb-2 variant
     # emits them in the order c,d,a,b and ORs 0xc into d.
  83 $code.=<<___                                            if ($flavour !~ /64/);
  84 .arch   armv7-a // don't confuse not-so-latest binutils with argv8 :-)
  85 .fpu    neon
  86 #ifdef  __thumb2__
  87 .syntax unified
  88 .thumb
  89 # define INST(a,b,c,d)  $_byte  c,d|0xc,a,b
  90 #else
  91 .code   32
  92 # define INST(a,b,c,d)  $_byte  a,b,c,d
  93 #endif
  94
  95 .text
  96 ___
  97
  98 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
  99 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
 100 # maintain both 32- and 64-bit codes within single module and
 101 # transliterate common code to either flavour with regex voodoo.
 102 #
 103 {{{
     # Key schedule setup: ${prefix}_set_encrypt_key, plus the entry point of
     # ${prefix}_set_decrypt_key (its remainder is emitted right after).
     # Integer registers: $inp=x0, $bits=w1, $out=x2, $ptr=x3, $rounds=w12.
 104 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
     # NEON working registers: q0-q6 for 64-bit flavours, q0-q3 and q8-q10
     # for all other flavours.
 105 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
 106         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
 107
 108
     # .Lrcon constant table followed by the set_encrypt_key entry point.
     # .Lenc_key is a local alias of the entry that set_decrypt_key
     # branches to with "bl".
 109 $code.=<<___;
 110 .align  5
 111 .Lrcon:
 112 .long   0x01,0x01,0x01,0x01
 113 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
 114 .long   0x1b,0x1b,0x1b,0x1b
 115
 116 .globl  ${prefix}_set_encrypt_key
 117 .type   ${prefix}_set_encrypt_key,%function
 118 .align  5
 119 ${prefix}_set_encrypt_key:
 120 .Lenc_key:
 121 ___
     # 64-bit flavours push x29/x30 and set up the frame pointer.
 122 $code.=<<___    if ($flavour =~ /64/);
 123         stp     x29,x30,[sp,#-16]!
 124         add     x29,sp,#0
 125 ___
     # Argument checks, then one expansion path per key size.
     # The value returned in x0 is taken from $ptr:
     #   -1 when $inp or $out is zero,
     #   -2 when $bits is below 128, above 256 or not a multiple of 64,
     #    0 on success (set at .Ldone).
     # .Loop128 runs eight times ($bits is reused as the counter) and is
     # followed by two unrolled iterations; it finishes with $rounds=10.
     # .L192 finishes with $rounds=12 and .L256 with $rounds=14.
     # .Ldone stores $rounds at [$out].
     # The tail of this heredoc also emits the set_decrypt_key entry label.
 126 $code.=<<___;
 127         mov     $ptr,#-1
 128         cmp     $inp,#0
 129         b.eq    .Lenc_key_abort
 130         cmp     $out,#0
 131         b.eq    .Lenc_key_abort
 132         mov     $ptr,#-2
 133         cmp     $bits,#128
 134         b.lt    .Lenc_key_abort
 135         cmp     $bits,#256
 136         b.gt    .Lenc_key_abort
 137         tst     $bits,#0x3f
 138         b.ne    .Lenc_key_abort
 139
 140         adr     $ptr,.Lrcon
 141         cmp     $bits,#192
 142
 143         veor    $zero,$zero,$zero
 144         vld1.8  {$in0},[$inp],#16
 145         mov     $bits,#8                // reuse $bits
 146         vld1.32 {$rcon,$mask},[$ptr],#32
 147
 148         b.lt    .Loop128
 149         b.eq    .L192
 150         b       .L256
 151
 152 .align  4
 153 .Loop128:
 154         vtbl.8  $key,{$in0},$mask
 155         vext.8  $tmp,$zero,$in0,#12
 156         vst1.32 {$in0},[$out],#16
 157         aese    $key,$zero
 158         subs    $bits,$bits,#1
 159
 160         veor    $in0,$in0,$tmp
 161         vext.8  $tmp,$zero,$tmp,#12
 162         veor    $in0,$in0,$tmp
 163         vext.8  $tmp,$zero,$tmp,#12
 164          veor   $key,$key,$rcon
 165         veor    $in0,$in0,$tmp
 166         vshl.u8 $rcon,$rcon,#1
 167         veor    $in0,$in0,$key
 168         b.ne    .Loop128
 169
 170         vld1.32 {$rcon},[$ptr]
 171
 172         vtbl.8  $key,{$in0},$mask
 173         vext.8  $tmp,$zero,$in0,#12
 174         vst1.32 {$in0},[$out],#16
 175         aese    $key,$zero
 176
 177         veor    $in0,$in0,$tmp
 178         vext.8  $tmp,$zero,$tmp,#12
 179         veor    $in0,$in0,$tmp
 180         vext.8  $tmp,$zero,$tmp,#12
 181          veor   $key,$key,$rcon
 182         veor    $in0,$in0,$tmp
 183         vshl.u8 $rcon,$rcon,#1
 184         veor    $in0,$in0,$key
 185
 186         vtbl.8  $key,{$in0},$mask
 187         vext.8  $tmp,$zero,$in0,#12
 188         vst1.32 {$in0},[$out],#16
 189         aese    $key,$zero
 190
 191         veor    $in0,$in0,$tmp
 192         vext.8  $tmp,$zero,$tmp,#12
 193         veor    $in0,$in0,$tmp
 194         vext.8  $tmp,$zero,$tmp,#12
 195          veor   $key,$key,$rcon
 196         veor    $in0,$in0,$tmp
 197         veor    $in0,$in0,$key
 198         vst1.32 {$in0},[$out]
 199         add     $out,$out,#0x50
 200
 201         mov     $rounds,#10
 202         b       .Ldone
 203
 204 .align  4
 205 .L192:
 206         vld1.8  {$in1},[$inp],#8
 207         vmov.i8 $key,#8                 // borrow $key
 208         vst1.32 {$in0},[$out],#16
 209         vsub.i8 $mask,$mask,$key        // adjust the mask
 210
 211 .Loop192:
 212         vtbl.8  $key,{$in1},$mask
 213         vext.8  $tmp,$zero,$in0,#12
 214         vst1.32 {$in1},[$out],#8
 215         aese    $key,$zero
 216         subs    $bits,$bits,#1
 217
 218         veor    $in0,$in0,$tmp
 219         vext.8  $tmp,$zero,$tmp,#12
 220         veor    $in0,$in0,$tmp
 221         vext.8  $tmp,$zero,$tmp,#12
 222         veor    $in0,$in0,$tmp
 223
 224         vdup.32 $tmp,${in0}[3]
 225         veor    $tmp,$tmp,$in1
 226          veor   $key,$key,$rcon
 227         vext.8  $in1,$zero,$in1,#12
 228         vshl.u8 $rcon,$rcon,#1
 229         veor    $in1,$in1,$tmp
 230         veor    $in0,$in0,$key
 231         veor    $in1,$in1,$key
 232         vst1.32 {$in0},[$out],#16
 233         b.ne    .Loop192
 234
 235         mov     $rounds,#12
 236         add     $out,$out,#0x20
 237         b       .Ldone
 238
 239 .align  4
 240 .L256:
 241         vld1.8  {$in1},[$inp]
 242         mov     $bits,#7
 243         mov     $rounds,#14
 244         vst1.32 {$in0},[$out],#16
 245
 246 .Loop256:
 247         vtbl.8  $key,{$in1},$mask
 248         vext.8  $tmp,$zero,$in0,#12
 249         vst1.32 {$in1},[$out],#16
 250         aese    $key,$zero
 251         subs    $bits,$bits,#1
 252
 253         veor    $in0,$in0,$tmp
 254         vext.8  $tmp,$zero,$tmp,#12
 255         veor    $in0,$in0,$tmp
 256         vext.8  $tmp,$zero,$tmp,#12
 257          veor   $key,$key,$rcon
 258         veor    $in0,$in0,$tmp
 259         vshl.u8 $rcon,$rcon,#1
 260         veor    $in0,$in0,$key
 261         vst1.32 {$in0},[$out],#16
 262         b.eq    .Ldone
 263
 264         vdup.32 $key,${in0}[3]          // just splat
 265         vext.8  $tmp,$zero,$in1,#12
 266         aese    $key,$zero
 267
 268         veor    $in1,$in1,$tmp
 269         vext.8  $tmp,$zero,$tmp,#12
 270         veor    $in1,$in1,$tmp
 271         vext.8  $tmp,$zero,$tmp,#12
 272         veor    $in1,$in1,$tmp
 273
 274         veor    $in1,$in1,$key
 275         b       .Loop256
 276
 277 .Ldone:
 278         str     $rounds,[$out]
 279         mov     $ptr,#0
 280
 281 .Lenc_key_abort:
 282         mov     x0,$ptr                 // return value
 283         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
 284         ret
 285 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
 286
 287 .globl  ${prefix}_set_decrypt_key
 288 .type   ${prefix}_set_decrypt_key,%function
 289 .align  5
 290 ${prefix}_set_decrypt_key:
 291 ___
 292 $code.=<<___    if ($flavour =~ /64/);          # 64-bit prologue of set_decrypt_key: paciasp, then frame record
 293         .inst   0xd503233f              // paciasp
 294         stp     x29,x30,[sp,#-16]!
 295         add     x29,sp,#0
 296 ___
     # Other flavours save r4 and lr instead.
 297 $code.=<<___    if ($flavour !~ /64/);
 298         stmdb   sp!,{r4,lr}
 299 ___
     # Build the encryption schedule by calling .Lenc_key and bail out to
     # .Ldec_key_abort if it returned non-zero.  Otherwise reverse the order
     # of the round keys in place, walking $out upwards and $inp downwards
     # (x4 = -16): the first and last keys are swapped as they are, every
     # other key goes through aesimc, including the middle one that is
     # handled after the loop.  x0 is then cleared as the return value.
 300 $code.=<<___;
 301         bl      .Lenc_key
 302
 303         cmp     x0,#0
 304         b.ne    .Ldec_key_abort
 305
 306         sub     $out,$out,#240          // restore original $out
 307         mov     x4,#-16
 308         add     $inp,$out,x12,lsl#4     // end of key schedule
 309
 310         vld1.32 {v0.16b},[$out]
 311         vld1.32 {v1.16b},[$inp]
 312         vst1.32 {v0.16b},[$inp],x4
 313         vst1.32 {v1.16b},[$out],#16
 314
 315 .Loop_imc:
 316         vld1.32 {v0.16b},[$out]
 317         vld1.32 {v1.16b},[$inp]
 318         aesimc  v0.16b,v0.16b
 319         aesimc  v1.16b,v1.16b
 320         vst1.32 {v0.16b},[$inp],x4
 321         vst1.32 {v1.16b},[$out],#16
 322         cmp     $inp,$out
 323         b.hi    .Loop_imc
 324
 325         vld1.32 {v0.16b},[$out]
 326         aesimc  v0.16b,v0.16b
 327         vst1.32 {v0.16b},[$inp]
 328
 329         eor     x0,x0,x0                // return value
 330 .Ldec_key_abort:
 331 ___
     # 32-bit epilogue: restore r4 and return by popping the saved lr into pc.
 332 $code.=<<___    if ($flavour !~ /64/);
 333         ldmia   sp!,{r4,pc}
 334 ___
     # 64-bit epilogue: restore x29/x30, autiasp, ret.
 335 $code.=<<___    if ($flavour =~ /64/);
 336         ldp     x29,x30,[sp],#16
 337         .inst   0xd50323bf              // autiasp
 338         ret
 339 ___
     # Symbol size directive, common to all flavours.
 340 $code.=<<___;
 341 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
 342 ___
 343 }}}
 344 {{{
     # gen_block(DIR) appends the single-block routine ${prefix}_${dir}crypt
     # to $code.  DIR is "en" or "de": "en" selects the aese/aesmc
     # instruction pair, anything else selects aesd/aesimc.
     #
     # The generated routine reads the round count from offset 240 of the
     # key schedule, processes two rounds per loop iteration while
     # alternating between two round-key registers, and finishes with a
     # final round without aesmc/aesimc followed by an XOR with the last
     # round key.
     #
     # The sub is declared without a prototype because it takes one argument
     # (an empty prototype would reject any call not written with "&").
 345 sub gen_block {
 346 my ($dir) = @_;
 347 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
 348 my ($inp,$out,$key)=map("x$_",(0..2));
 349 my $rounds="w3";
     # Exactly three NEON registers are needed: two round keys and the block.
 350 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..2));
 351
 352 $code.=<<___;
 353 .globl  ${prefix}_${dir}crypt
 354 .type   ${prefix}_${dir}crypt,%function
 355 .align  5
 356 ${prefix}_${dir}crypt:
 357         ldr     $rounds,[$key,#240]
 358         vld1.32 {$rndkey0},[$key],#16
 359         vld1.8  {$inout},[$inp]
 360         sub     $rounds,$rounds,#2
 361         vld1.32 {$rndkey1},[$key],#16
 362
 363 .Loop_${dir}c:
 364         aes$e   $inout,$rndkey0
 365         aes$mc  $inout,$inout
 366         vld1.32 {$rndkey0},[$key],#16
 367         subs    $rounds,$rounds,#2
 368         aes$e   $inout,$rndkey1
 369         aes$mc  $inout,$inout
 370         vld1.32 {$rndkey1},[$key],#16
 371         b.gt    .Loop_${dir}c
 372
 373         aes$e   $inout,$rndkey0
 374         aes$mc  $inout,$inout
 375         vld1.32 {$rndkey0},[$key]
 376         aes$e   $inout,$rndkey1
 377         veor    $inout,$inout,$rndkey0
 378
 379         vst1.8  {$inout},[$out]
 380         ret
 381 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
 382 ___
 383 }
     # Emit ${prefix}_encrypt and ${prefix}_decrypt.
 384 gen_block("en");
 385 gen_block("de");
 386 }}}
 387 {{{
     # ${prefix}_cbc_encrypt: declarations, prologue and the encrypt paths.
     # Arguments: $inp=x0, $out=x1, $len=x2, $key=x3, $ivp=x4, $enc=w5.
 388 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
     # $rounds shares w5 with $enc.  $step1 is not referenced anywhere in
     # this section; x12 is used as $key5 below.
 389 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
 390 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 391
     # Aliases used by the encrypt paths; $tmp is not referenced in this section.
 392 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
     # Pointers to individual round keys for the 192/256-bit encrypt path;
     # $key7 reuses the $key register itself.
 393 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
 394
 395 ### q8-q15      preloaded key schedule
 396
     # Entry point.
 397 $code.=<<___;
 398 .globl  ${prefix}_cbc_encrypt
 399 .type   ${prefix}_cbc_encrypt,%function
 400 .align  5
 401 ${prefix}_cbc_encrypt:
 402 ___
     # 64-bit prologue: frame record only.
 403 $code.=<<___    if ($flavour =~ /64/);
 404         stp     x29,x30,[sp,#-16]!
 405         add     x29,sp,#0
 406 ___
     # 32-bit prologue: save r4-r8, lr and d8-d15, then load the two
     # remaining arguments from the caller's stack into r4-r5.
 407 $code.=<<___    if ($flavour !~ /64/);
 408         mov     ip,sp
 409         stmdb   sp!,{r4-r8,lr}
 410         vstmdb  sp!,{d8-d15}            @ ABI specification says so
 411         ldmia   ip,{r4-r5}              @ load remaining args
 412 ___
     # Common setup and the two encrypt loops.
     #  * Branches to .Lcbc_abort when $len is below 16; otherwise $len has
     #    16 subtracted and is rounded down to a multiple of 16.
     #  * Loads the IV and the first input block, preloads round keys into
     #    q8-q15 and $rndlast, and branches to .Lcbc_dec when $enc is zero.
     #  * .Lcbc_enc128 is the dedicated path taken when the adjusted round
     #    count equals 2; the other path handles the longer schedules and
     #    skips two rounds via .Lcbc_enc192 when that count equals 4.
     #  * Both loops store each output block as $ivec and leave through
     #    .Lcbc_done.
     # NOTE(review): "cclr" is not a native mnemonic; presumably it is
     # rewritten by the transliteration code later in this file - confirm.
 413 $code.=<<___;
 414         subs    $len,$len,#16
 415         mov     $step,#16
 416         b.lo    .Lcbc_abort
 417         cclr    $step,eq
 418
 419         cmp     $enc,#0                 // en- or decrypting?
 420         ldr     $rounds,[$key,#240]
 421         and     $len,$len,#-16
 422         vld1.8  {$ivec},[$ivp]
 423         vld1.8  {$dat},[$inp],$step
 424
 425         vld1.32 {q8-q9},[$key]          // load key schedule...
 426         sub     $rounds,$rounds,#6
 427         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
 428         sub     $rounds,$rounds,#2
 429         vld1.32 {q10-q11},[$key_],#32
 430         vld1.32 {q12-q13},[$key_],#32
 431         vld1.32 {q14-q15},[$key_],#32
 432         vld1.32 {$rndlast},[$key_]
 433
 434         add     $key_,$key,#32
 435         mov     $cnt,$rounds
 436         b.eq    .Lcbc_dec
 437
 438         cmp     $rounds,#2
 439         veor    $dat,$dat,$ivec
 440         veor    $rndzero_n_last,q8,$rndlast
 441         b.eq    .Lcbc_enc128
 442
 443         vld1.32 {$in0-$in1},[$key_]
 444         add     $key_,$key,#16
 445         add     $key4,$key,#16*4
 446         add     $key5,$key,#16*5
 447         aese    $dat,q8
 448         aesmc   $dat,$dat
 449         add     $key6,$key,#16*6
 450         add     $key7,$key,#16*7
 451         b       .Lenter_cbc_enc
 452
 453 .align  4
 454 .Loop_cbc_enc:
 455         aese    $dat,q8
 456         aesmc   $dat,$dat
 457          vst1.8 {$ivec},[$out],#16
 458 .Lenter_cbc_enc:
 459         aese    $dat,q9
 460         aesmc   $dat,$dat
 461         aese    $dat,$in0
 462         aesmc   $dat,$dat
 463         vld1.32 {q8},[$key4]
 464         cmp     $rounds,#4
 465         aese    $dat,$in1
 466         aesmc   $dat,$dat
 467         vld1.32 {q9},[$key5]
 468         b.eq    .Lcbc_enc192
 469
 470         aese    $dat,q8
 471         aesmc   $dat,$dat
 472         vld1.32 {q8},[$key6]
 473         aese    $dat,q9
 474         aesmc   $dat,$dat
 475         vld1.32 {q9},[$key7]
 476         nop
 477
 478 .Lcbc_enc192:
 479         aese    $dat,q8
 480         aesmc   $dat,$dat
 481          subs   $len,$len,#16
 482         aese    $dat,q9
 483         aesmc   $dat,$dat
 484          cclr   $step,eq
 485         aese    $dat,q10
 486         aesmc   $dat,$dat
 487         aese    $dat,q11
 488         aesmc   $dat,$dat
 489          vld1.8 {q8},[$inp],$step
 490         aese    $dat,q12
 491         aesmc   $dat,$dat
 492          veor   q8,q8,$rndzero_n_last
 493         aese    $dat,q13
 494         aesmc   $dat,$dat
 495          vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
 496         aese    $dat,q14
 497         aesmc   $dat,$dat
 498         aese    $dat,q15
 499         veor    $ivec,$dat,$rndlast
 500         b.hs    .Loop_cbc_enc
 501
 502         vst1.8  {$ivec},[$out],#16
 503         b       .Lcbc_done
 504
 505 .align  5
 506 .Lcbc_enc128:
 507         vld1.32 {$in0-$in1},[$key_]
 508         aese    $dat,q8
 509         aesmc   $dat,$dat
 510         b       .Lenter_cbc_enc128
 511 .Loop_cbc_enc128:
 512         aese    $dat,q8
 513         aesmc   $dat,$dat
 514          vst1.8 {$ivec},[$out],#16
 515 .Lenter_cbc_enc128:
 516         aese    $dat,q9
 517         aesmc   $dat,$dat
 518          subs   $len,$len,#16
 519         aese    $dat,$in0
 520         aesmc   $dat,$dat
 521          cclr   $step,eq
 522         aese    $dat,$in1
 523         aesmc   $dat,$dat
 524         aese    $dat,q10
 525         aesmc   $dat,$dat
 526         aese    $dat,q11
 527         aesmc   $dat,$dat
 528          vld1.8 {q8},[$inp],$step
 529         aese    $dat,q12
 530         aesmc   $dat,$dat
 531         aese    $dat,q13
 532         aesmc   $dat,$dat
 533         aese    $dat,q14
 534         aesmc   $dat,$dat
 535          veor   q8,q8,$rndzero_n_last
 536         aese    $dat,q15
 537         veor    $ivec,$dat,$rndlast
 538         b.hs    .Loop_cbc_enc128
 539
 540         vst1.8  {$ivec},[$out],#16
 541         b       .Lcbc_done
 542 ___
 543 {
     # Additional registers for the CBC decrypt paths.
     # Default (32-bit) mapping: $dat2=q10, $in2=q11, $tmp2=q9.
 544 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 545
 546 my ($dat3,$in3,$tmp3);  # used only in 64-bit mode
 547 my ($dat4,$in4,$tmp4);
     # 64-bit mapping: $dat2-$dat4=q16-q18, $in2-$in4=q19-q21, $tmp3=q22,
     # $tmp4=q23; $tmp2 keeps q9.
 548 if ($flavour =~ /64/) {
 549     ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
 550 }
 551
     # .Lcbc_dec entry: loads a second block and subtracts a bias of 32 from
     # $len; if that borrows, control goes to .Lcbc_dec_tail.  Otherwise a
     # third block is loaded, and the three ciphertext blocks are copied to
     # $in0-$in2 (they are XORed into later plaintext as chaining values).
 552 $code.=<<___;
 553 .align  5
 554 .Lcbc_dec:
 555         vld1.8  {$dat2},[$inp],#16
 556         subs    $len,$len,#32           // bias
 557         add     $cnt,$rounds,#2
 558         vorr    $in1,$dat,$dat
 559         vorr    $dat1,$dat,$dat
 560         vorr    $in2,$dat2,$dat2
 561         b.lo    .Lcbc_dec_tail
 562
 563         vorr    $dat1,$dat2,$dat2
 564         vld1.8  {$dat2},[$inp],#16
 565         vorr    $in0,$dat,$dat
 566         vorr    $in1,$dat1,$dat1
 567         vorr    $in2,$dat2,$dat2
 568 ___
 569 $code.=<<___    if ($flavour =~ /64/);          # 64-bit only: 5x-interleaved CBC decrypt loop and .Lcbc_tail4x; falls back to .Loop3x_cbc_dec when $len is below 32 here
 570         cmp     $len,#32
 571         b.lo    .Loop3x_cbc_dec
 572
 573         vld1.8  {$dat3},[$inp],#16
 574         vld1.8  {$dat4},[$inp],#16
 575         sub     $len,$len,#32           // bias
 576         mov     $cnt,$rounds
 577         vorr    $in3,$dat3,$dat3
 578         vorr    $in4,$dat4,$dat4
 579
 580 .Loop5x_cbc_dec:
 581         aesd    $dat0,q8
 582         aesimc  $dat0,$dat0
 583         aesd    $dat1,q8
 584         aesimc  $dat1,$dat1
 585         aesd    $dat2,q8
 586         aesimc  $dat2,$dat2
 587         aesd    $dat3,q8
 588         aesimc  $dat3,$dat3
 589         aesd    $dat4,q8
 590         aesimc  $dat4,$dat4
 591         vld1.32 {q8},[$key_],#16
 592         subs    $cnt,$cnt,#2
 593         aesd    $dat0,q9
 594         aesimc  $dat0,$dat0
 595         aesd    $dat1,q9
 596         aesimc  $dat1,$dat1
 597         aesd    $dat2,q9
 598         aesimc  $dat2,$dat2
 599         aesd    $dat3,q9
 600         aesimc  $dat3,$dat3
 601         aesd    $dat4,q9
 602         aesimc  $dat4,$dat4
 603         vld1.32 {q9},[$key_],#16
 604         b.gt    .Loop5x_cbc_dec
 605
 606         aesd    $dat0,q8
 607         aesimc  $dat0,$dat0
 608         aesd    $dat1,q8
 609         aesimc  $dat1,$dat1
 610         aesd    $dat2,q8
 611         aesimc  $dat2,$dat2
 612         aesd    $dat3,q8
 613         aesimc  $dat3,$dat3
 614         aesd    $dat4,q8
 615         aesimc  $dat4,$dat4
 616          cmp    $len,#0x40              // because .Lcbc_tail4x
 617          sub    $len,$len,#0x50
 618
 619         aesd    $dat0,q9
 620         aesimc  $dat0,$dat0
 621         aesd    $dat1,q9
 622         aesimc  $dat1,$dat1
 623         aesd    $dat2,q9
 624         aesimc  $dat2,$dat2
 625         aesd    $dat3,q9
 626         aesimc  $dat3,$dat3
 627         aesd    $dat4,q9
 628         aesimc  $dat4,$dat4
 629          csel   x6,xzr,$len,gt          // borrow x6, $cnt, "gt" is not typo
 630          mov    $key_,$key
 631
 632         aesd    $dat0,q10
 633         aesimc  $dat0,$dat0
 634         aesd    $dat1,q10
 635         aesimc  $dat1,$dat1
 636         aesd    $dat2,q10
 637         aesimc  $dat2,$dat2
 638         aesd    $dat3,q10
 639         aesimc  $dat3,$dat3
 640         aesd    $dat4,q10
 641         aesimc  $dat4,$dat4
 642          add    $inp,$inp,x6            // $inp is adjusted in such way that
 643                                         // at exit from the loop $dat1-$dat4
 644                                         // are loaded with last "words"
 645          add    x6,$len,#0x60           // because .Lcbc_tail4x
 646
 647         aesd    $dat0,q11
 648         aesimc  $dat0,$dat0
 649         aesd    $dat1,q11
 650         aesimc  $dat1,$dat1
 651         aesd    $dat2,q11
 652         aesimc  $dat2,$dat2
 653         aesd    $dat3,q11
 654         aesimc  $dat3,$dat3
 655         aesd    $dat4,q11
 656         aesimc  $dat4,$dat4
 657
 658         aesd    $dat0,q12
 659         aesimc  $dat0,$dat0
 660         aesd    $dat1,q12
 661         aesimc  $dat1,$dat1
 662         aesd    $dat2,q12
 663         aesimc  $dat2,$dat2
 664         aesd    $dat3,q12
 665         aesimc  $dat3,$dat3
 666         aesd    $dat4,q12
 667         aesimc  $dat4,$dat4
 668
 669         aesd    $dat0,q13
 670         aesimc  $dat0,$dat0
 671         aesd    $dat1,q13
 672         aesimc  $dat1,$dat1
 673         aesd    $dat2,q13
 674         aesimc  $dat2,$dat2
 675         aesd    $dat3,q13
 676         aesimc  $dat3,$dat3
 677         aesd    $dat4,q13
 678         aesimc  $dat4,$dat4
 679
 680         aesd    $dat0,q14
 681         aesimc  $dat0,$dat0
 682         aesd    $dat1,q14
 683         aesimc  $dat1,$dat1
 684         aesd    $dat2,q14
 685         aesimc  $dat2,$dat2
 686         aesd    $dat3,q14
 687         aesimc  $dat3,$dat3
 688         aesd    $dat4,q14
 689         aesimc  $dat4,$dat4
 690
 691          veor   $tmp0,$ivec,$rndlast
 692         aesd    $dat0,q15
 693          veor   $tmp1,$in0,$rndlast
 694          vld1.8 {$in0},[$inp],#16
 695         aesd    $dat1,q15
 696          veor   $tmp2,$in1,$rndlast
 697          vld1.8 {$in1},[$inp],#16
 698         aesd    $dat2,q15
 699          veor   $tmp3,$in2,$rndlast
 700          vld1.8 {$in2},[$inp],#16
 701         aesd    $dat3,q15
 702          veor   $tmp4,$in3,$rndlast
 703          vld1.8 {$in3},[$inp],#16
 704         aesd    $dat4,q15
 705          vorr   $ivec,$in4,$in4
 706          vld1.8 {$in4},[$inp],#16
 707         cbz     x6,.Lcbc_tail4x
 708          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
 709         veor    $tmp0,$tmp0,$dat0
 710          vorr   $dat0,$in0,$in0
 711         veor    $tmp1,$tmp1,$dat1
 712          vorr   $dat1,$in1,$in1
 713         veor    $tmp2,$tmp2,$dat2
 714          vorr   $dat2,$in2,$in2
 715         veor    $tmp3,$tmp3,$dat3
 716          vorr   $dat3,$in3,$in3
 717         veor    $tmp4,$tmp4,$dat4
 718         vst1.8  {$tmp0},[$out],#16
 719          vorr   $dat4,$in4,$in4
 720         vst1.8  {$tmp1},[$out],#16
 721          mov    $cnt,$rounds
 722         vst1.8  {$tmp2},[$out],#16
 723          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 724         vst1.8  {$tmp3},[$out],#16
 725         vst1.8  {$tmp4},[$out],#16
 726         b.hs    .Loop5x_cbc_dec
 727
 728         add     $len,$len,#0x50
 729         cbz     $len,.Lcbc_done
 730
 731         add     $cnt,$rounds,#2
 732         subs    $len,$len,#0x30
 733         vorr    $dat0,$in2,$in2
 734         vorr    $in0,$in2,$in2
 735         vorr    $dat1,$in3,$in3
 736         vorr    $in1,$in3,$in3
 737         vorr    $dat2,$in4,$in4
 738         vorr    $in2,$in4,$in4
 739         b.lo    .Lcbc_dec_tail
 740
 741         b       .Loop3x_cbc_dec
 742
 743 .align  4
 744 .Lcbc_tail4x:
 745         veor    $tmp1,$tmp0,$dat1
 746         veor    $tmp2,$tmp2,$dat2
 747         veor    $tmp3,$tmp3,$dat3
 748         veor    $tmp4,$tmp4,$dat4
 749         vst1.8  {$tmp1},[$out],#16
 750         vst1.8  {$tmp2},[$out],#16
 751         vst1.8  {$tmp3},[$out],#16
 752         vst1.8  {$tmp4},[$out],#16
 753
 754         b       .Lcbc_done
 755 .align  4
 756 ___
 757 $code.=<<___;          # all flavours: 3x-interleaved CBC decrypt loop, .Lcbc_dec_tail for the remaining one or two blocks, and IV write-back at .Lcbc_done
 758 .Loop3x_cbc_dec:
 759         aesd    $dat0,q8
 760         aesimc  $dat0,$dat0
 761         aesd    $dat1,q8
 762         aesimc  $dat1,$dat1
 763         aesd    $dat2,q8
 764         aesimc  $dat2,$dat2
 765         vld1.32 {q8},[$key_],#16
 766         subs    $cnt,$cnt,#2
 767         aesd    $dat0,q9
 768         aesimc  $dat0,$dat0
 769         aesd    $dat1,q9
 770         aesimc  $dat1,$dat1
 771         aesd    $dat2,q9
 772         aesimc  $dat2,$dat2
 773         vld1.32 {q9},[$key_],#16
 774         b.gt    .Loop3x_cbc_dec
 775
 776         aesd    $dat0,q8
 777         aesimc  $dat0,$dat0
 778         aesd    $dat1,q8
 779         aesimc  $dat1,$dat1
 780         aesd    $dat2,q8
 781         aesimc  $dat2,$dat2
 782          veor   $tmp0,$ivec,$rndlast
 783          subs   $len,$len,#0x30
 784          veor   $tmp1,$in0,$rndlast
 785          mov.lo x6,$len                 // x6, $cnt, is zero at this point
 786         aesd    $dat0,q9
 787         aesimc  $dat0,$dat0
 788         aesd    $dat1,q9
 789         aesimc  $dat1,$dat1
 790         aesd    $dat2,q9
 791         aesimc  $dat2,$dat2
 792          veor   $tmp2,$in1,$rndlast
 793          add    $inp,$inp,x6            // $inp is adjusted in such way that
 794                                         // at exit from the loop $dat1-$dat2
 795                                         // are loaded with last "words"
 796          vorr   $ivec,$in2,$in2
 797          mov    $key_,$key
 798         aesd    $dat0,q12
 799         aesimc  $dat0,$dat0
 800         aesd    $dat1,q12
 801         aesimc  $dat1,$dat1
 802         aesd    $dat2,q12
 803         aesimc  $dat2,$dat2
 804          vld1.8 {$in0},[$inp],#16
 805         aesd    $dat0,q13
 806         aesimc  $dat0,$dat0
 807         aesd    $dat1,q13
 808         aesimc  $dat1,$dat1
 809         aesd    $dat2,q13
 810         aesimc  $dat2,$dat2
 811          vld1.8 {$in1},[$inp],#16
 812         aesd    $dat0,q14
 813         aesimc  $dat0,$dat0
 814         aesd    $dat1,q14
 815         aesimc  $dat1,$dat1
 816         aesd    $dat2,q14
 817         aesimc  $dat2,$dat2
 818          vld1.8 {$in2},[$inp],#16
 819         aesd    $dat0,q15
 820         aesd    $dat1,q15
 821         aesd    $dat2,q15
 822          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
 823          add    $cnt,$rounds,#2
 824         veor    $tmp0,$tmp0,$dat0
 825         veor    $tmp1,$tmp1,$dat1
 826         veor    $dat2,$dat2,$tmp2
 827          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
 828         vst1.8  {$tmp0},[$out],#16
 829          vorr   $dat0,$in0,$in0
 830         vst1.8  {$tmp1},[$out],#16
 831          vorr   $dat1,$in1,$in1
 832         vst1.8  {$dat2},[$out],#16
 833          vorr   $dat2,$in2,$in2
 834         b.hs    .Loop3x_cbc_dec
 835
 836         cmn     $len,#0x30
 837         b.eq    .Lcbc_done
 838         nop
 839
 840 .Lcbc_dec_tail:
 841         aesd    $dat1,q8
 842         aesimc  $dat1,$dat1
 843         aesd    $dat2,q8
 844         aesimc  $dat2,$dat2
 845         vld1.32 {q8},[$key_],#16
 846         subs    $cnt,$cnt,#2
 847         aesd    $dat1,q9
 848         aesimc  $dat1,$dat1
 849         aesd    $dat2,q9
 850         aesimc  $dat2,$dat2
 851         vld1.32 {q9},[$key_],#16
 852         b.gt    .Lcbc_dec_tail
 853
 854         aesd    $dat1,q8
 855         aesimc  $dat1,$dat1
 856         aesd    $dat2,q8
 857         aesimc  $dat2,$dat2
 858         aesd    $dat1,q9
 859         aesimc  $dat1,$dat1
 860         aesd    $dat2,q9
 861         aesimc  $dat2,$dat2
 862         aesd    $dat1,q12
 863         aesimc  $dat1,$dat1
 864         aesd    $dat2,q12
 865         aesimc  $dat2,$dat2
 866          cmn    $len,#0x20
 867         aesd    $dat1,q13
 868         aesimc  $dat1,$dat1
 869         aesd    $dat2,q13
 870         aesimc  $dat2,$dat2
 871          veor   $tmp1,$ivec,$rndlast
 872         aesd    $dat1,q14
 873         aesimc  $dat1,$dat1
 874         aesd    $dat2,q14
 875         aesimc  $dat2,$dat2
 876          veor   $tmp2,$in1,$rndlast
 877         aesd    $dat1,q15
 878         aesd    $dat2,q15
 879         b.eq    .Lcbc_dec_one
 880         veor    $tmp1,$tmp1,$dat1
 881         veor    $tmp2,$tmp2,$dat2
 882          vorr   $ivec,$in2,$in2
 883         vst1.8  {$tmp1},[$out],#16
 884         vst1.8  {$tmp2},[$out],#16
 885         b       .Lcbc_done
 886
 887 .Lcbc_dec_one:
 888         veor    $tmp1,$tmp1,$dat2
 889          vorr   $ivec,$in2,$in2
 890         vst1.8  {$tmp1},[$out],#16
 891
 892 .Lcbc_done:
 893         vst1.8  {$ivec},[$ivp]
 894 .Lcbc_abort:
 895 ___
 896 }
     # Epilogue of ${prefix}_cbc_encrypt, reached from both .Lcbc_done and
     # .Lcbc_abort.
     # 32-bit: restore d8-d15 and r4-r8, return by popping the saved lr into pc.
 897 $code.=<<___    if ($flavour !~ /64/);
 898         vldmia  sp!,{d8-d15}
 899         ldmia   sp!,{r4-r8,pc}
 900 ___
     # 64-bit: reload x29 only while releasing the 16-byte frame, then ret.
 901 $code.=<<___    if ($flavour =~ /64/);
 902         ldr     x29,[sp],#16
 903         ret
 904 ___
     # Symbol size directive, common to all flavours.
 905 $code.=<<___;
 906 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
 907 ___
 908 }}}
 909 {{{
 910 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
 911 my ($rounds,$cnt,$key_)=("w5","w6","x7");
 912 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
 913 my $step="x12";         # aliases with $tctr2
 914
 915 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
 916 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
 917
 918 # used only in 64-bit mode...
 919 my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
 920
 921 my ($dat,$tmp)=($dat0,$tmp0);
 922
 923 ### q8-q15      preloaded key schedule
 924
     # Symbol directives and entry label of the routine.
 925 $code.=<<___;
 926 .globl  ${prefix}_ctr32_encrypt_blocks
 927 .type   ${prefix}_ctr32_encrypt_blocks,%function
 928 .align  5
 929 ${prefix}_ctr32_encrypt_blocks:
 930 ___
     # AArch64 prologue: push x29/x30 and point x29 at the new frame.
 931 $code.=<<___    if ($flavour =~ /64/);
 932         stp             x29,x30,[sp,#-16]!
 933         add             x29,sp,#0
 934 ___
     # 32-bit prologue: save r4-r10/lr and d8-d15, then load the remaining
     # argument from the caller's stack (address captured in ip before the
     # pushes) into r4, which is what $ivp (x4) becomes after the 32-bit
     # register rewrite.
 935 $code.=<<___    if ($flavour !~ /64/);
 936         mov             ip,sp
 937         stmdb           sp!,{r4-r10,lr}
 938         vstmdb          sp!,{d8-d15}            @ ABI specification says so
 939         ldr             r4, [ip]                @ load remaining arg
 940 ___
     # Common setup.  Loads the round count, the counter word at offset 12
     # of the counter block and the block itself, preloads the first two
     # round keys into q8-q9 and the last five into q12-q15/$rndlast, then
     # builds the counter blocks for the first pass ($ivec keeps the
     # template).  The literal x5 in the address computation is the 64-bit
     # name of $rounds (w5).  $step is cleared when fewer than two blocks
     # are requested.  The flags from "cmp $len,#2" are still needed by cclr
     # and by "b.ls .Lctr32_tail": none of the instructions emitted in
     # between carries a flag-setting suffix.  With more than two blocks
     # $len is biased by 3 for the three-block loop.
 941 $code.=<<___;
 942         ldr             $rounds,[$key,#240]
 943
 944         ldr             $ctr, [$ivp, #12]
 945         vld1.32         {$dat0},[$ivp]
 946
 947         vld1.32         {q8-q9},[$key]          // load key schedule...
 948         sub             $rounds,$rounds,#4
 949         mov             $step,#16
 950         cmp             $len,#2
 951         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
 952         sub             $rounds,$rounds,#2
 953         vld1.32         {q12-q13},[$key_],#32
 954         vld1.32         {q14-q15},[$key_],#32
 955         vld1.32         {$rndlast},[$key_]
 956         add             $key_,$key,#32
 957         mov             $cnt,$rounds
 958         cclr            $step,lo
 959 #ifndef __ARMEB__
 960         rev             $ctr, $ctr
 961 #endif
 962         vorr            $dat1,$dat0,$dat0
 963         add             $tctr1, $ctr, #1
 964         vorr            $dat2,$dat0,$dat0
 965         add             $ctr, $ctr, #2
 966         vorr            $ivec,$dat0,$dat0
 967         rev             $tctr1, $tctr1
 968         vmov.32         ${dat1}[3],$tctr1
 969         b.ls            .Lctr32_tail
 970         rev             $tctr2, $ctr
 971         sub             $len,$len,#3            // bias
 972         vmov.32         ${dat2}[3],$tctr2
 973 ___
     # AArch64 only: five-block interleaved main loop.  $len arrives here
     # already reduced by 3; with fewer than five blocks in total control
     # branches to .Loop3x_ctr32, otherwise $len is biased by a further 2
     # and the two extra counter blocks $dat3/$dat4 are prepared.  Each pass
     # runs all rounds on five counter blocks, XORs the result (with the
     # last round key folded into the input via $rndlast) into five input
     # blocks and re-seeds $dat0-4 from $ivec with the next five counter
     # values.  When fewer than five blocks remain the biases are undone and
     # the remainder is dispatched to .Lctr32_tail (at most two blocks) or
     # to .Loop3x_ctr32 (after re-applying the bias of 3).
 974 $code.=<<___    if ($flavour =~ /64/);
 975         cmp             $len,#2
 976         b.lo            .Loop3x_ctr32
 977
 978         add             w13,$ctr,#1
 979         add             w14,$ctr,#2
 980         vorr            $dat3,$dat0,$dat0
 981         rev             w13,w13
 982         vorr            $dat4,$dat0,$dat0
 983         rev             w14,w14
 984         vmov.32         ${dat3}[3],w13
 985         sub             $len,$len,#2            // bias
 986         vmov.32         ${dat4}[3],w14
 987         add             $ctr,$ctr,#2
 988         b               .Loop5x_ctr32
 989
 990 .align  4
 991 .Loop5x_ctr32:
 992         aese            $dat0,q8
 993         aesmc           $dat0,$dat0
 994         aese            $dat1,q8
 995         aesmc           $dat1,$dat1
 996         aese            $dat2,q8
 997         aesmc           $dat2,$dat2
 998         aese            $dat3,q8
 999         aesmc           $dat3,$dat3
1000         aese            $dat4,q8
1001         aesmc           $dat4,$dat4
1002         vld1.32         {q8},[$key_],#16
1003         subs            $cnt,$cnt,#2
1004         aese            $dat0,q9
1005         aesmc           $dat0,$dat0
1006         aese            $dat1,q9
1007         aesmc           $dat1,$dat1
1008         aese            $dat2,q9
1009         aesmc           $dat2,$dat2
1010         aese            $dat3,q9
1011         aesmc           $dat3,$dat3
1012         aese            $dat4,q9
1013         aesmc           $dat4,$dat4
1014         vld1.32         {q9},[$key_],#16
1015         b.gt            .Loop5x_ctr32
1016
1017         mov             $key_,$key
1018         aese            $dat0,q8
1019         aesmc           $dat0,$dat0
1020         aese            $dat1,q8
1021         aesmc           $dat1,$dat1
1022         aese            $dat2,q8
1023         aesmc           $dat2,$dat2
1024         aese            $dat3,q8
1025         aesmc           $dat3,$dat3
1026         aese            $dat4,q8
1027         aesmc           $dat4,$dat4
1028         vld1.32         {q8},[$key_],#16        // re-pre-load rndkey[0]
1029
1030         aese            $dat0,q9
1031         aesmc           $dat0,$dat0
1032         aese            $dat1,q9
1033         aesmc           $dat1,$dat1
1034         aese            $dat2,q9
1035         aesmc           $dat2,$dat2
1036         aese            $dat3,q9
1037         aesmc           $dat3,$dat3
1038         aese            $dat4,q9
1039         aesmc           $dat4,$dat4
1040         vld1.32         {q9},[$key_],#16        // re-pre-load rndkey[1]
1041
1042         aese            $dat0,q12
1043         aesmc           $dat0,$dat0
1044          add            $tctr0,$ctr,#1
1045          add            $tctr1,$ctr,#2
1046         aese            $dat1,q12
1047         aesmc           $dat1,$dat1
1048          add            $tctr2,$ctr,#3
1049          add            w13,$ctr,#4
1050         aese            $dat2,q12
1051         aesmc           $dat2,$dat2
1052          add            w14,$ctr,#5
1053          rev            $tctr0,$tctr0
1054         aese            $dat3,q12
1055         aesmc           $dat3,$dat3
1056          rev            $tctr1,$tctr1
1057          rev            $tctr2,$tctr2
1058         aese            $dat4,q12
1059         aesmc           $dat4,$dat4
1060          rev            w13,w13
1061          rev            w14,w14
1062
1063         aese            $dat0,q13
1064         aesmc           $dat0,$dat0
1065         aese            $dat1,q13
1066         aesmc           $dat1,$dat1
1067         aese            $dat2,q13
1068         aesmc           $dat2,$dat2
1069         aese            $dat3,q13
1070         aesmc           $dat3,$dat3
1071         aese            $dat4,q13
1072         aesmc           $dat4,$dat4
1073
1074         aese            $dat0,q14
1075         aesmc           $dat0,$dat0
1076          vld1.8         {$in0},[$inp],#16
1077         aese            $dat1,q14
1078         aesmc           $dat1,$dat1
1079          vld1.8         {$in1},[$inp],#16
1080         aese            $dat2,q14
1081         aesmc           $dat2,$dat2
1082          vld1.8         {$in2},[$inp],#16
1083         aese            $dat3,q14
1084         aesmc           $dat3,$dat3
1085          vld1.8         {$in3},[$inp],#16
1086         aese            $dat4,q14
1087         aesmc           $dat4,$dat4
1088          vld1.8         {$in4},[$inp],#16
1089
1090         aese            $dat0,q15
1091          veor           $in0,$in0,$rndlast
1092         aese            $dat1,q15
1093          veor           $in1,$in1,$rndlast
1094         aese            $dat2,q15
1095          veor           $in2,$in2,$rndlast
1096         aese            $dat3,q15
1097          veor           $in3,$in3,$rndlast
1098         aese            $dat4,q15
1099          veor           $in4,$in4,$rndlast
1100
1101         veor            $in0,$in0,$dat0
1102          vorr           $dat0,$ivec,$ivec
1103         veor            $in1,$in1,$dat1
1104          vorr           $dat1,$ivec,$ivec
1105         veor            $in2,$in2,$dat2
1106          vorr           $dat2,$ivec,$ivec
1107         veor            $in3,$in3,$dat3
1108          vorr           $dat3,$ivec,$ivec
1109         veor            $in4,$in4,$dat4
1110          vorr           $dat4,$ivec,$ivec
1111
1112         vst1.8          {$in0},[$out],#16
1113          vmov.32        ${dat0}[3],$tctr0
1114         vst1.8          {$in1},[$out],#16
1115          vmov.32        ${dat1}[3],$tctr1
1116         vst1.8          {$in2},[$out],#16
1117          vmov.32        ${dat2}[3],$tctr2
1118         vst1.8          {$in3},[$out],#16
1119          vmov.32        ${dat3}[3],w13
1120         vst1.8          {$in4},[$out],#16
1121          vmov.32        ${dat4}[3],w14
1122
1123         mov             $cnt,$rounds
1124         cbz             $len,.Lctr32_done
1125
1126         add             $ctr,$ctr,#5
1127         subs            $len,$len,#5
1128         b.hs            .Loop5x_ctr32
1129
1130         add             $len,$len,#5
1131         sub             $ctr,$ctr,#5
1132
1133         cmp             $len,#2
1134         mov             $step,#16
1135         cclr            $step,lo
1136         b.ls            .Lctr32_tail
1137
1138         sub             $len,$len,#3            // bias
1139         add             $ctr,$ctr,#3
1140 ___
     # Three-block interleaved loop followed by the one/two-block tail (both
     # flavours).  .Loop3x_ctr32 runs the rounds on three counter blocks;
     # its final pass moves the state into $tmp0-2 so that $dat0-2 can be
     # re-seeded from $ivec with the next counter values while the last
     # rounds are still in flight, and it re-pre-loads q8/q9 from the start
     # of the key schedule for the next pass.  $len carries a bias of 3
     # while in the loop; it is undone on exit.
     #
     # .Lctr32_tail handles one or two remaining blocks using $dat0/$dat1.
     # $step is 16, or 0 when a single block remains, so the second input
     # load then re-reads the same block instead of advancing; the second
     # result is stored only when $len is not 1.
1141 $code.=<<___;
1142         b               .Loop3x_ctr32
1143
1144 .align  4
1145 .Loop3x_ctr32:
1146         aese            $dat0,q8
1147         aesmc           $dat0,$dat0
1148         aese            $dat1,q8
1149         aesmc           $dat1,$dat1
1150         aese            $dat2,q8
1151         aesmc           $dat2,$dat2
1152         vld1.32         {q8},[$key_],#16
1153         subs            $cnt,$cnt,#2
1154         aese            $dat0,q9
1155         aesmc           $dat0,$dat0
1156         aese            $dat1,q9
1157         aesmc           $dat1,$dat1
1158         aese            $dat2,q9
1159         aesmc           $dat2,$dat2
1160         vld1.32         {q9},[$key_],#16
1161         b.gt            .Loop3x_ctr32
1162
1163         aese            $dat0,q8
1164         aesmc           $tmp0,$dat0
1165         aese            $dat1,q8
1166         aesmc           $tmp1,$dat1
1167          vld1.8         {$in0},[$inp],#16
1168          vorr           $dat0,$ivec,$ivec
1169         aese            $dat2,q8
1170         aesmc           $dat2,$dat2
1171          vld1.8         {$in1},[$inp],#16
1172          vorr           $dat1,$ivec,$ivec
1173         aese            $tmp0,q9
1174         aesmc           $tmp0,$tmp0
1175         aese            $tmp1,q9
1176         aesmc           $tmp1,$tmp1
1177          vld1.8         {$in2},[$inp],#16
1178          mov            $key_,$key
1179         aese            $dat2,q9
1180         aesmc           $tmp2,$dat2
1181          vorr           $dat2,$ivec,$ivec
1182          add            $tctr0,$ctr,#1
1183         aese            $tmp0,q12
1184         aesmc           $tmp0,$tmp0
1185         aese            $tmp1,q12
1186         aesmc           $tmp1,$tmp1
1187          veor           $in0,$in0,$rndlast
1188          add            $tctr1,$ctr,#2
1189         aese            $tmp2,q12
1190         aesmc           $tmp2,$tmp2
1191          veor           $in1,$in1,$rndlast
1192          add            $ctr,$ctr,#3
1193         aese            $tmp0,q13
1194         aesmc           $tmp0,$tmp0
1195         aese            $tmp1,q13
1196         aesmc           $tmp1,$tmp1
1197          veor           $in2,$in2,$rndlast
1198          rev            $tctr0,$tctr0
1199         aese            $tmp2,q13
1200         aesmc           $tmp2,$tmp2
1201          vmov.32        ${dat0}[3], $tctr0
1202          rev            $tctr1,$tctr1
1203         aese            $tmp0,q14
1204         aesmc           $tmp0,$tmp0
1205         aese            $tmp1,q14
1206         aesmc           $tmp1,$tmp1
1207          vmov.32        ${dat1}[3], $tctr1
1208          rev            $tctr2,$ctr
1209         aese            $tmp2,q14
1210         aesmc           $tmp2,$tmp2
1211          vmov.32        ${dat2}[3], $tctr2
1212          subs           $len,$len,#3
1213         aese            $tmp0,q15
1214         aese            $tmp1,q15
1215         aese            $tmp2,q15
1216
1217         veor            $in0,$in0,$tmp0
1218          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
1219         vst1.8          {$in0},[$out],#16
1220         veor            $in1,$in1,$tmp1
1221          mov            $cnt,$rounds
1222         vst1.8          {$in1},[$out],#16
1223         veor            $in2,$in2,$tmp2
1224          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
1225         vst1.8          {$in2},[$out],#16
1226         b.hs            .Loop3x_ctr32
1227
1228         adds            $len,$len,#3
1229         b.eq            .Lctr32_done
1230         cmp             $len,#1
1231         mov             $step,#16
1232         cclr            $step,eq
1233
1234 .Lctr32_tail:
1235         aese            $dat0,q8
1236         aesmc           $dat0,$dat0
1237         aese            $dat1,q8
1238         aesmc           $dat1,$dat1
1239         vld1.32         {q8},[$key_],#16
1240         subs            $cnt,$cnt,#2
1241         aese            $dat0,q9
1242         aesmc           $dat0,$dat0
1243         aese            $dat1,q9
1244         aesmc           $dat1,$dat1
1245         vld1.32         {q9},[$key_],#16
1246         b.gt            .Lctr32_tail
1247
1248         aese            $dat0,q8
1249         aesmc           $dat0,$dat0
1250         aese            $dat1,q8
1251         aesmc           $dat1,$dat1
1252         aese            $dat0,q9
1253         aesmc           $dat0,$dat0
1254         aese            $dat1,q9
1255         aesmc           $dat1,$dat1
1256          vld1.8         {$in0},[$inp],$step
1257         aese            $dat0,q12
1258         aesmc           $dat0,$dat0
1259         aese            $dat1,q12
1260         aesmc           $dat1,$dat1
1261          vld1.8         {$in1},[$inp]
1262         aese            $dat0,q13
1263         aesmc           $dat0,$dat0
1264         aese            $dat1,q13
1265         aesmc           $dat1,$dat1
1266          veor           $in0,$in0,$rndlast
1267         aese            $dat0,q14
1268         aesmc           $dat0,$dat0
1269         aese            $dat1,q14
1270         aesmc           $dat1,$dat1
1271          veor           $in1,$in1,$rndlast
1272         aese            $dat0,q15
1273         aese            $dat1,q15
1274
1275         cmp             $len,#1
1276         veor            $in0,$in0,$dat0
1277         veor            $in1,$in1,$dat1
1278         vst1.8          {$in0},[$out],#16
1279         b.eq            .Lctr32_done
1280         vst1.8          {$in1},[$out]
1281
1282 .Lctr32_done:
1283 ___
     # 32-bit epilogue: restore d8-d15 and r4-r10, returning by popping the
     # saved lr straight into pc (mirrors the 32-bit prologue).
1284 $code.=<<___    if ($flavour !~ /64/);
1285         vldmia          sp!,{d8-d15}
1286         ldmia           sp!,{r4-r10,pc}
1287 ___
     # AArch64 epilogue: reload x29, release the 16-byte frame and return.
1288 $code.=<<___    if ($flavour =~ /64/);
1289         ldr             x29,[sp],#16
1290         ret
1291 ___
     # Record the symbol size for the routine.
1292 $code.=<<___;
1293 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
1294 ___
1295 }}}
     # Closes a preprocessor conditional that is opened outside this section
     # (NOTE(review): presumably the architecture guard around all of the
     # generated code -- confirm against the top of the file).
1296 $code.=<<___;
1297 #endif
1298 ___
1299 ########################################
     # Post-processing: $code is written in a hybrid ARM/AArch64 dialect; the
     # branch selected by $flavour rewrites it line by line into the syntax
     # of the target and prints the result to STDOUT.
1300 if ($flavour =~ /64/) {                 ######## 64-bit code
         # Base opcode words of the AES instructions, keyed by mnemonic.
1301     my %opcode = (
1302         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
1303         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
1304
         # unaes($mnemonic,$arg): encode "<mnemonic> <reg>,<reg>" as a raw
         # .inst word by OR-ing the first register number into the low bits
         # and the second one shifted left by 5, keeping the original text as
         # a trailing comment.  Yields a false value when $arg does not
         # contain two q/v register operands.  The only substitution in the
         # loop below that would call it is commented out, so it is currently
         # unused in this branch.
1305     local *unaes = sub {
1306         my ($mnemonic,$arg)=@_;
1307
1308         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
1309         sprintf ".inst\t0x%08x\t//%s %s",
1310                         $opcode{$mnemonic}|$1|($2<<5),
1311                         $mnemonic,$arg;
1312     };
1313
         # Translate every line of the generated code to AArch64 syntax and
         # print it.
1314     foreach(split("\n",$code)) {
             # expand `...` spans by evaluating their contents as Perl
1315         s/\`([^\`]*)\`/eval($1)/geo;
1316
             # q0-q7 become v0-v7.16b, q8 and up become v16 and up
             # (NOTE(review): presumably to stay clear of v8-v15, which are
             # partly callee-saved under AAPCS64 -- confirm)
1317         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
1318         s/@\s/\/\//o;                   # old->new style commentary
1319
             # The substitutions below are chained with "or", so only the
             # first one that matches is applied to a line.  cclr becomes a
             # csel that selects the zero register when the condition holds;
             # mov.<cond> becomes a csel that selects the source on <cond>.
1320         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
1321         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
1322         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
1323         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
1324         s/vext\.8/ext/o         or
1325         s/vrev32\.8/rev32/o     or
1326         s/vtst\.8/cmtst/o       or
1327         s/vshr/ushr/o           or
1328         s/^(\s+)v/$1/o          or      # strip off v prefix
1329         s/\bbx\s+lr\b/ret/o;
1330
1331         # fix up remaining legacy suffixes
             # - drop the first .8/.u8/.i8 element-size suffix
             # - lines with an 8-byte post-increment use .8b instead of .16b
             # - a .32 (.64) suffix is dropped and every .16b on the line
             #   becomes .4s (.2d)
             # - lane references lose the element count: .4s[n] -> .s[n]
1332         s/\.[ui]?8//o;
1333         m/\],#8/o and s/\.16b/\.8b/go;
1334         s/\.[ui]?32//o and s/\.16b/\.4s/go;
1335         s/\.[ui]?64//o and s/\.16b/\.2d/go;
1336         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;
1337
1338         print $_,"\n";
1339     }
1340 } else {                                ######## 32-bit code
         # Base opcode words of the AES instructions, keyed by mnemonic.
1341     my %opcode = (
1342         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
1343         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
1344
         # unaes($mnemonic,$arg): hand-assemble "<mnemonic> <reg>,<reg>" into
         # an opcode word.  For the first register D and the second register
         # M, (D&7)<<13 | (D&8)<<19 and (M&7)<<1 | (M&8)<<2 are OR-ed into the
         # base opcode.  The word is emitted byte by byte, least significant
         # byte first, through INST() (NOTE(review): INST is not defined in
         # this section -- presumably a macro set up earlier in the generated
         # output).  Nothing is formatted when $arg lacks two q/v register
         # operands.
1345     local *unaes = sub {
1346         my ($mnemonic,$arg)=@_;
1347
1348         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
1349             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
1350                                          |(($2&7)<<1) |(($2&8)<<2);
1351             # since ARMv7 instructions are always encoded little-endian.
1352             # correct solution is to use .inst directive, but older
1353             # assemblers don't implement it:-(
1354             sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
1355                         $word&0xff,($word>>8)&0xff,
1356                         ($word>>16)&0xff,($word>>24)&0xff,
1357                         $mnemonic,$arg;
1358         }
1359     };
1360
1361     sub unvtbl {
             # Rewrite the operands of a q-register "vtbl.8 qD,{qT},qI" as two
             # d-register lookups, one per half of qD/qI (d<2n> and d<2n+1>),
             # both indexing the same table qT.  Returns "" when the operand
             # string does not have that shape.
1362         my ($operands)=@_;
1363         my $half="vtbl.8 d%d,{q%d},d%d";    # one d-register lookup
1364         return "" unless $operands =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o;
1365         my ($dst,$table,$idx)=($1,$2,$3);
1366         return sprintf "$half\n\t$half", 2*$dst,$table,2*$idx, 2*$dst+1,$table,2*$idx+1;
1367     }
1368
1369     sub unvdup32 {
             # Rewrite the operands of "vdup.32 qD,qS[lane]" so that the source
             # is a d-register lane: lane n of qS is lane n%2 of d<2S + n/2>.
             # Returns "" when the operand string does not have that shape.
1370         my ($operands)=@_;
1371         my $pattern=qr/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o;
1372         my ($dst,$src,$lane) = $operands =~ m/$pattern/ or return "";
1373         return sprintf "vdup.32        q%d,d%d[%d]", $dst, 2*$src+($lane>>1), $lane&1;
1374     }
1375
1376     sub unvmov32 {
             # Rewrite the operands of "vmov.32 qD[lane],<src>" so that the
             # destination is a d-register lane: lane n of qD is lane n%2 of
             # d<2D + n/2>.  Everything after the comma is passed through
             # unchanged.  Returns "" when the operand string does not match.
1377         my ($operands)=@_;
1378         my $pattern=qr/q([0-9]+)\[([0-3])\],(.*)/o;
1379         my ($reg,$lane,$rest) = $operands =~ m/$pattern/ or return "";
1380         return sprintf "vmov.32        d%d[%d],%s", 2*$reg+($lane>>1), $lane&1, $rest;
1381     }
1382
         # Translate every line of the generated code to 32-bit ARM syntax
         # and print it.
1383     foreach(split("\n",$code)) {
             # expand `...` spans by evaluating their contents as Perl
1384         s/\`([^\`]*)\`/eval($1)/geo;
1385
             # NOTE(review): the v-register pattern only matches single-digit
             # register numbers (v0-v9).
1386         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
1387         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
1388         s/\/\/\s?/@ /o;                         # new->old style commentary
1389
1390         # fix up remaining new-style suffixes
             # ({qN},[addr],#8 becomes {d<2N>},[addr]!; any other immediate
             # post-increment is replaced by the "!" writeback marker)
1391         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
1392         s/\],#[0-9]+/]!/o;
1393
             # Chained with "or": only the first matching rewrite is applied.
             # AES instructions are hand-encoded by unaes(), cclr becomes a
             # conditional "mov.<cond> reg,#0", the vtbl/vdup/vmov q-register
             # forms are rewritten by their helpers, "b.<cond>" loses the dot
             # and "ret" becomes "bx lr".
1394         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
1395         s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2     $1,#0/o or
1396         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
1397         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
1398         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
1399         s/^(\s+)b\./$1b/o                               or
1400         s/^(\s+)ret/$1bx\tlr/o;
1401
             # A conditional mov.<cond> (including the ones produced from cclr
             # above) is printed as mov<cond> preceded by an "it <cond>" line
             # (NOTE(review): presumably required for Thumb-2 builds -- confirm).
1402         if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
1403             print "     it      $2\n";
1404         }
1405
1406         print $_,"\n";
1407     }
1408 }
1409
1410 close STDOUT or die "error closing STDOUT: $!";        # buffered write errors only surface at close; fail loudly instead of leaving truncated output