ARMv8 assembly pack: add Samsung Mongoose results.
[openssl.git] / crypto / aes / asm / aesv8-armx.pl
1 #! /usr/bin/env perl
2 # Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved.
3 #
4 # Licensed under the OpenSSL license (the "License").  You may not use
5 # this file except in compliance with the License.  You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
8
9 #
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
16 #
17 # This module implements support for ARMv8 AES instructions. The
18 # module is endian-agnostic in sense that it supports both big- and
19 # little-endian cases. It likewise supports both 32- and 64-bit modes
20 # of operation. Latter is achieved by limiting amount of utilized
21 # registers to 16, which implies additional NEON load and integer
22 # instructions. This has no effect on mighty Apple A7, where results
23 # are literally equal to the theoretical estimates based on AES
24 # instruction latencies and issue rates. On Cortex-A53, an in-order
25 # execution core, this costs up to 10-15%, which is partially
26 # compensated by implementing dedicated code path for 128-bit
27 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
28 # seems to be limited by sheer amount of NEON instructions...
29 #
30 # Performance in cycles per byte processed with 128-bit key:
31 #
32 #               CBC enc         CBC dec         CTR
33 # Apple A7      2.39            1.20            1.20
34 # Cortex-A53    1.32            1.29            1.46
35 # Cortex-A57(*) 1.95            0.85            0.93
36 # Denver        1.96            0.86            0.80
37 # Mongoose      1.33            1.20            1.20
38 #
39 # (*)   original 3.64/1.34/1.32 results were for r0p0 revision
40 #       and are still the same even for the updated module;
41
# Command-line arguments: the assembly "flavour" (e.g. linux64, linux32,
# ios64) and the output file name; both are forwarded to arm-xlate.pl below.
42 $flavour = shift;
43 $output  = shift;
44
# Locate the arm-xlate.pl transliteration helper relative to this script:
# first alongside it, then in the shared perlasm directory.
45 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
46 ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
47 ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
48 die "can't locate arm-xlate.pl";
49
# Pipe everything we print through arm-xlate.pl, which adapts the mixed
# 32-/64-bit syntax used below to the requested flavour; STDOUT is aliased
# to the pipe so plain print statements reach it.
50 open OUT,"| \"$^X\" $xlate $flavour $output";
51 *STDOUT=*OUT;
52
# All exported symbols are prefixed with this string.
53 $prefix="aes_v8";
54
# Emitted file prologue: the whole module is guarded by __ARM_MAX_ARCH__>=7
# so it compiles to nothing where the crypto extensions cannot exist.
55 $code=<<___;
56 #include "arm_arch.h"
57
58 #if __ARM_MAX_ARCH__>=7
59 .text
60 ___
# Architecture directives differ per flavour; see the note just below.
61 $code.=".arch   armv8-a+crypto\n"                       if ($flavour =~ /64/);
62 $code.=".arch   armv7-a\n.fpu   neon\n.code     32\n"   if ($flavour !~ /64/);
63                 #^^^^^^ this is done to simplify adoption by not depending
64                 #       on latest binutils.
65
66 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
67 # NEON is mostly 32-bit mnemonics, integer - mostly 64. Goal is to
68 # maintain both 32- and 64-bit codes within single module and
69 # transliterate common code to either flavour with regex voodoo.
70 #
# Key-schedule generation: ${prefix}_set_encrypt_key / ${prefix}_set_decrypt_key.
71 {{{
# Integer argument/scratch registers and the NEON registers used below.
# Note the 32-bit flavour maps onto q0-q3,q8-q10, skipping q4-q7 —
# presumably to stay clear of the callee-saved d8-d15 range (see the
# "ABI specification says so" saves in later routines) — TODO confirm.
72 my ($inp,$bits,$out,$ptr,$rounds)=("x0","w1","x2","x3","w12");
73 my ($zero,$rcon,$mask,$in0,$in1,$tmp,$key)=
74         $flavour=~/64/? map("q$_",(0..6)) : map("q$_",(0..3,8..10));
75
76
# Round-constant table and entry point; arguments are checked for NULL
# pointers and for a key size of exactly 128/192/256 bits, with negative
# return codes (-1/-2) signalled through $ptr on the abort paths.
77 $code.=<<___;
78 .align  5
79 .Lrcon:
80 .long   0x01,0x01,0x01,0x01
81 .long   0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d     // rotate-n-splat
82 .long   0x1b,0x1b,0x1b,0x1b
83
84 .globl  ${prefix}_set_encrypt_key
85 .type   ${prefix}_set_encrypt_key,%function
86 .align  5
87 ${prefix}_set_encrypt_key:
88 .Lenc_key:
89 ___
# AArch64-only frame setup.
90 $code.=<<___    if ($flavour =~ /64/);
91         stp     x29,x30,[sp,#-16]!
92         add     x29,sp,#0
93 ___
# Main body, shared between flavours: dedicated expansion loops for
# 128- (.Loop128), 192- (.L192/.Loop192) and 256-bit (.L256/.Loop256) keys.
94 $code.=<<___;
95         mov     $ptr,#-1
96         cmp     $inp,#0
97         b.eq    .Lenc_key_abort
98         cmp     $out,#0
99         b.eq    .Lenc_key_abort
100         mov     $ptr,#-2
101         cmp     $bits,#128
102         b.lt    .Lenc_key_abort
103         cmp     $bits,#256
104         b.gt    .Lenc_key_abort
105         tst     $bits,#0x3f
106         b.ne    .Lenc_key_abort
107
108         adr     $ptr,.Lrcon
109         cmp     $bits,#192
110
111         veor    $zero,$zero,$zero
112         vld1.8  {$in0},[$inp],#16
113         mov     $bits,#8                // reuse $bits
114         vld1.32 {$rcon,$mask},[$ptr],#32
115
116         b.lt    .Loop128
117         b.eq    .L192
118         b       .L256
119
120 .align  4
121 .Loop128:
122         vtbl.8  $key,{$in0},$mask
123         vext.8  $tmp,$zero,$in0,#12
124         vst1.32 {$in0},[$out],#16
125         aese    $key,$zero
126         subs    $bits,$bits,#1
127
128         veor    $in0,$in0,$tmp
129         vext.8  $tmp,$zero,$tmp,#12
130         veor    $in0,$in0,$tmp
131         vext.8  $tmp,$zero,$tmp,#12
132          veor   $key,$key,$rcon
133         veor    $in0,$in0,$tmp
134         vshl.u8 $rcon,$rcon,#1
135         veor    $in0,$in0,$key
136         b.ne    .Loop128
137
138         vld1.32 {$rcon},[$ptr]
139
140         vtbl.8  $key,{$in0},$mask
141         vext.8  $tmp,$zero,$in0,#12
142         vst1.32 {$in0},[$out],#16
143         aese    $key,$zero
144
145         veor    $in0,$in0,$tmp
146         vext.8  $tmp,$zero,$tmp,#12
147         veor    $in0,$in0,$tmp
148         vext.8  $tmp,$zero,$tmp,#12
149          veor   $key,$key,$rcon
150         veor    $in0,$in0,$tmp
151         vshl.u8 $rcon,$rcon,#1
152         veor    $in0,$in0,$key
153
154         vtbl.8  $key,{$in0},$mask
155         vext.8  $tmp,$zero,$in0,#12
156         vst1.32 {$in0},[$out],#16
157         aese    $key,$zero
158
159         veor    $in0,$in0,$tmp
160         vext.8  $tmp,$zero,$tmp,#12
161         veor    $in0,$in0,$tmp
162         vext.8  $tmp,$zero,$tmp,#12
163          veor   $key,$key,$rcon
164         veor    $in0,$in0,$tmp
165         veor    $in0,$in0,$key
166         vst1.32 {$in0},[$out]
167         add     $out,$out,#0x50
168
169         mov     $rounds,#10
170         b       .Ldone
171
172 .align  4
173 .L192:
174         vld1.8  {$in1},[$inp],#8
175         vmov.i8 $key,#8                 // borrow $key
176         vst1.32 {$in0},[$out],#16
177         vsub.i8 $mask,$mask,$key        // adjust the mask
178
179 .Loop192:
180         vtbl.8  $key,{$in1},$mask
181         vext.8  $tmp,$zero,$in0,#12
182         vst1.32 {$in1},[$out],#8
183         aese    $key,$zero
184         subs    $bits,$bits,#1
185
186         veor    $in0,$in0,$tmp
187         vext.8  $tmp,$zero,$tmp,#12
188         veor    $in0,$in0,$tmp
189         vext.8  $tmp,$zero,$tmp,#12
190         veor    $in0,$in0,$tmp
191
192         vdup.32 $tmp,${in0}[3]
193         veor    $tmp,$tmp,$in1
194          veor   $key,$key,$rcon
195         vext.8  $in1,$zero,$in1,#12
196         vshl.u8 $rcon,$rcon,#1
197         veor    $in1,$in1,$tmp
198         veor    $in0,$in0,$key
199         veor    $in1,$in1,$key
200         vst1.32 {$in0},[$out],#16
201         b.ne    .Loop192
202
203         mov     $rounds,#12
204         add     $out,$out,#0x20
205         b       .Ldone
206
207 .align  4
208 .L256:
209         vld1.8  {$in1},[$inp]
210         mov     $bits,#7
211         mov     $rounds,#14
212         vst1.32 {$in0},[$out],#16
213
214 .Loop256:
215         vtbl.8  $key,{$in1},$mask
216         vext.8  $tmp,$zero,$in0,#12
217         vst1.32 {$in1},[$out],#16
218         aese    $key,$zero
219         subs    $bits,$bits,#1
220
221         veor    $in0,$in0,$tmp
222         vext.8  $tmp,$zero,$tmp,#12
223         veor    $in0,$in0,$tmp
224         vext.8  $tmp,$zero,$tmp,#12
225          veor   $key,$key,$rcon
226         veor    $in0,$in0,$tmp
227         vshl.u8 $rcon,$rcon,#1
228         veor    $in0,$in0,$key
229         vst1.32 {$in0},[$out],#16
230         b.eq    .Ldone
231
232         vdup.32 $key,${in0}[3]          // just splat
233         vext.8  $tmp,$zero,$in1,#12
234         aese    $key,$zero
235
236         veor    $in1,$in1,$tmp
237         vext.8  $tmp,$zero,$tmp,#12
238         veor    $in1,$in1,$tmp
239         vext.8  $tmp,$zero,$tmp,#12
240         veor    $in1,$in1,$tmp
241
242         veor    $in1,$in1,$key
243         b       .Loop256
244
245 .Ldone:
246         str     $rounds,[$out]
247         mov     $ptr,#0
248
249 .Lenc_key_abort:
250         mov     x0,$ptr                 // return value
251         `"ldr   x29,[sp],#16"           if ($flavour =~ /64/)`
252         ret
253 .size   ${prefix}_set_encrypt_key,.-${prefix}_set_encrypt_key
254
255 .globl  ${prefix}_set_decrypt_key
256 .type   ${prefix}_set_decrypt_key,%function
257 .align  5
258 ${prefix}_set_decrypt_key:
259 ___
# set_decrypt_key: per-flavour prologue, then build the encrypt schedule
# via .Lenc_key and convert it in place (reverse round-key order, apply
# aesimc to the inner keys).
260 $code.=<<___    if ($flavour =~ /64/);
261         stp     x29,x30,[sp,#-16]!
262         add     x29,sp,#0
263 ___
264 $code.=<<___    if ($flavour !~ /64/);
265         stmdb   sp!,{r4,lr}
266 ___
267 $code.=<<___;
268         bl      .Lenc_key
269
270         cmp     x0,#0
271         b.ne    .Ldec_key_abort
272
273         sub     $out,$out,#240          // restore original $out
274         mov     x4,#-16
275         add     $inp,$out,x12,lsl#4     // end of key schedule
276
277         vld1.32 {v0.16b},[$out]
278         vld1.32 {v1.16b},[$inp]
279         vst1.32 {v0.16b},[$inp],x4
280         vst1.32 {v1.16b},[$out],#16
281
282 .Loop_imc:
283         vld1.32 {v0.16b},[$out]
284         vld1.32 {v1.16b},[$inp]
285         aesimc  v0.16b,v0.16b
286         aesimc  v1.16b,v1.16b
287         vst1.32 {v0.16b},[$inp],x4
288         vst1.32 {v1.16b},[$out],#16
289         cmp     $inp,$out
290         b.hi    .Loop_imc
291
292         vld1.32 {v0.16b},[$out]
293         aesimc  v0.16b,v0.16b
294         vst1.32 {v0.16b},[$inp]
295
296         eor     x0,x0,x0                // return value
297 .Ldec_key_abort:
298 ___
# Per-flavour epilogues.
299 $code.=<<___    if ($flavour !~ /64/);
300         ldmia   sp!,{r4,pc}
301 ___
302 $code.=<<___    if ($flavour =~ /64/);
303         ldp     x29,x30,[sp],#16
304         ret
305 ___
306 $code.=<<___;
307 .size   ${prefix}_set_decrypt_key,.-${prefix}_set_decrypt_key
308 ___
309 }}}
310 {{{
# Emit a single-block routine: ${prefix}_encrypt or ${prefix}_decrypt.
# $dir is "en" or "de" and selects the aese/aesmc vs aesd/aesimc mnemonic
# pair; the two emitted routines are otherwise identical.
#
# NOTE(review): this sub was declared with an empty prototype,
# "sub gen_block ()", and then invoked as &gen_block("en") — the "&"
# call form exists solely to bypass the prototype check.  The prototype
# served no purpose and was a latent trap for any future plain call, so
# it is dropped and the calls are made in plain style; emitted code is
# unchanged.
311 sub gen_block {
312 my $dir = shift;
313 my ($e,$mc) = $dir eq "en" ? ("e","mc") : ("d","imc");
314 my ($inp,$out,$key)=map("x$_",(0..2));
315 my $rounds="w3";
316 my ($rndkey0,$rndkey1,$inout)=map("q$_",(0..3));
317
# Rounds are consumed two at a time in .Loop_${dir}c, with the final two
# handled after the loop (last round has no MixColumns / InvMixColumns).
318 $code.=<<___;
319 .globl  ${prefix}_${dir}crypt
320 .type   ${prefix}_${dir}crypt,%function
321 .align  5
322 ${prefix}_${dir}crypt:
323         ldr     $rounds,[$key,#240]
324         vld1.32 {$rndkey0},[$key],#16
325         vld1.8  {$inout},[$inp]
326         sub     $rounds,$rounds,#2
327         vld1.32 {$rndkey1},[$key],#16

328
329 .Loop_${dir}c:
330         aes$e   $inout,$rndkey0
331         aes$mc  $inout,$inout
332         vld1.32 {$rndkey0},[$key],#16
333         subs    $rounds,$rounds,#2
334         aes$e   $inout,$rndkey1
335         aes$mc  $inout,$inout
336         vld1.32 {$rndkey1},[$key],#16
337         b.gt    .Loop_${dir}c

338
339         aes$e   $inout,$rndkey0
340         aes$mc  $inout,$inout
341         vld1.32 {$rndkey0},[$key]
342         aes$e   $inout,$rndkey1
343         veor    $inout,$inout,$rndkey0

344
345         vst1.8  {$inout},[$out]
346         ret
347 .size   ${prefix}_${dir}crypt,.-${prefix}_${dir}crypt
348 ___
349 }
350 gen_block("en");
351 gen_block("de");
352 }}}
# CBC mode: ${prefix}_cbc_encrypt handles both directions, selected by the
# fifth argument; decryption is parallelized three blocks at a time.
353 {{{
# Register assignments.  Note $rounds aliases $enc (w5) and $step1 aliases
# x12; the aliasing is deliberate, the values are live at different times.
354 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
355 my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
356 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
357
358 my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
359 my ($key4,$key5,$key6,$key7)=("x6","x12","x14",$key);
360
361 ### q8-q15      preloaded key schedule
362
363 $code.=<<___;
364 .globl  ${prefix}_cbc_encrypt
365 .type   ${prefix}_cbc_encrypt,%function
366 .align  5
367 ${prefix}_cbc_encrypt:
368 ___
# Per-flavour prologues; the 32-bit one must save d8-d15 and fetch the
# stack-passed arguments.
369 $code.=<<___    if ($flavour =~ /64/);
370         stp     x29,x30,[sp,#-16]!
371         add     x29,sp,#0
372 ___
373 $code.=<<___    if ($flavour !~ /64/);
374         mov     ip,sp
375         stmdb   sp!,{r4-r8,lr}
376         vstmdb  sp!,{d8-d15}            @ ABI specification says so
377         ldmia   ip,{r4-r5}              @ load remaining args
378 ___
# Common setup: preload the last 7 round keys into q10-q15/$rndlast, then
# dispatch to .Lcbc_dec, the dedicated 128-bit encrypt path .Lcbc_enc128,
# or the generic encrypt loop.
379 $code.=<<___;
380         subs    $len,$len,#16
381         mov     $step,#16
382         b.lo    .Lcbc_abort
383         cclr    $step,eq
384
385         cmp     $enc,#0                 // en- or decrypting?
386         ldr     $rounds,[$key,#240]
387         and     $len,$len,#-16
388         vld1.8  {$ivec},[$ivp]
389         vld1.8  {$dat},[$inp],$step
390
391         vld1.32 {q8-q9},[$key]          // load key schedule...
392         sub     $rounds,$rounds,#6
393         add     $key_,$key,x5,lsl#4     // pointer to last 7 round keys
394         sub     $rounds,$rounds,#2
395         vld1.32 {q10-q11},[$key_],#32
396         vld1.32 {q12-q13},[$key_],#32
397         vld1.32 {q14-q15},[$key_],#32
398         vld1.32 {$rndlast},[$key_]
399
400         add     $key_,$key,#32
401         mov     $cnt,$rounds
402         b.eq    .Lcbc_dec
403
404         cmp     $rounds,#2
405         veor    $dat,$dat,$ivec
406         veor    $rndzero_n_last,q8,$rndlast
407         b.eq    .Lcbc_enc128
408
409         vld1.32 {$in0-$in1},[$key_]
410         add     $key_,$key,#16
411         add     $key4,$key,#16*4
412         add     $key5,$key,#16*5
413         aese    $dat,q8
414         aesmc   $dat,$dat
415         add     $key6,$key,#16*6
416         add     $key7,$key,#16*7
417         b       .Lenter_cbc_enc
418
419 .align  4
420 .Loop_cbc_enc:
421         aese    $dat,q8
422         aesmc   $dat,$dat
423          vst1.8 {$ivec},[$out],#16
424 .Lenter_cbc_enc:
425         aese    $dat,q9
426         aesmc   $dat,$dat
427         aese    $dat,$in0
428         aesmc   $dat,$dat
429         vld1.32 {q8},[$key4]
430         cmp     $rounds,#4
431         aese    $dat,$in1
432         aesmc   $dat,$dat
433         vld1.32 {q9},[$key5]
434         b.eq    .Lcbc_enc192
435
436         aese    $dat,q8
437         aesmc   $dat,$dat
438         vld1.32 {q8},[$key6]
439         aese    $dat,q9
440         aesmc   $dat,$dat
441         vld1.32 {q9},[$key7]
442         nop
443
444 .Lcbc_enc192:
445         aese    $dat,q8
446         aesmc   $dat,$dat
447          subs   $len,$len,#16
448         aese    $dat,q9
449         aesmc   $dat,$dat
450          cclr   $step,eq
451         aese    $dat,q10
452         aesmc   $dat,$dat
453         aese    $dat,q11
454         aesmc   $dat,$dat
455          vld1.8 {q8},[$inp],$step
456         aese    $dat,q12
457         aesmc   $dat,$dat
458          veor   q8,q8,$rndzero_n_last
459         aese    $dat,q13
460         aesmc   $dat,$dat
461          vld1.32 {q9},[$key_]           // re-pre-load rndkey[1]
462         aese    $dat,q14
463         aesmc   $dat,$dat
464         aese    $dat,q15
465         veor    $ivec,$dat,$rndlast
466         b.hs    .Loop_cbc_enc
467
468         vst1.8  {$ivec},[$out],#16
469         b       .Lcbc_done
470
471 .align  5
472 .Lcbc_enc128:
473         vld1.32 {$in0-$in1},[$key_]
474         aese    $dat,q8
475         aesmc   $dat,$dat
476         b       .Lenter_cbc_enc128
477 .Loop_cbc_enc128:
478         aese    $dat,q8
479         aesmc   $dat,$dat
480          vst1.8 {$ivec},[$out],#16
481 .Lenter_cbc_enc128:
482         aese    $dat,q9
483         aesmc   $dat,$dat
484          subs   $len,$len,#16
485         aese    $dat,$in0
486         aesmc   $dat,$dat
487          cclr   $step,eq
488         aese    $dat,$in1
489         aesmc   $dat,$dat
490         aese    $dat,q10
491         aesmc   $dat,$dat
492         aese    $dat,q11
493         aesmc   $dat,$dat
494          vld1.8 {q8},[$inp],$step
495         aese    $dat,q12
496         aesmc   $dat,$dat
497         aese    $dat,q13
498         aesmc   $dat,$dat
499         aese    $dat,q14
500         aesmc   $dat,$dat
501          veor   q8,q8,$rndzero_n_last
502         aese    $dat,q15
503         veor    $ivec,$dat,$rndlast
504         b.hs    .Loop_cbc_enc128
505
506         vst1.8  {$ivec},[$out],#16
507         b       .Lcbc_done
508 ___
# Decrypt path: processes three blocks per iteration in .Loop3x_cbc_dec,
# with a tail for the final one or two blocks.
509 {
510 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
511 $code.=<<___;
512 .align  5
513 .Lcbc_dec:
514         vld1.8  {$dat2},[$inp],#16
515         subs    $len,$len,#32           // bias
516         add     $cnt,$rounds,#2
517         vorr    $in1,$dat,$dat
518         vorr    $dat1,$dat,$dat
519         vorr    $in2,$dat2,$dat2
520         b.lo    .Lcbc_dec_tail
521
522         vorr    $dat1,$dat2,$dat2
523         vld1.8  {$dat2},[$inp],#16
524         vorr    $in0,$dat,$dat
525         vorr    $in1,$dat1,$dat1
526         vorr    $in2,$dat2,$dat2
527
528 .Loop3x_cbc_dec:
529         aesd    $dat0,q8
530         aesimc  $dat0,$dat0
531         aesd    $dat1,q8
532         aesimc  $dat1,$dat1
533         aesd    $dat2,q8
534         aesimc  $dat2,$dat2
535         vld1.32 {q8},[$key_],#16
536         subs    $cnt,$cnt,#2
537         aesd    $dat0,q9
538         aesimc  $dat0,$dat0
539         aesd    $dat1,q9
540         aesimc  $dat1,$dat1
541         aesd    $dat2,q9
542         aesimc  $dat2,$dat2
543         vld1.32 {q9},[$key_],#16
544         b.gt    .Loop3x_cbc_dec
545
546         aesd    $dat0,q8
547         aesimc  $dat0,$dat0
548         aesd    $dat1,q8
549         aesimc  $dat1,$dat1
550         aesd    $dat2,q8
551         aesimc  $dat2,$dat2
552          veor   $tmp0,$ivec,$rndlast
553          subs   $len,$len,#0x30
554          veor   $tmp1,$in0,$rndlast
555          mov.lo x6,$len                 // x6, $cnt, is zero at this point
556         aesd    $dat0,q9
557         aesimc  $dat0,$dat0
558         aesd    $dat1,q9
559         aesimc  $dat1,$dat1
560         aesd    $dat2,q9
561         aesimc  $dat2,$dat2
562          veor   $tmp2,$in1,$rndlast
563          add    $inp,$inp,x6            // $inp is adjusted in such way that
564                                         // at exit from the loop $dat1-$dat2
565                                         // are loaded with last "words"
566          vorr   $ivec,$in2,$in2
567          mov    $key_,$key
568         aesd    $dat0,q12
569         aesimc  $dat0,$dat0
570         aesd    $dat1,q12
571         aesimc  $dat1,$dat1
572         aesd    $dat2,q12
573         aesimc  $dat2,$dat2
574          vld1.8 {$in0},[$inp],#16
575         aesd    $dat0,q13
576         aesimc  $dat0,$dat0
577         aesd    $dat1,q13
578         aesimc  $dat1,$dat1
579         aesd    $dat2,q13
580         aesimc  $dat2,$dat2
581          vld1.8 {$in1},[$inp],#16
582         aesd    $dat0,q14
583         aesimc  $dat0,$dat0
584         aesd    $dat1,q14
585         aesimc  $dat1,$dat1
586         aesd    $dat2,q14
587         aesimc  $dat2,$dat2
588          vld1.8 {$in2},[$inp],#16
589         aesd    $dat0,q15
590         aesd    $dat1,q15
591         aesd    $dat2,q15
592          vld1.32 {q8},[$key_],#16       // re-pre-load rndkey[0]
593          add    $cnt,$rounds,#2
594         veor    $tmp0,$tmp0,$dat0
595         veor    $tmp1,$tmp1,$dat1
596         veor    $dat2,$dat2,$tmp2
597          vld1.32 {q9},[$key_],#16       // re-pre-load rndkey[1]
598         vst1.8  {$tmp0},[$out],#16
599          vorr   $dat0,$in0,$in0
600         vst1.8  {$tmp1},[$out],#16
601          vorr   $dat1,$in1,$in1
602         vst1.8  {$dat2},[$out],#16
603          vorr   $dat2,$in2,$in2
604         b.hs    .Loop3x_cbc_dec
605
606         cmn     $len,#0x30
607         b.eq    .Lcbc_done
608         nop
609
610 .Lcbc_dec_tail:
611         aesd    $dat1,q8
612         aesimc  $dat1,$dat1
613         aesd    $dat2,q8
614         aesimc  $dat2,$dat2
615         vld1.32 {q8},[$key_],#16
616         subs    $cnt,$cnt,#2
617         aesd    $dat1,q9
618         aesimc  $dat1,$dat1
619         aesd    $dat2,q9
620         aesimc  $dat2,$dat2
621         vld1.32 {q9},[$key_],#16
622         b.gt    .Lcbc_dec_tail
623
624         aesd    $dat1,q8
625         aesimc  $dat1,$dat1
626         aesd    $dat2,q8
627         aesimc  $dat2,$dat2
628         aesd    $dat1,q9
629         aesimc  $dat1,$dat1
630         aesd    $dat2,q9
631         aesimc  $dat2,$dat2
632         aesd    $dat1,q12
633         aesimc  $dat1,$dat1
634         aesd    $dat2,q12
635         aesimc  $dat2,$dat2
636          cmn    $len,#0x20
637         aesd    $dat1,q13
638         aesimc  $dat1,$dat1
639         aesd    $dat2,q13
640         aesimc  $dat2,$dat2
641          veor   $tmp1,$ivec,$rndlast
642         aesd    $dat1,q14
643         aesimc  $dat1,$dat1
644         aesd    $dat2,q14
645         aesimc  $dat2,$dat2
646          veor   $tmp2,$in1,$rndlast
647         aesd    $dat1,q15
648         aesd    $dat2,q15
649         b.eq    .Lcbc_dec_one
650         veor    $tmp1,$tmp1,$dat1
651         veor    $tmp2,$tmp2,$dat2
652          vorr   $ivec,$in2,$in2
653         vst1.8  {$tmp1},[$out],#16
654         vst1.8  {$tmp2},[$out],#16
655         b       .Lcbc_done
656
657 .Lcbc_dec_one:
658         veor    $tmp1,$tmp1,$dat2
659          vorr   $ivec,$in2,$in2
660         vst1.8  {$tmp1},[$out],#16
661
662 .Lcbc_done:
663         vst1.8  {$ivec},[$ivp]
664 .Lcbc_abort:
665 ___
666 }
# Per-flavour epilogues.
667 $code.=<<___    if ($flavour !~ /64/);
668         vldmia  sp!,{d8-d15}
669         ldmia   sp!,{r4-r8,pc}
670 ___
671 $code.=<<___    if ($flavour =~ /64/);
672         ldr     x29,[sp],#16
673         ret
674 ___
675 $code.=<<___;
676 .size   ${prefix}_cbc_encrypt,.-${prefix}_cbc_encrypt
677 ___
678 }}}
# CTR mode: ${prefix}_ctr32_encrypt_blocks, three blocks per iteration with
# a one/two-block tail; only the low 32 bits of the counter are incremented.
679 {{{
680 my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4));
681 my ($rounds,$cnt,$key_)=("w5","w6","x7");
682 my ($ctr,$tctr0,$tctr1,$tctr2)=map("w$_",(8..10,12));
683 my $step="x12";         # aliases with $tctr2
684
685 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
686 my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
687
688 my ($dat,$tmp)=($dat0,$tmp0);
689
690 ### q8-q15      preloaded key schedule
691
692 $code.=<<___;
693 .globl  ${prefix}_ctr32_encrypt_blocks
694 .type   ${prefix}_ctr32_encrypt_blocks,%function
695 .align  5
696 ${prefix}_ctr32_encrypt_blocks:
697 ___
# Per-flavour prologues; 32-bit flavour saves d8-d15 and fetches the
# stack-passed fifth argument.
698 $code.=<<___    if ($flavour =~ /64/);
699         stp             x29,x30,[sp,#-16]!
700         add             x29,sp,#0
701 ___
702 $code.=<<___    if ($flavour !~ /64/);
703         mov             ip,sp
704         stmdb           sp!,{r4-r10,lr}
705         vstmdb          sp!,{d8-d15}            @ ABI specification says so
706         ldr             r4, [ip]                @ load remaining arg
707 ___
# Common body: preload last 5 round keys, keep the counter block in $ivec,
# and maintain three counter copies ($dat0-$dat2) through .Loop3x_ctr32.
708 $code.=<<___;
709         ldr             $rounds,[$key,#240]
710
711         ldr             $ctr, [$ivp, #12]
712         vld1.32         {$dat0},[$ivp]
713
714         vld1.32         {q8-q9},[$key]          // load key schedule...
715         sub             $rounds,$rounds,#4
716         mov             $step,#16
717         cmp             $len,#2
718         add             $key_,$key,x5,lsl#4     // pointer to last 5 round keys
719         sub             $rounds,$rounds,#2
720         vld1.32         {q12-q13},[$key_],#32
721         vld1.32         {q14-q15},[$key_],#32
722         vld1.32         {$rndlast},[$key_]
723         add             $key_,$key,#32
724         mov             $cnt,$rounds
725         cclr            $step,lo
726 #ifndef __ARMEB__
727         rev             $ctr, $ctr
728 #endif
729         vorr            $dat1,$dat0,$dat0
730         add             $tctr1, $ctr, #1
731         vorr            $dat2,$dat0,$dat0
732         add             $ctr, $ctr, #2
733         vorr            $ivec,$dat0,$dat0
734         rev             $tctr1, $tctr1
735         vmov.32         ${dat1}[3],$tctr1
736         b.ls            .Lctr32_tail
737         rev             $tctr2, $ctr
738         sub             $len,$len,#3            // bias
739         vmov.32         ${dat2}[3],$tctr2
740         b               .Loop3x_ctr32
741
742 .align  4
743 .Loop3x_ctr32:
744         aese            $dat0,q8
745         aesmc           $dat0,$dat0
746         aese            $dat1,q8
747         aesmc           $dat1,$dat1
748         aese            $dat2,q8
749         aesmc           $dat2,$dat2
750         vld1.32         {q8},[$key_],#16
751         subs            $cnt,$cnt,#2
752         aese            $dat0,q9
753         aesmc           $dat0,$dat0
754         aese            $dat1,q9
755         aesmc           $dat1,$dat1
756         aese            $dat2,q9
757         aesmc           $dat2,$dat2
758         vld1.32         {q9},[$key_],#16
759         b.gt            .Loop3x_ctr32
760
761         aese            $dat0,q8
762         aesmc           $tmp0,$dat0
763         aese            $dat1,q8
764         aesmc           $tmp1,$dat1
765          vld1.8         {$in0},[$inp],#16
766          vorr           $dat0,$ivec,$ivec
767         aese            $dat2,q8
768         aesmc           $dat2,$dat2
769          vld1.8         {$in1},[$inp],#16
770          vorr           $dat1,$ivec,$ivec
771         aese            $tmp0,q9
772         aesmc           $tmp0,$tmp0
773         aese            $tmp1,q9
774         aesmc           $tmp1,$tmp1
775          vld1.8         {$in2},[$inp],#16
776          mov            $key_,$key
777         aese            $dat2,q9
778         aesmc           $tmp2,$dat2
779          vorr           $dat2,$ivec,$ivec
780          add            $tctr0,$ctr,#1
781         aese            $tmp0,q12
782         aesmc           $tmp0,$tmp0
783         aese            $tmp1,q12
784         aesmc           $tmp1,$tmp1
785          veor           $in0,$in0,$rndlast
786          add            $tctr1,$ctr,#2
787         aese            $tmp2,q12
788         aesmc           $tmp2,$tmp2
789          veor           $in1,$in1,$rndlast
790          add            $ctr,$ctr,#3
791         aese            $tmp0,q13
792         aesmc           $tmp0,$tmp0
793         aese            $tmp1,q13
794         aesmc           $tmp1,$tmp1
795          veor           $in2,$in2,$rndlast
796          rev            $tctr0,$tctr0
797         aese            $tmp2,q13
798         aesmc           $tmp2,$tmp2
799          vmov.32        ${dat0}[3], $tctr0
800          rev            $tctr1,$tctr1
801         aese            $tmp0,q14
802         aesmc           $tmp0,$tmp0
803         aese            $tmp1,q14
804         aesmc           $tmp1,$tmp1
805          vmov.32        ${dat1}[3], $tctr1
806          rev            $tctr2,$ctr
807         aese            $tmp2,q14
808         aesmc           $tmp2,$tmp2
809          vmov.32        ${dat2}[3], $tctr2
810          subs           $len,$len,#3
811         aese            $tmp0,q15
812         aese            $tmp1,q15
813         aese            $tmp2,q15
814
815         veor            $in0,$in0,$tmp0
816          vld1.32         {q8},[$key_],#16       // re-pre-load rndkey[0]
817         vst1.8          {$in0},[$out],#16
818         veor            $in1,$in1,$tmp1
819          mov            $cnt,$rounds
820         vst1.8          {$in1},[$out],#16
821         veor            $in2,$in2,$tmp2
822          vld1.32         {q9},[$key_],#16       // re-pre-load rndkey[1]
823         vst1.8          {$in2},[$out],#16
824         b.hs            .Loop3x_ctr32
825
826         adds            $len,$len,#3
827         b.eq            .Lctr32_done
828         cmp             $len,#1
829         mov             $step,#16
830         cclr            $step,eq
831
832 .Lctr32_tail:
833         aese            $dat0,q8
834         aesmc           $dat0,$dat0
835         aese            $dat1,q8
836         aesmc           $dat1,$dat1
837         vld1.32         {q8},[$key_],#16
838         subs            $cnt,$cnt,#2
839         aese            $dat0,q9
840         aesmc           $dat0,$dat0
841         aese            $dat1,q9
842         aesmc           $dat1,$dat1
843         vld1.32         {q9},[$key_],#16
844         b.gt            .Lctr32_tail
845
846         aese            $dat0,q8
847         aesmc           $dat0,$dat0
848         aese            $dat1,q8
849         aesmc           $dat1,$dat1
850         aese            $dat0,q9
851         aesmc           $dat0,$dat0
852         aese            $dat1,q9
853         aesmc           $dat1,$dat1
854          vld1.8         {$in0},[$inp],$step
855         aese            $dat0,q12
856         aesmc           $dat0,$dat0
857         aese            $dat1,q12
858         aesmc           $dat1,$dat1
859          vld1.8         {$in1},[$inp]
860         aese            $dat0,q13
861         aesmc           $dat0,$dat0
862         aese            $dat1,q13
863         aesmc           $dat1,$dat1
864          veor           $in0,$in0,$rndlast
865         aese            $dat0,q14
866         aesmc           $dat0,$dat0
867         aese            $dat1,q14
868         aesmc           $dat1,$dat1
869          veor           $in1,$in1,$rndlast
870         aese            $dat0,q15
871         aese            $dat1,q15
872
873         cmp             $len,#1
874         veor            $in0,$in0,$dat0
875         veor            $in1,$in1,$dat1
876         vst1.8          {$in0},[$out],#16
877         b.eq            .Lctr32_done
878         vst1.8          {$in1},[$out]
879
880 .Lctr32_done:
881 ___
# Per-flavour epilogues.
882 $code.=<<___    if ($flavour !~ /64/);
883         vldmia          sp!,{d8-d15}
884         ldmia           sp!,{r4-r10,pc}
885 ___
886 $code.=<<___    if ($flavour =~ /64/);
887         ldr             x29,[sp],#16
888         ret
889 ___
890 $code.=<<___;
891 .size   ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
892 ___
893 }}}
# Close the __ARM_MAX_ARCH__>=7 guard opened in the prologue.
894 $code.=<<___;
895 #endif
896 ___
# Post-process the accumulated $code and print it: the mixed 32-/64-bit
# mnemonics above are transliterated to pure 64-bit (first branch) or pure
# 32-bit (second branch) syntax.  The "or"-chained substitutions apply at
# most one rewrite per line — their order is significant.
897 ########################################
898 if ($flavour =~ /64/) {                 ######## 64-bit code
899     my %opcode = (
900         "aesd"  =>      0x4e285800,     "aese"  =>      0x4e284800,
901         "aesimc"=>      0x4e287800,     "aesmc" =>      0x4e286800      );
902
    # Encode an AES instruction as a raw .inst word for assemblers that
    # lack the crypto mnemonics (currently disabled below).
903     local *unaes = sub {
904         my ($mnemonic,$arg)=@_;
905
906         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o   &&
907         sprintf ".inst\t0x%08x\t//%s %s",
908                         $opcode{$mnemonic}|$1|($2<<5),
909                         $mnemonic,$arg;
910     };
911
912     foreach(split("\n",$code)) {
913         s/\`([^\`]*)\`/eval($1)/geo;
914
915         s/\bq([0-9]+)\b/"v".($1<8?$1:$1+8).".16b"/geo;  # old->new registers
916         s/@\s/\/\//o;                   # old->new style commentary
917
918         #s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo     or
919         s/cclr\s+([wx])([^,]+),\s*([a-z]+)/csel $1$2,$1zr,$1$2,$3/o     or
920         s/mov\.([a-z]+)\s+([wx][0-9]+),\s*([wx][0-9]+)/csel     $2,$3,$2,$1/o   or
921         s/vmov\.i8/movi/o       or      # fix up legacy mnemonics
922         s/vext\.8/ext/o         or
923         s/vrev32\.8/rev32/o     or
924         s/vtst\.8/cmtst/o       or
925         s/vshr/ushr/o           or
926         s/^(\s+)v/$1/o          or      # strip off v prefix
927         s/\bbx\s+lr\b/ret/o;

928
929         # fix up remaining legacy suffixes
930         s/\.[ui]?8//o;
931         m/\],#8/o and s/\.16b/\.8b/go;
932         s/\.[ui]?32//o and s/\.16b/\.4s/go;
933         s/\.[ui]?64//o and s/\.16b/\.2d/go;
934         s/\.[42]([sd])\[([0-3])\]/\.$1\[$2\]/o;

935
936         print $_,"\n";
937     }
938 } else {                                ######## 32-bit code
939     my %opcode = (
940         "aesd"  =>      0xf3b00340,     "aese"  =>      0xf3b00300,
941         "aesimc"=>      0xf3b003c0,     "aesmc" =>      0xf3b00380      );
942
    # Encode an AES instruction as raw .byte data for old 32-bit assemblers.
943     local *unaes = sub {
944         my ($mnemonic,$arg)=@_;
945
946         if ($arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)/o) {
947             my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
948                                          |(($2&7)<<1) |(($2&8)<<2);
949             # since ARMv7 instructions are always encoded little-endian.
950             # correct solution is to use .inst directive, but older
951             # assemblers don't implement it:-(
952             sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
953                         $word&0xff,($word>>8)&0xff,
954                         ($word>>16)&0xff,($word>>24)&0xff,
955                         $mnemonic,$arg;
956         }
957     };
958
    # vtbl with a q-register index becomes two d-register vtbl.8 ops.
959     sub unvtbl {
960         my $arg=shift;
961
962         $arg =~ m/q([0-9]+),\s*\{q([0-9]+)\},\s*q([0-9]+)/o &&
963         sprintf "vtbl.8 d%d,{q%d},d%d\n\t".
964                 "vtbl.8 d%d,{q%d},d%d", 2*$1,$2,2*$3, 2*$1+1,$2,2*$3+1; 
965     }
966
    # 64-bit lane splat -> 32-bit vdup from the corresponding d-register lane.
967     sub unvdup32 {
968         my $arg=shift;
969
970         $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
971         sprintf "vdup.32        q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;      
972     }
973
    # 64-bit lane move -> 32-bit vmov into the corresponding d-register lane.
974     sub unvmov32 {
975         my $arg=shift;
976
977         $arg =~ m/q([0-9]+)\[([0-3])\],(.*)/o &&
978         sprintf "vmov.32        d%d[%d],%s",2*$1+($2>>1),$2&1,$3;       
979     }
980
981     foreach(split("\n",$code)) {
982         s/\`([^\`]*)\`/eval($1)/geo;
983
984         s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
985         s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
986         s/\/\/\s?/@ /o;                         # new->old style commentary
987
988         # fix up remaining new-style suffixes
989         s/\{q([0-9]+)\},\s*\[(.+)\],#8/sprintf "{d%d},[$2]!",2*$1/eo    or
990         s/\],#[0-9]+/]!/o;

991
992         s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo      or
993         s/cclr\s+([^,]+),\s*([a-z]+)/mov$2      $1,#0/o or
994         s/vtbl\.8\s+(.*)/unvtbl($1)/geo                 or
995         s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
996         s/vmov\.32\s+(.*)/unvmov32($1)/geo              or
997         s/^(\s+)b\./$1b/o                               or
998         s/^(\s+)mov\./$1mov/o                           or
999         s/^(\s+)ret/$1bx\tlr/o;

1000
1001         print $_,"\n";
1002     }
1003 }
1004
# STDOUT is a pipe into arm-xlate.pl (see the open above); an unchecked
# close would silently discard buffered-write errors and a non-zero exit
# of the child, producing a truncated .S file without any diagnostic.
1005 close STDOUT or die "error closing STDOUT: $!";