2 # Copyright 2016-2021 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # Initial support for Fujitsu SPARC64 X/X+ comprises minimally
20 # required key setup and single-block procedures.
24 # Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
25 # that the parallelizable nature of CBC decrypt and CTR is not utilized
26 # yet. CBC encrypt, on the other hand, is as good as it can possibly
27 # get, processing one byte in 4.1 cycles with a 128-bit key on SPARC64 X.
28 # This is ~6x faster than a pure software implementation...
32 # Switch from faligndata to fshiftorx, which makes it possible to omit
33 # alignaddr instructions and improves single-block and short-input
34 # performance with misaligned data.
# Take the optional output file name as the last command-line argument
# and, when present, redirect STDOUT to it so the generated assembly is
# written there.  Three-argument open avoids the 2-arg form's
# mode/filename parsing pitfalls, and a failed open is now fatal
# instead of silently ignored.
$output = pop;
open STDOUT, '>', $output or die "can't open $output: $!" if $output;
39 my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
43 # define __ASSEMBLER__ 1
45 #include "crypto/sparc_arch.h"
47 #define LOCALS (STACK_BIAS+STACK_FRAME)
54 and $inp, 7, $tmp ! is input aligned?
56 ldd [$key + 0], %f6 ! round[0]
59 ld [$key + 240], $rounds
62 add %o7, .Linp_align-1b, %o7
65 ldd [$inp + 0], %f0 ! load input
66 brz,pt $tmp, .Lenc_inp_aligned
69 ldd [%o7 + $tmp], %f14 ! shift left params
71 fshiftorx %f0, %f2, %f14, %f0
72 fshiftorx %f2, %f4, %f14, %f2
75 ldd [$key + 16], %f10 ! round[1]
78 fxor %f0, %f6, %f0 ! ^=round[0]
80 ldd [$key + 32], %f6 ! round[2]
83 sub $rounds, 4, $rounds
87 faesencx %f2, %f10, %f0
88 faesencx %f4, %f12, %f2
94 faesencx %f2, %f6, %f0
95 faesencx %f4, %f8, %f2
99 brnz,a $rounds, .Loop_enc
100 sub $rounds, 2, $rounds
102 andcc $out, 7, $tmp ! is output aligned?
105 srl $mask, $tmp, $mask
110 faesencx %f2, %f10, %f0
111 faesencx %f4, %f12, %f2
112 ldd [%o7 + $tmp], %f14 ! shift right params
115 faesenclx %f2, %f6, %f0
116 faesenclx %f4, %f8, %f2
118 bnz,pn %icc, .Lenc_out_unaligned
129 fshiftorx %f0, %f0, %f14, %f4
130 fshiftorx %f0, %f2, %f14, %f6
131 fshiftorx %f2, %f2, %f14, %f8
133 stda %f4, [$out + $mask]0xc0 ! partial store
135 stda %f8, [$inp + $tmp]0xc0 ! partial store
138 .type aes_fx_encrypt,#function
139 .size aes_fx_encrypt,.-aes_fx_encrypt
141 .globl aes_fx_decrypt
144 and $inp, 7, $tmp ! is input aligned?
146 ldd [$key + 0], %f6 ! round[0]
149 ld [$key + 240], $rounds
152 add %o7, .Linp_align-1b, %o7
155 ldd [$inp + 0], %f0 ! load input
156 brz,pt $tmp, .Ldec_inp_aligned
159 ldd [%o7 + $tmp], %f14 ! shift left params
161 fshiftorx %f0, %f2, %f14, %f0
162 fshiftorx %f2, %f4, %f14, %f2
165 ldd [$key + 16], %f10 ! round[1]
166 ldd [$key + 24], %f12
168 fxor %f0, %f6, %f0 ! ^=round[0]
170 ldd [$key + 32], %f6 ! round[2]
173 sub $rounds, 4, $rounds
177 faesdecx %f2, %f10, %f0
178 faesdecx %f4, %f12, %f2
179 ldd [$key + 16], %f10
180 ldd [$key + 24], %f12
184 faesdecx %f2, %f6, %f0
185 faesdecx %f4, %f8, %f2
189 brnz,a $rounds, .Loop_dec
190 sub $rounds, 2, $rounds
192 andcc $out, 7, $tmp ! is output aligned?
195 srl $mask, $tmp, $mask
200 faesdecx %f2, %f10, %f0
201 faesdecx %f4, %f12, %f2
202 ldd [%o7 + $tmp], %f14 ! shift right params
205 faesdeclx %f2, %f6, %f0
206 faesdeclx %f4, %f8, %f2
208 bnz,pn %icc, .Ldec_out_unaligned
219 fshiftorx %f0, %f0, %f14, %f4
220 fshiftorx %f0, %f2, %f14, %f6
221 fshiftorx %f2, %f2, %f14, %f8
223 stda %f4, [$out + $mask]0xc0 ! partial store
225 stda %f8, [$inp + $tmp]0xc0 ! partial store
228 .type aes_fx_decrypt,#function
229 .size aes_fx_decrypt,.-aes_fx_decrypt
233 my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
235 .globl aes_fx_set_decrypt_key
237 aes_fx_set_decrypt_key:
242 .type aes_fx_set_decrypt_key,#function
243 .size aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
245 .globl aes_fx_set_encrypt_key
247 aes_fx_set_encrypt_key:
257 add %o7, .Linp_align-1b, %o7
259 ldd [%o7 + $tmp], %f10 ! shift left params
269 brz,pt $tmp, .L256aligned
273 fshiftorx %f0, %f2, %f10, %f0
274 fshiftorx %f2, %f4, %f10, %f2
275 fshiftorx %f4, %f6, %f10, %f4
276 fshiftorx %f6, %f8, %f10, %f6
280 and $inc, `14*16`, $tmp
281 st $bits, [$out + 240] ! store rounds
282 add $out, $tmp, $out ! start or end of key schedule
283 sllx $inc, 4, $inc ! 16 or -16
285 for ($i=0; $i<6; $i++) {
288 faeskeyx %f6, `0x10+$i`, %f0
291 faeskeyx %f0, 0x00, %f2
293 faeskeyx %f2, 0x01, %f4
296 faeskeyx %f4, 0x00, %f6
301 faeskeyx %f6, `0x10+$i`, %f0
304 faeskeyx %f0, 0x00, %f2
311 xor %o0, %o0, %o0 ! return 0
315 brz,pt $tmp, .L192aligned
319 fshiftorx %f0, %f2, %f10, %f0
320 fshiftorx %f2, %f4, %f10, %f2
321 fshiftorx %f4, %f6, %f10, %f4
325 and $inc, `12*16`, $tmp
326 st $bits, [$out + 240] ! store rounds
327 add $out, $tmp, $out ! start or end of key schedule
328 sllx $inc, 4, $inc ! 16 or -16
330 for ($i=0; $i<8; $i+=2) {
333 faeskeyx %f4, `0x10+$i`, %f0
336 faeskeyx %f0, 0x00, %f2
338 faeskeyx %f2, 0x00, %f4
341 faeskeyx %f4, `0x10+$i+1`, %f0
343 faeskeyx %f0, 0x00, %f2
347 $code.=<<___ if ($i<6);
348 faeskeyx %f2, 0x00, %f4
355 xor %o0, %o0, %o0 ! return 0
359 brz,pt $tmp, .L128aligned
363 fshiftorx %f0, %f2, %f10, %f0
364 fshiftorx %f2, %f4, %f10, %f2
368 and $inc, `10*16`, $tmp
369 st $bits, [$out + 240] ! store rounds
370 add $out, $tmp, $out ! start or end of key schedule
371 sllx $inc, 4, $inc ! 16 or -16
373 for ($i=0; $i<10; $i++) {
376 faeskeyx %f2, `0x10+$i`, %f0
379 faeskeyx %f0, 0x00, %f2
386 xor %o0, %o0, %o0 ! return 0
387 .type aes_fx_set_encrypt_key,#function
388 .size aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
392 my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
393 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
394 my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
395 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
396 my ($ileft,$iright) = ($ialign,$oalign);
399 .globl aes_fx_cbc_encrypt
402 save %sp, -STACK_FRAME-16, %sp
406 brz,pn $len, .Lcbc_no_data
407 sll $ialign, 3, $ileft
410 add %o7, .Linp_align-1b, %o7
412 ld [$key + 240], $rounds
414 ld [$ivp + 0], %f0 ! load ivec
417 sll $oalign, 3, $mask
421 sll $rounds, 4, $rounds
422 add $rounds, $key, $end
423 ldd [$key + 0], $r0hi ! round[0]
424 ldd [$key + 8], $r0lo
428 ldd [$end + 0], $rlhi ! round[last]
429 ldd [$end + 8], $rllo
433 ldd [$key + 16], %f10 ! round[1]
434 ldd [$key + 24], %f12
436 ldd [%o7 + $ileft], $fshift ! shift left params
438 ldd [$inp - 16], $in0 ! load input
440 ldda [$inp]0x82, $intail ! non-faulting load
441 brz $dir, .Lcbc_decrypt
442 add $inp, $inc, $inp ! inp+=16
444 fxor $r0hi, %f0, %f0 ! ivec^=round[0]
446 fshiftorx $in0, $in1, $fshift, $in0
447 fshiftorx $in1, $intail, $fshift, $in1
451 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
453 ldd [$key + 32], %f6 ! round[2]
456 sub $rounds, 16*6, $inner
460 faesencx %f2, %f10, %f0
461 faesencx %f4, %f12, %f2
462 ldd [$end + 16], %f10
463 ldd [$end + 24], %f12
467 faesencx %f2, %f6, %f0
468 faesencx %f4, %f8, %f2
472 brnz,a $inner, .Lcbc_enc
473 sub $inner, 16*2, $inner
476 faesencx %f2, %f10, %f0
477 faesencx %f4, %f12, %f2
478 ldd [$end + 16], %f10 ! round[last-1]
479 ldd [$end + 24], %f12
483 ldd [$inp - 8], $in1 ! load next input block
484 ldda [$inp]0x82, $intail ! non-faulting load
485 add $inp, $inc, $inp ! inp+=16
488 faesencx %f2, %f6, %f0
489 faesencx %f4, %f8, %f2
491 fshiftorx $in0, $in1, $fshift, $in0
492 fshiftorx $in1, $intail, $fshift, $in1
495 faesencx %f2, %f10, %f0
496 faesencx %f4, %f12, %f2
497 ldd [$key + 16], %f10 ! round[1]
498 ldd [$key + 24], %f12
500 fxor $r0hi, $in0, $in0 ! inp^=round[0]
501 fxor $r0lo, $in1, $in1
504 faesenclx %f2, $rlhi, %f0
505 faesenclx %f4, $rllo, %f2
507 brnz,pn $oalign, .Lcbc_enc_unaligned_out
514 brnz,a $len, .Loop_cbc_enc
517 st %f0, [$ivp + 0] ! output ivec
527 .Lcbc_enc_unaligned_out:
528 ldd [%o7 + $mask], $fshift ! shift right params
530 srl $mask, $oalign, $mask
531 sub %g0, $ileft, $iright
533 fshiftorx %f0, %f0, $fshift, %f6
534 fshiftorx %f0, %f2, $fshift, %f8
536 stda %f6, [$out + $mask]0xc0 ! partial store
537 orn %g0, $mask, $mask
540 brz $len, .Lcbc_enc_unaligned_out_done
542 b .Loop_cbc_enc_unaligned_out
546 .Loop_cbc_enc_unaligned_out:
548 fxor $in0, %f0, %f0 ! inp^ivec^round[0]
550 ldd [$key + 32], %f6 ! round[2]
554 faesencx %f2, %f10, %f0
555 faesencx %f4, %f12, %f2
556 ldd [$key + 48], %f10 ! round[3]
557 ldd [$key + 56], %f12
561 brz $ileft, .Lcbc_enc_aligned_inp
565 sllx %o0, $ileft, %o0
566 srlx %o1, $iright, %g1
567 sllx %o1, $ileft, %o1
569 srlx %o2, $iright, %o2
572 .Lcbc_enc_aligned_inp:
574 faesencx %f2, %f6, %f0
575 faesencx %f4, %f8, %f2
576 ldd [$key + 64], %f6 ! round[4]
579 sub $rounds, 16*8, $inner
581 stx %o0, [%sp + LOCALS + 0]
582 stx %o1, [%sp + LOCALS + 8]
583 add $inp, $inc, $inp ! inp+=16
588 faesencx %f2, %f10, %f0
589 faesencx %f4, %f12, %f2
590 ldd [$end + 16], %f10
591 ldd [$end + 24], %f12
595 faesencx %f2, %f6, %f0
596 faesencx %f4, %f8, %f2
600 brnz,a $inner, .Lcbc_enc_unaligned
601 sub $inner, 16*2, $inner
604 faesencx %f2, %f10, %f0
605 faesencx %f4, %f12, %f2
606 ldd [$end + 16], %f10 ! round[last-1]
607 ldd [$end + 24], %f12
610 faesencx %f2, %f6, %f0
611 faesencx %f4, %f8, %f2
613 ldd [%sp + LOCALS + 0], $in0
614 ldd [%sp + LOCALS + 8], $in1
617 faesencx %f2, %f10, %f0
618 faesencx %f4, %f12, %f2
619 ldd [$key + 16], %f10 ! round[1]
620 ldd [$key + 24], %f12
622 fxor $r0hi, $in0, $in0 ! inp^=round[0]
623 fxor $r0lo, $in1, $in1
626 faesenclx %f2, $rlhi, %f0
627 faesenclx %f4, $rllo, %f2
629 fshiftorx $outhead, %f0, $fshift, %f6
630 fshiftorx %f0, %f2, $fshift, %f8
635 brnz,a $len, .Loop_cbc_enc_unaligned_out
638 .Lcbc_enc_unaligned_out_done:
639 fshiftorx %f2, %f2, $fshift, %f8
640 stda %f8, [$out + $mask]0xc0 ! partial store
642 st %f0, [$ivp + 0] ! output ivec
652 fshiftorx $in0, $in1, $fshift, $in0
653 fshiftorx $in1, $intail, $fshift, $in1
658 fxor $in0, $r0hi, %f0 ! inp^round[0]
659 fxor $in1, $r0lo, %f2
660 ldd [$key + 32], %f6 ! round[2]
663 sub $rounds, 16*6, $inner
667 faesdecx %f2, %f10, %f0
668 faesdecx %f4, %f12, %f2
669 ldd [$end + 16], %f10
670 ldd [$end + 24], %f12
674 faesdecx %f2, %f6, %f0
675 faesdecx %f4, %f8, %f2
679 brnz,a $inner, .Lcbc_dec
680 sub $inner, 16*2, $inner
683 faesdecx %f2, %f10, %f0
684 faesdecx %f4, %f12, %f2
685 ldd [$end + 16], %f10 ! round[last-1]
686 ldd [$end + 24], %f12
689 faesdecx %f2, %f6, %f0
690 faesdecx %f4, %f8, %f2
691 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
692 fxor $iv1, $rllo, %f8
698 ldd [$inp - 8], $in1 ! load next input block
699 ldda [$inp]0x82, $intail ! non-faulting load
700 add $inp, $inc, $inp ! inp+=16
703 faesdecx %f2, %f10, %f0
704 faesdecx %f4, %f12, %f2
705 ldd [$key + 16], %f10 ! round[1]
706 ldd [$key + 24], %f12
708 fshiftorx $in0, $in1, $fshift, $in0
709 fshiftorx $in1, $intail, $fshift, $in1
712 faesdeclx %f2, %f6, %f0
713 faesdeclx %f4, %f8, %f2
715 brnz,pn $oalign, .Lcbc_dec_unaligned_out
722 brnz,a $len, .Loop_cbc_dec
725 st $iv0, [$ivp + 0] ! output ivec
726 st $iv0#lo, [$ivp + 4]
728 st $iv1#lo, [$ivp + 12]
734 .Lcbc_dec_unaligned_out:
735 ldd [%o7 + $mask], $fshift ! shift right params
737 srl $mask, $oalign, $mask
738 sub %g0, $ileft, $iright
740 fshiftorx %f0, %f0, $fshift, %f6
741 fshiftorx %f0, %f2, $fshift, %f8
743 stda %f6, [$out + $mask]0xc0 ! partial store
744 orn %g0, $mask, $mask
747 brz $len, .Lcbc_dec_unaligned_out_done
749 b .Loop_cbc_dec_unaligned_out
753 .Loop_cbc_dec_unaligned_out:
755 fxor $in0, $r0hi, %f0 ! inp^round[0]
756 fxor $in1, $r0lo, %f2
757 ldd [$key + 32], %f6 ! round[2]
761 faesdecx %f2, %f10, %f0
762 faesdecx %f4, %f12, %f2
763 ldd [$key + 48], %f10 ! round[3]
764 ldd [$key + 56], %f12
768 brz $ileft, .Lcbc_dec_aligned_inp
772 sllx %o0, $ileft, %o0
773 srlx %o1, $iright, %g1
774 sllx %o1, $ileft, %o1
776 srlx %o2, $iright, %o2
779 .Lcbc_dec_aligned_inp:
781 faesdecx %f2, %f6, %f0
782 faesdecx %f4, %f8, %f2
783 ldd [$key + 64], %f6 ! round[4]
786 sub $rounds, 16*8, $inner
788 stx %o0, [%sp + LOCALS + 0]
789 stx %o1, [%sp + LOCALS + 8]
790 add $inp, $inc, $inp ! inp+=16
795 faesdecx %f2, %f10, %f0
796 faesdecx %f4, %f12, %f2
797 ldd [$end + 16], %f10
798 ldd [$end + 24], %f12
802 faesdecx %f2, %f6, %f0
803 faesdecx %f4, %f8, %f2
807 brnz,a $inner, .Lcbc_dec_unaligned
808 sub $inner, 16*2, $inner
811 faesdecx %f2, %f10, %f0
812 faesdecx %f4, %f12, %f2
813 ldd [$end + 16], %f10 ! round[last-1]
814 ldd [$end + 24], %f12
817 faesdecx %f2, %f6, %f0
818 faesdecx %f4, %f8, %f2
820 fxor $iv0, $rlhi, %f6 ! ivec^round[last]
821 fxor $iv1, $rllo, %f8
824 ldd [%sp + LOCALS + 0], $in0
825 ldd [%sp + LOCALS + 8], $in1
828 faesdecx %f2, %f10, %f0
829 faesdecx %f4, %f12, %f2
830 ldd [$key + 16], %f10 ! round[1]
831 ldd [$key + 24], %f12
834 faesdeclx %f2, %f6, %f0
835 faesdeclx %f4, %f8, %f2
837 fshiftorx $outhead, %f0, $fshift, %f6
838 fshiftorx %f0, %f2, $fshift, %f8
843 brnz,a $len, .Loop_cbc_dec_unaligned_out
846 .Lcbc_dec_unaligned_out_done:
847 fshiftorx %f2, %f2, $fshift, %f8
848 stda %f8, [$out + $mask]0xc0 ! partial store
850 st $iv0, [$ivp + 0] ! output ivec
851 st $iv0#lo, [$ivp + 4]
853 st $iv1#lo, [$ivp + 12]
857 .type aes_fx_cbc_encrypt,#function
858 .size aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
862 my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));
863 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
864 my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
865 = map("%f$_",grep { !($_ & 1) } (16 .. 62));
866 my ($ileft,$iright) = ($ialign, $oalign);
870 .globl aes_fx_ctr32_encrypt_blocks
872 aes_fx_ctr32_encrypt_blocks:
873 save %sp, -STACK_FRAME-16, %sp
877 brz,pn $len, .Lctr32_no_data
878 sll $ialign, 3, $ileft
881 add %o7, .Linp_align - .Lpic, %o7
883 ld [$key + 240], $rounds
885 ld [$ivp + 0], $ctr0 ! load counter
887 ld [$ivp + 4], $ctr0#lo
888 sll $oalign, 3, $mask
890 ld [$ivp + 12], $ctr1#lo
891 ldd [%o7 + 128], $one
893 sll $rounds, 4, $rounds
894 add $rounds, $key, $end
895 ldd [$key + 0], $r0hi ! round[0]
896 ldd [$key + 8], $r0lo
900 ldd [$key + 16], %f10 ! round[1]
901 ldd [$key + 24], %f12
905 ldd [$end + 0], $rlhi ! round[last]
906 ldd [$end + 8], $rllo
908 ldd [%o7 + $ileft], $fshift ! shiftleft params
910 ldd [$inp - 16], $in0 ! load input
912 ldda [$inp]0x82, $intail ! non-faulting load
913 add $inp, $inc, $inp ! inp+=16
915 fshiftorx $in0, $in1, $fshift, $in0
916 fshiftorx $in1, $intail, $fshift, $in1
919 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
920 fxor $ctr1, $r0lo, %f2
921 ldd [$key + 32], %f6 ! round[2]
924 sub $rounds, 16*6, $inner
928 faesencx %f2, %f10, %f0
929 faesencx %f4, %f12, %f2
930 ldd [$end + 16], %f10
931 ldd [$end + 24], %f12
935 faesencx %f2, %f6, %f0
936 faesencx %f4, %f8, %f2
940 brnz,a $inner, .Lctr32_enc
941 sub $inner, 16*2, $inner
944 faesencx %f2, %f10, %f0
945 faesencx %f4, %f12, %f2
946 ldd [$end + 16], %f10 ! round[last-1]
947 ldd [$end + 24], %f12
950 faesencx %f2, %f6, %f0
951 faesencx %f4, %f8, %f2
952 fxor $in0, $rlhi, %f6 ! inp^round[last]
953 fxor $in1, $rllo, %f8
957 ldd [$inp - 8], $in1 ! load next input block
958 ldda [$inp]0x82, $intail ! non-faulting load
959 add $inp, $inc, $inp ! inp+=16
962 faesencx %f2, %f10, %f0
963 faesencx %f4, %f12, %f2
964 ldd [$key + 16], %f10 ! round[1]
965 ldd [$key + 24], %f12
967 fshiftorx $in0, $in1, $fshift, $in0
968 fshiftorx $in1, $intail, $fshift, $in1
969 fpadd32 $ctr1, $one, $ctr1 ! increment counter
972 faesenclx %f2, %f6, %f0
973 faesenclx %f4, %f8, %f2
975 brnz,pn $oalign, .Lctr32_unaligned_out
982 brnz,a $len, .Loop_ctr32
990 .Lctr32_unaligned_out:
991 ldd [%o7 + $mask], $fshift ! shift right params
993 srl $mask, $oalign, $mask
994 sub %g0, $ileft, $iright
996 fshiftorx %f0, %f0, $fshift, %f6
997 fshiftorx %f0, %f2, $fshift, %f8
999 stda %f6, [$out + $mask]0xc0 ! partial store
1000 orn %g0, $mask, $mask
1003 brz $len, .Lctr32_unaligned_out_done
1005 b .Loop_ctr32_unaligned_out
1009 .Loop_ctr32_unaligned_out:
1011 fxor $ctr0, $r0hi, %f0 ! counter^round[0]
1012 fxor $ctr1, $r0lo, %f2
1013 ldd [$key + 32], %f6 ! round[2]
1014 ldd [$key + 40], %f8
1017 faesencx %f2, %f10, %f0
1018 faesencx %f4, %f12, %f2
1019 ldd [$key + 48], %f10 ! round[3]
1020 ldd [$key + 56], %f12
1022 ldx [$inp - 16], %o0
1024 brz $ileft, .Lctr32_aligned_inp
1028 sllx %o0, $ileft, %o0
1029 srlx %o1, $iright, %g1
1030 sllx %o1, $ileft, %o1
1032 srlx %o2, $iright, %o2
1035 .Lctr32_aligned_inp:
1037 faesencx %f2, %f6, %f0
1038 faesencx %f4, %f8, %f2
1039 ldd [$key + 64], %f6 ! round[4]
1040 ldd [$key + 72], %f8
1042 sub $rounds, 16*8, $inner
1044 stx %o0, [%sp + LOCALS + 0]
1045 stx %o1, [%sp + LOCALS + 8]
1046 add $inp, $inc, $inp ! inp+=16
1049 .Lctr32_enc_unaligned:
1051 faesencx %f2, %f10, %f0
1052 faesencx %f4, %f12, %f2
1053 ldd [$end + 16], %f10
1054 ldd [$end + 24], %f12
1058 faesencx %f2, %f6, %f0
1059 faesencx %f4, %f8, %f2
1063 brnz,a $inner, .Lctr32_enc_unaligned
1064 sub $inner, 16*2, $inner
1067 faesencx %f2, %f10, %f0
1068 faesencx %f4, %f12, %f2
1069 ldd [$end + 16], %f10 ! round[last-1]
1070 ldd [$end + 24], %f12
1071 fpadd32 $ctr1, $one, $ctr1 ! increment counter
1074 faesencx %f2, %f6, %f0
1075 faesencx %f4, %f8, %f2
1076 fxor $in0, $rlhi, %f6 ! inp^round[last]
1077 fxor $in1, $rllo, %f8
1078 ldd [%sp + LOCALS + 0], $in0
1079 ldd [%sp + LOCALS + 8], $in1
1082 faesencx %f2, %f10, %f0
1083 faesencx %f4, %f12, %f2
1084 ldd [$key + 16], %f10 ! round[1]
1085 ldd [$key + 24], %f12
1088 faesenclx %f2, %f6, %f0
1089 faesenclx %f4, %f8, %f2
1091 fshiftorx $outhead, %f0, $fshift, %f6
1092 fshiftorx %f0, %f2, $fshift, %f8
1097 brnz,a $len, .Loop_ctr32_unaligned_out
1100 .Lctr32_unaligned_out_done:
1101 fshiftorx %f2, %f2, $fshift, %f8
1102 stda %f8, [$out + $mask]0xc0 ! partial store
1106 .type aes_fx_ctr32_encrypt_blocks,#function
1107 .size aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
! Shift-parameter tables consumed by fshiftorx.  Each 8-byte row
! corresponds to one possible byte misalignment; the realignment code
! loads a row with ldd and feeds it to fshiftorx to merge two adjacent
! doublewords when input/output is not 8-byte aligned.
! NOTE(review): the per-byte field meaning is fixed by the fshiftorx
! operand format - confirm against the SPARC64 X extension manual.
1110 .Linp_align: ! fshiftorx parameters for left shift toward %rs1
1111 .byte 0, 0, 64, 0, 0, 64, 0, -64
1112 .byte 0, 0, 56, 8, 0, 56, 8, -56
1113 .byte 0, 0, 48, 16, 0, 48, 16, -48
1114 .byte 0, 0, 40, 24, 0, 40, 24, -40
1115 .byte 0, 0, 32, 32, 0, 32, 32, -32
1116 .byte 0, 0, 24, 40, 0, 24, 40, -24
1117 .byte 0, 0, 16, 48, 0, 16, 48, -16
1118 .byte 0, 0, 8, 56, 0, 8, 56, -8
1119 .Lout_align: ! fshiftorx parameters for right shift toward %rs2
1120 .byte 0, 0, 0, 64, 0, 0, 64, 0
1121 .byte 0, 0, 8, 56, 0, 8, 56, -8
1122 .byte 0, 0, 16, 48, 0, 16, 48, -16
1123 .byte 0, 0, 24, 40, 0, 24, 40, -24
1124 .byte 0, 0, 32, 32, 0, 32, 32, -32
1125 .byte 0, 0, 40, 24, 0, 40, 24, -40
1126 .byte 0, 0, 48, 16, 0, 48, 16, -48
1127 .byte 0, 0, 56, 8, 0, 56, 8, -56
1130 .asciz "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1134 # Purpose of these subroutines is to explicitly encode VIS instructions,
1135 # so that one can compile the module without having to specify VIS
1136 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1137 # Idea is to reserve for option to produce "universal" binary and let
1138 # programmer detect if current CPU is VIS capable at run-time.
# NOTE(review): fragment of a VIS-instruction encoder; the enclosing
# "sub" header and closing brace are outside this excerpt.  Per the
# commentary above, these helpers hand-encode extension instructions as
# raw .word directives so the module assembles even when the assembler
# was not invoked with VIS support enabled.
1140 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Opcode-field (opf) values for the VIS mnemonics handled here.
1142 my %visopf = ( "faligndata" => 0x048,
1143 "bshuffle" => 0x04c,
# Fallback: emit the instruction verbatim if it cannot be encoded.
1148 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1150 if ($opf=$visopf{$mnemonic}) {
# Every operand must parse as an FP register, and must be even-numbered
# (a double-precision pair); otherwise fall back to the textual form.
1151 foreach ($rs1,$rs2,$rd) {
1152 return $ref if (!/%f([0-9]{1,2})/);
1155 return $ref if ($1&1);
1156 # re-encode for upper double register addressing
# Pack rd/rs1/opf/rs2 into the FPop bit fields of the base opcode and
# keep the original instruction text as an assembly comment.
1161 return sprintf ".word\t0x%08x !%s",
1162 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for integer-register VIS ops
# (alignaddr/alignaddrl); the enclosing "sub" header and the operand
# re-numbering lines are outside this excerpt.
1170 my ($mnemonic,$rs1,$rs2,$rd)=@_;
# Map the register-class letter (%g/%o/%l/%i) to its numeric base so a
# register like %o3 can be turned into its 5-bit encoding.
1171 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1173 my %visopf = ( "alignaddr" => 0x018,
1175 "alignaddrl" => 0x01a );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1177 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1179 if ($opf=$visopf{$mnemonic}) {
# All operands must parse as integer registers.
1180 foreach ($rs1,$rs2,$rd) {
1181 return $ref if (!/%([goli])([0-9])/);
# Pack rd/rs1/opf/rs2 into the bit fields of the base opcode, keeping
# the original instruction text as an assembly comment.
1185 return sprintf ".word\t0x%08x !%s",
1186 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for the Fujitsu AES-extension
# instructions (faesencx/faesenclx/faesdeclx/faeskeyx, and presumably
# faesdecx on a line missing from this excerpt); the enclosing "sub"
# header is not visible here.
1194 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1196 my %aesopf = ( "faesencx" => 0x90,
1198 "faesenclx" => 0x92,
1199 "faesdeclx" => 0x93,
1200 "faeskeyx" => 0x94 );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1202 $ref = "$mnemonic\t$rs1,$rs2,$rd";
1204 if (defined($opf=$aesopf{$mnemonic})) {
# rs2 is either an even FP register (fold bit 5 down for upper-bank
# double-register addressing) or a numeric immediate (the faeskeyx
# round-constant operand); oct() normalizes 0/0x-prefixed immediates.
1205 $rs2 = ($rs2 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs2;
1206 $rs2 = oct($rs2) if ($rs2 =~ /^0/);
# rs1 and rd must be even-numbered FP registers, else fall back.
1208 foreach ($rs1,$rd) {
1209 return $ref if (!/%f([0-9]{1,2})/);
1212 return $ref if ($1&1);
1213 # re-encode for upper double register addressing
# Pack the fields into the op=2 / op3=0x36 format, keeping the original
# instruction text as an assembly comment.
1218 return sprintf ".word\t0x%08x !%s",
1219 2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
# NOTE(review): fragment of the encoder for the 4-operand fshiftorx
# instruction; the enclosing "sub" header and the operand re-numbering
# lines are outside this excerpt.
1227 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1229 my %aesopf = ( "fshiftorx" => 0x0b );
# Fallback: emit the instruction verbatim if it cannot be encoded.
1231 $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1233 if (defined($opf=$aesopf{$mnemonic})) {
# All four operands must be even-numbered FP registers, else fall back.
1234 foreach ($rs1,$rs2,$rs3,$rd) {
1235 return $ref if (!/%f([0-9]{1,2})/);
1238 return $ref if ($1&1);
1239 # re-encode for upper double register addressing
# Pack the fields into the op=2 / op3=0x37 three-source format, keeping
# the original instruction text as an assembly comment.
1244 return sprintf ".word\t0x%08x !%s",
1245 2<<30|$rd<<25|0x37<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
# Post-processing pass: walk the generated assembly line by line,
# evaluate compile-time arithmetic, then rewrite extension mnemonics the
# assembler may not understand into raw .word encodings via the helper
# subs above.  NOTE(review): several lines of this loop (the
# substitution replacement parts, the closing brace, and the final
# print of $_) are missing from this excerpt.
1252 foreach (split("\n",$code)) {
# Evaluate back-ticked expressions such as `0x10+$i` emitted above.
1253 s/\`([^\`]*)\`/eval $1/ge;
# %fN#lo denotes the odd (low) half of an even/odd FP register pair.
1255 s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;
# 3-operand AES-extension ops (faes...x) -> encoded via unfx().
1257 s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
# 4-operand FP ops (fshiftorx) -> encoded via unfx3src().
1260 s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
1261 &unfx3src($1,$2,$3,$4,$5)
# 3-operand VIS FP ops -> encoded via unvis().
1263 s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
# alignaddr/alignaddrl on integer registers -> encoded via unvis3().
1266 s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1267 &unvis3($1,$2,$3,$4)
# A failed close means truncated assembly output, so make it fatal.
1272 close STDOUT or die "error closing STDOUT: $!";