crypto/aes/asm/aesfx-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16
  17 # March 2016
  18 #
  19 # Initial support for Fujitsu SPARC64 X/X+ comprises minimally
  20 # required key setup and single-block procedures.
  21 #
  22 # April 2016
  23 #
  24 # Add "teaser" CBC and CTR mode-specific subroutines. "Teaser" means
  25 # that the parallelizable nature of CBC decrypt and CTR is not utilized
  26 # yet. CBC encrypt, on the other hand, is as good as it can possibly
  27 # get, processing one byte in 4.1 cycles with 128-bit key on SPARC64 X.
  28 # This is ~6x faster than a pure software implementation...
  29 #
  30 # July 2016
  31 #
  32 # Switch from faligndata to fshiftorx, which allows omitting alignaddr
  33 # instructions and improves single-block and short-input performance
  34 # with misaligned data.
  35
  36 $output = pop;
     # Redirect STDOUT to the file named by the last command-line argument.
     # The three-argument form keeps characters in the name from being read
     # as part of the open mode, and a failed open is reported instead of
     # silently discarding the generated code.  With no argument at all,
     # STDOUT is left alone so the output can still be piped.
  37 if (defined $output) { open STDOUT,">",$output or die "can't open $output: $!"; }
  38
  39 {
     # Single-block routines aes_fx_encrypt and aes_fx_decrypt.
     #
     # The names below are interpolated into the assembly template as
     # %o0..%o5.  $inp, $out and $key are read by the code without being
     # set first, so they act as the incoming arguments (input block,
     # output block, key schedule); $rounds, $tmp and $mask are scratch.
     # The round count is fetched from offset 240 of the key schedule.
     #
     # Misaligned input is realigned with fshiftorx using parameters
     # loaded from the .Linp_align table (defined outside this view); the
     # parameters used for misaligned output are read 64 bytes further
     # into that table, and the output edges are then written with
     # partial stores (stda ... 0xc0) governed by $mask.  %o7 is borrowed
     # for PC-relative addressing and restored from %g1 before returning.
  40 my ($inp,$out,$key,$rounds,$tmp,$mask) = map("%o$_",(0..5));
  41
  42 $code.=<<___;
  43 #include "sparc_arch.h"
  44
  45 #define LOCALS (STACK_BIAS+STACK_FRAME)
  46
  47 .text
  48
  49 .globl  aes_fx_encrypt
  50 .align  32
  51 aes_fx_encrypt:
  52         and             $inp, 7, $tmp           ! is input aligned?
  53         andn            $inp, 7, $inp
  54         ldd             [$key +  0], %f6        ! round[0]
  55         ldd             [$key +  8], %f8
  56         mov             %o7, %g1
  57         ld              [$key + 240], $rounds
  58
  59 1:      call            .+8
  60         add             %o7, .Linp_align-1b, %o7
  61
  62         sll             $tmp, 3, $tmp
  63         ldd             [$inp + 0], %f0         ! load input
  64         brz,pt          $tmp, .Lenc_inp_aligned
  65         ldd             [$inp + 8], %f2
  66
  67         ldd             [%o7 + $tmp], %f14      ! shift left params
  68         ldd             [$inp + 16], %f4
  69         fshiftorx       %f0, %f2, %f14, %f0
  70         fshiftorx       %f2, %f4, %f14, %f2
  71
  72 .Lenc_inp_aligned:
  73         ldd             [$key + 16], %f10       ! round[1]
  74         ldd             [$key + 24], %f12
  75
  76         fxor            %f0, %f6, %f0           ! ^=round[0]
  77         fxor            %f2, %f8, %f2
  78         ldd             [$key + 32], %f6        ! round[2]
  79         ldd             [$key + 40], %f8
  80         add             $key, 32, $key
  81         sub             $rounds, 4, $rounds
  82
  83 .Loop_enc:
  84         fmovd           %f0, %f4
  85         faesencx        %f2, %f10, %f0
  86         faesencx        %f4, %f12, %f2
  87         ldd             [$key + 16], %f10
  88         ldd             [$key + 24], %f12
  89         add             $key, 32, $key
  90
  91         fmovd           %f0, %f4
  92         faesencx        %f2, %f6, %f0
  93         faesencx        %f4, %f8, %f2
  94         ldd             [$key +  0], %f6
  95         ldd             [$key +  8], %f8
  96
  97         brnz,a          $rounds, .Loop_enc
  98         sub             $rounds, 2, $rounds
  99
 100         andcc           $out, 7, $tmp           ! is output aligned?
 101         andn            $out, 7, $out
 102         mov             0xff, $mask
 103         srl             $mask, $tmp, $mask
 104         add             %o7, 64, %o7
 105         sll             $tmp, 3, $tmp
 106
 107         fmovd           %f0, %f4
 108         faesencx        %f2, %f10, %f0
 109         faesencx        %f4, %f12, %f2
 110         ldd             [%o7 + $tmp], %f14      ! shift right params
 111
 112         fmovd           %f0, %f4
 113         faesenclx       %f2, %f6, %f0
 114         faesenclx       %f4, %f8, %f2
 115
 116         bnz,pn          %icc, .Lenc_out_unaligned
 117         mov             %g1, %o7
 118
 119         std             %f0, [$out + 0]
 120         retl
 121         std             %f2, [$out + 8]
 122
 123 .align  16
 124 .Lenc_out_unaligned:
 125         add             $out, 16, $inp
 126         orn             %g0, $mask, $tmp
 127         fshiftorx       %f0, %f0, %f14, %f4
 128         fshiftorx       %f0, %f2, %f14, %f6
 129         fshiftorx       %f2, %f2, %f14, %f8
 130
 131         stda            %f4, [$out + $mask]0xc0 ! partial store
 132         std             %f6, [$out + 8]
 133         stda            %f8, [$inp + $tmp]0xc0  ! partial store
 134         retl
 135         nop
 136 .type   aes_fx_encrypt,#function
 137 .size   aes_fx_encrypt,.-aes_fx_encrypt
 138
 139 .globl  aes_fx_decrypt
 140 .align  32
 141 aes_fx_decrypt:
 142         and             $inp, 7, $tmp           ! is input aligned?
 143         andn            $inp, 7, $inp
 144         ldd             [$key +  0], %f6        ! round[0]
 145         ldd             [$key +  8], %f8
 146         mov             %o7, %g1
 147         ld              [$key + 240], $rounds
 148
 149 1:      call            .+8
 150         add             %o7, .Linp_align-1b, %o7
 151
 152         sll             $tmp, 3, $tmp
 153         ldd             [$inp + 0], %f0         ! load input
 154         brz,pt          $tmp, .Ldec_inp_aligned
 155         ldd             [$inp + 8], %f2
 156
 157         ldd             [%o7 + $tmp], %f14      ! shift left params
 158         ldd             [$inp + 16], %f4
 159         fshiftorx       %f0, %f2, %f14, %f0
 160         fshiftorx       %f2, %f4, %f14, %f2
 161
 162 .Ldec_inp_aligned:
 163         ldd             [$key + 16], %f10       ! round[1]
 164         ldd             [$key + 24], %f12
 165
 166         fxor            %f0, %f6, %f0           ! ^=round[0]
 167         fxor            %f2, %f8, %f2
 168         ldd             [$key + 32], %f6        ! round[2]
 169         ldd             [$key + 40], %f8
 170         add             $key, 32, $key
 171         sub             $rounds, 4, $rounds
 172
 173 .Loop_dec:
 174         fmovd           %f0, %f4
 175         faesdecx        %f2, %f10, %f0
 176         faesdecx        %f4, %f12, %f2
 177         ldd             [$key + 16], %f10
 178         ldd             [$key + 24], %f12
 179         add             $key, 32, $key
 180
 181         fmovd           %f0, %f4
 182         faesdecx        %f2, %f6, %f0
 183         faesdecx        %f4, %f8, %f2
 184         ldd             [$key +  0], %f6
 185         ldd             [$key +  8], %f8
 186
 187         brnz,a          $rounds, .Loop_dec
 188         sub             $rounds, 2, $rounds
 189
 190         andcc           $out, 7, $tmp           ! is output aligned?
 191         andn            $out, 7, $out
 192         mov             0xff, $mask
 193         srl             $mask, $tmp, $mask
 194         add             %o7, 64, %o7
 195         sll             $tmp, 3, $tmp
 196
 197         fmovd           %f0, %f4
 198         faesdecx        %f2, %f10, %f0
 199         faesdecx        %f4, %f12, %f2
 200         ldd             [%o7 + $tmp], %f14      ! shift right params
 201
 202         fmovd           %f0, %f4
 203         faesdeclx       %f2, %f6, %f0
 204         faesdeclx       %f4, %f8, %f2
 205
 206         bnz,pn          %icc, .Ldec_out_unaligned
 207         mov             %g1, %o7
 208
 209         std             %f0, [$out + 0]
 210         retl
 211         std             %f2, [$out + 8]
 212
 213 .align  16
 214 .Ldec_out_unaligned:
 215         add             $out, 16, $inp
 216         orn             %g0, $mask, $tmp
 217         fshiftorx       %f0, %f0, %f14, %f4
 218         fshiftorx       %f0, %f2, %f14, %f6
 219         fshiftorx       %f2, %f2, %f14, %f8
 220
 221         stda            %f4, [$out + $mask]0xc0 ! partial store
 222         std             %f6, [$out + 8]
 223         stda            %f8, [$inp + $tmp]0xc0  ! partial store
 224         retl
 225         nop
 226 .type   aes_fx_decrypt,#function
 227 .size   aes_fx_decrypt,.-aes_fx_decrypt
 228 ___
 229 }
 230 {
     # Key schedule setup.  aes_fx_set_decrypt_key only sets $inc to -1
     # and branches into the body of aes_fx_set_encrypt_key
     # (.Lset_encrypt_key), which uses $inc = 1.
     #
     # Five names are bound to %o0..%o4; the sixth register produced by
     # the map (%o5) is simply dropped.  $inp, $bits and $out are read
     # without being set first, so they act as the incoming arguments
     # (user key, key size in bits, key schedule).
     #
     # $inc is first masked with rounds*16, giving 0 for encrypt or
     # rounds*16 for decrypt, to pick the start or the end of the
     # schedule; it is then shifted left by 4 to become the +16/-16 step.
     # The decrypt schedule is therefore produced by the same expansion
     # sequence with the 16-byte slots written in reverse order.
     #
     # Text between backticks is not evaluated by the here-documents
     # themselves, only $i is interpolated.  Presumably a later pass
     # (outside this view) evaluates those expressions.
 231 my ($inp,$bits,$out,$tmp,$inc) = map("%o$_",(0..5));
 232 $code.=<<___;
 233 .globl  aes_fx_set_decrypt_key
 234 .align  32
 235 aes_fx_set_decrypt_key:
 236         b               .Lset_encrypt_key
 237         mov             -1, $inc
 238         retl
 239         nop
 240 .type   aes_fx_set_decrypt_key,#function
 241 .size   aes_fx_set_decrypt_key,.-aes_fx_set_decrypt_key
 242
 243 .globl  aes_fx_set_encrypt_key
 244 .align  32
 245 aes_fx_set_encrypt_key:
 246         mov             1, $inc
 247         nop
 248 .Lset_encrypt_key:
 249         and             $inp, 7, $tmp
 250         andn            $inp, 7, $inp
 251         sll             $tmp, 3, $tmp
 252         mov             %o7, %g1
 253
 254 1:      call            .+8
 255         add             %o7, .Linp_align-1b, %o7
 256
 257         ldd             [%o7 + $tmp], %f10      ! shift left params
 258         mov             %g1, %o7
 259
 260         cmp             $bits, 192
 261         ldd             [$inp + 0], %f0
 262         bl,pt           %icc, .L128
 263         ldd             [$inp + 8], %f2
 264
 265         be,pt           %icc, .L192
 266         ldd             [$inp + 16], %f4
 267         brz,pt          $tmp, .L256aligned
 268         ldd             [$inp + 24], %f6
 269
 270         ldd             [$inp + 32], %f8
 271         fshiftorx       %f0, %f2, %f10, %f0
 272         fshiftorx       %f2, %f4, %f10, %f2
 273         fshiftorx       %f4, %f6, %f10, %f4
 274         fshiftorx       %f6, %f8, %f10, %f6
 275
 276 .L256aligned:
 277         mov             14, $bits
 278         and             $inc, `14*16`, $tmp
 279         st              $bits, [$out + 240]     ! store rounds
 280         add             $out, $tmp, $out        ! start or end of key schedule
 281         sllx            $inc, 4, $inc           ! 16 or -16
 282 ___
     # 256-bit key: six unrolled passes, each storing two 16-byte slots.
     # The immediate of the first faeskeyx in each pass advances with $i.
     # $i is a package variable (no "my"), so it stays visible after the
     # loop.
 283 for ($i=0; $i<6; $i++) {
 284     $code.=<<___;
 285         std             %f0, [$out + 0]
 286         faeskeyx        %f6, `0x10+$i`, %f0
 287         std             %f2, [$out + 8]
 288         add             $out, $inc, $out
 289         faeskeyx        %f0, 0x00, %f2
 290         std             %f4, [$out + 0]
 291         faeskeyx        %f2, 0x01, %f4
 292         std             %f6, [$out + 8]
 293         add             $out, $inc, $out
 294         faeskeyx        %f4, 0x00, %f6
 295 ___
 296 }
     # Tail of the 256-bit case.  $i is still 6 here, so `0x10+$i`
     # continues the sequence started in the loop above.  The last three
     # slots are stored and 0 is returned in %o0.
 297 $code.=<<___;
 298         std             %f0, [$out + 0]
 299         faeskeyx        %f6, `0x10+$i`, %f0
 300         std             %f2, [$out + 8]
 301         add             $out, $inc, $out
 302         faeskeyx        %f0, 0x00, %f2
 303         std             %f4,[$out + 0]
 304         std             %f6,[$out + 8]
 305         add             $out, $inc, $out
 306         std             %f0,[$out + 0]
 307         std             %f2,[$out + 8]
 308         retl
 309         xor             %o0, %o0, %o0           ! return 0
 310
 311 .align  16
 312 .L192:
 313         brz,pt          $tmp, .L192aligned
 314         nop
 315
 316         ldd             [$inp + 24], %f6
 317         fshiftorx       %f0, %f2, %f10, %f0
 318         fshiftorx       %f2, %f4, %f10, %f2
 319         fshiftorx       %f4, %f6, %f10, %f4
 320
 321 .L192aligned:
 322         mov             12, $bits
 323         and             $inc, `12*16`, $tmp
 324         st              $bits, [$out + 240]     ! store rounds
 325         add             $out, $tmp, $out        ! start or end of key schedule
 326         sllx            $inc, 4, $inc           ! 16 or -16
 327 ___
     # 192-bit key: $i takes 0, 2, 4, 6.  Each pass stores three 16-byte
     # slots and uses the immediates 0x10+$i and 0x10+$i+1.
 328 for ($i=0; $i<8; $i+=2) {
 329     $code.=<<___;
 330         std             %f0, [$out + 0]
 331         faeskeyx        %f4, `0x10+$i`, %f0
 332         std             %f2, [$out + 8]
 333         add             $out, $inc, $out
 334         faeskeyx        %f0, 0x00, %f2
 335         std             %f4, [$out + 0]
 336         faeskeyx        %f2, 0x00, %f4
 337         std             %f0, [$out + 8]
 338         add             $out, $inc, $out
 339         faeskeyx        %f4, `0x10+$i+1`, %f0
 340         std             %f2, [$out + 0]
 341         faeskeyx        %f0, 0x00, %f2
 342         std             %f4, [$out + 8]
 343         add             $out, $inc, $out
 344 ___
     # The extra faeskeyx producing the next %f4 is emitted on every pass
     # except the last one ($i == 6).
 345 $code.=<<___            if ($i<6);
 346         faeskeyx        %f2, 0x00, %f4
 347 ___
 348 }
     # Tail of the 192-bit case: store the final slot and return 0,
     # followed by the entry code of the 128-bit case.
 349 $code.=<<___;
 350         std             %f0, [$out + 0]
 351         std             %f2, [$out + 8]
 352         retl
 353         xor             %o0, %o0, %o0           ! return 0
 354
 355 .align  16
 356 .L128:
 357         brz,pt          $tmp, .L128aligned
 358         nop
 359
 360         ldd             [$inp + 16], %f4
 361         fshiftorx       %f0, %f2, %f10, %f0
 362         fshiftorx       %f2, %f4, %f10, %f2
 363
 364 .L128aligned:
 365         mov             10, $bits
 366         and             $inc, `10*16`, $tmp
 367         st              $bits, [$out + 240]     ! store rounds
 368         add             $out, $tmp, $out        ! start or end of key schedule
 369         sllx            $inc, 4, $inc           ! 16 or -16
 370 ___
     # 128-bit key: ten passes, each storing one 16-byte slot and
     # deriving the next one.
 371 for ($i=0; $i<10; $i++) {
 372     $code.=<<___;
 373         std             %f0, [$out + 0]
 374         faeskeyx        %f2, `0x10+$i`, %f0
 375         std             %f2, [$out + 8]
 376         add             $out, $inc, $out
 377         faeskeyx        %f0, 0x00, %f2
 378 ___
 379 }
     # Tail of the 128-bit case: store the final slot and return 0.
 380 $code.=<<___;
 381         std             %f0, [$out + 0]
 382         std             %f2, [$out + 8]
 383         retl
 384         xor             %o0, %o0, %o0           ! return 0
 385 .type   aes_fx_set_encrypt_key,#function
 386 .size   aes_fx_set_encrypt_key,.-aes_fx_set_encrypt_key
 387 ___
 388 }
 389 {
     # CBC mode: the single routine aes_fx_cbc_encrypt handles both
     # directions.
     #
     # The routine opens a register window with "save", so its arguments
     # are the %i registers named below.  $len goes through "srln ..., 4"
     # on entry; srln is not defined in this view (presumably a
     # shift-right macro from sparc_arch.h), which would turn a byte
     # length into a count of 16-byte blocks.  A zero count returns
     # immediately, and $dir == 0 selects the .Lcbc_decrypt path.
 390 my ($inp,$out,$len,$key,$ivp,$dir) = map("%i$_",(0..5));
     # Seven names for %l0..%l6; the eighth register from the map (%l7)
     # is dropped.
 391 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));
     # Even-numbered FP registers starting at %f16, assigned in order:
     # $iv0=%f16 $iv1=%f18 $r0hi=%f20 $r0lo=%f22 $rlhi=%f24 $rllo=%f26
     # $in0=%f28 $in1=%f30 $intail=%f32 $outhead=%f34 $fshift=%f36.
 392 my ($iv0,$iv1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
 393    = map("%f$_",grep { !($_ & 1) } (16 .. 62));
     # $ileft/$iright are second names for the $ialign/$oalign registers,
     # not new ones.  The unaligned-output paths read $oalign (to build
     # $mask) before overwriting the same register through $iright.
 394 my ($ileft,$iright) = ($ialign,$oalign);
 395
     # The 16 bytes reserved beyond STACK_FRAME by "save" are the scratch
     # area at %sp + LOCALS, which the unaligned-output loops use to move
     # realigned input from integer registers into FP registers.
 396 $code.=<<___;
 397 .globl  aes_fx_cbc_encrypt
 398 .align  32
 399 aes_fx_cbc_encrypt:
 400         save            %sp, -STACK_FRAME-16, %sp
 401         srln            $len, 4, $len
 402         and             $inp, 7, $ialign
 403         andn            $inp, 7, $inp
 404         brz,pn          $len, .Lcbc_no_data
 405         sll             $ialign, 3, $ileft
 406
 407 1:      call            .+8
 408         add             %o7, .Linp_align-1b, %o7
 409
 410         ld              [$key + 240], $rounds
 411         and             $out, 7, $oalign
 412         ld              [$ivp + 0], %f0         ! load ivec
 413         andn            $out, 7, $out
 414         ld              [$ivp + 4], %f1
 415         sll             $oalign, 3, $mask
 416         ld              [$ivp + 8], %f2
 417         ld              [$ivp + 12], %f3
 418
 419         sll             $rounds, 4, $rounds
 420         add             $rounds, $key, $end
 421         ldd             [$key + 0], $r0hi       ! round[0]
 422         ldd             [$key + 8], $r0lo
 423
 424         add             $inp, 16, $inp
 425         sub             $len,  1, $len
 426         ldd             [$end + 0], $rlhi       ! round[last]
 427         ldd             [$end + 8], $rllo
 428
 429         mov             16, $inc
 430         movrz           $len, 0, $inc
 431         ldd             [$key + 16], %f10       ! round[1]
 432         ldd             [$key + 24], %f12
 433
 434         ldd             [%o7 + $ileft], $fshift ! shift left params
 435         add             %o7, 64, %o7
 436         ldd             [$inp - 16], $in0       ! load input
 437         ldd             [$inp -  8], $in1
 438         ldda            [$inp]0x82, $intail     ! non-faulting load
 439         brz             $dir, .Lcbc_decrypt
 440         add             $inp, $inc, $inp        ! inp+=16
 441
 442         fxor            $r0hi, %f0, %f0         ! ivec^=round[0]
 443         fxor            $r0lo, %f2, %f2
 444         fshiftorx       $in0, $in1, $fshift, $in0
 445         fshiftorx       $in1, $intail, $fshift, $in1
 446         nop
 447
 448 .Loop_cbc_enc:
 449         fxor            $in0, %f0, %f0          ! inp^ivec^round[0]
 450         fxor            $in1, %f2, %f2
 451         ldd             [$key + 32], %f6        ! round[2]
 452         ldd             [$key + 40], %f8
 453         add             $key, 32, $end
 454         sub             $rounds, 16*6, $inner
 455
 456 .Lcbc_enc:
 457         fmovd           %f0, %f4
 458         faesencx        %f2, %f10, %f0
 459         faesencx        %f4, %f12, %f2
 460         ldd             [$end + 16], %f10
 461         ldd             [$end + 24], %f12
 462         add             $end, 32, $end
 463
 464         fmovd           %f0, %f4
 465         faesencx        %f2, %f6, %f0
 466         faesencx        %f4, %f8, %f2
 467         ldd             [$end + 0], %f6
 468         ldd             [$end + 8], %f8
 469
 470         brnz,a          $inner, .Lcbc_enc
 471         sub             $inner, 16*2, $inner
 472
 473         fmovd           %f0, %f4
 474         faesencx        %f2, %f10, %f0
 475         faesencx        %f4, %f12, %f2
 476         ldd             [$end + 16], %f10       ! round[last-1]
 477         ldd             [$end + 24], %f12
 478
 479         movrz           $len, 0, $inc
 480         fmovd           $intail, $in0
 481         ldd             [$inp - 8], $in1        ! load next input block
 482         ldda            [$inp]0x82, $intail     ! non-faulting load
 483         add             $inp, $inc, $inp        ! inp+=16
 484
 485         fmovd           %f0, %f4
 486         faesencx        %f2, %f6, %f0
 487         faesencx        %f4, %f8, %f2
 488
 489         fshiftorx       $in0, $in1, $fshift, $in0
 490         fshiftorx       $in1, $intail, $fshift, $in1
 491
 492         fmovd           %f0, %f4
 493         faesencx        %f2, %f10, %f0
 494         faesencx        %f4, %f12, %f2
 495         ldd             [$key + 16], %f10       ! round[1]
 496         ldd             [$key + 24], %f12
 497
 498         fxor            $r0hi, $in0, $in0       ! inp^=round[0]
 499         fxor            $r0lo, $in1, $in1
 500
 501         fmovd           %f0, %f4
 502         faesenclx       %f2, $rlhi, %f0
 503         faesenclx       %f4, $rllo, %f2
 504
 505         brnz,pn         $oalign, .Lcbc_enc_unaligned_out
 506         nop
 507
 508         std             %f0, [$out + 0]
 509         std             %f2, [$out + 8]
 510         add             $out, 16, $out
 511
 512         brnz,a          $len, .Loop_cbc_enc
 513         sub             $len, 1, $len
 514
 515         st              %f0, [$ivp + 0]         ! output ivec
 516         st              %f1, [$ivp + 4]
 517         st              %f2, [$ivp + 8]
 518         st              %f3, [$ivp + 12]
 519
 520 .Lcbc_no_data:
 521         ret
 522         restore
 523
 524 .align  32
 525 .Lcbc_enc_unaligned_out:
 526         ldd             [%o7 + $mask], $fshift  ! shift right params
 527         mov             0xff, $mask
 528         srl             $mask, $oalign, $mask
 529         sub             %g0, $ileft, $iright
 530
 531         fshiftorx       %f0, %f0, $fshift, %f6
 532         fshiftorx       %f0, %f2, $fshift, %f8
 533
 534         stda            %f6, [$out + $mask]0xc0 ! partial store
 535         orn             %g0, $mask, $mask
 536         std             %f8, [$out + 8]
 537         add             $out, 16, $out
 538         brz             $len, .Lcbc_enc_unaligned_out_done
 539         sub             $len, 1, $len
 540         b               .Loop_cbc_enc_unaligned_out
 541         nop
 542
 543 .align  32
 544 .Loop_cbc_enc_unaligned_out:
 545         fmovd           %f2, $outhead
 546         fxor            $in0, %f0, %f0          ! inp^ivec^round[0]
 547         fxor            $in1, %f2, %f2
 548         ldd             [$key + 32], %f6        ! round[2]
 549         ldd             [$key + 40], %f8
 550
 551         fmovd           %f0, %f4
 552         faesencx        %f2, %f10, %f0
 553         faesencx        %f4, %f12, %f2
 554         ldd             [$key + 48], %f10       ! round[3]
 555         ldd             [$key + 56], %f12
 556
 557         ldx             [$inp - 16], %o0
 558         ldx             [$inp -  8], %o1
 559         brz             $ileft, .Lcbc_enc_aligned_inp
 560         movrz           $len, 0, $inc
 561
 562         ldx             [$inp], %o2
 563         sllx            %o0, $ileft, %o0
 564         srlx            %o1, $iright, %g1
 565         sllx            %o1, $ileft, %o1
 566         or              %g1, %o0, %o0
 567         srlx            %o2, $iright, %o2
 568         or              %o2, %o1, %o1
 569
 570 .Lcbc_enc_aligned_inp:
 571         fmovd           %f0, %f4
 572         faesencx        %f2, %f6, %f0
 573         faesencx        %f4, %f8, %f2
 574         ldd             [$key + 64], %f6        ! round[4]
 575         ldd             [$key + 72], %f8
 576         add             $key, 64, $end
 577         sub             $rounds, 16*8, $inner
 578
 579         stx             %o0, [%sp + LOCALS + 0]
 580         stx             %o1, [%sp + LOCALS + 8]
 581         add             $inp, $inc, $inp        ! inp+=16
 582         nop
 583
 584 .Lcbc_enc_unaligned:
 585         fmovd           %f0, %f4
 586         faesencx        %f2, %f10, %f0
 587         faesencx        %f4, %f12, %f2
 588         ldd             [$end + 16], %f10
 589         ldd             [$end + 24], %f12
 590         add             $end, 32, $end
 591
 592         fmovd           %f0, %f4
 593         faesencx        %f2, %f6, %f0
 594         faesencx        %f4, %f8, %f2
 595         ldd             [$end + 0], %f6
 596         ldd             [$end + 8], %f8
 597
 598         brnz,a          $inner, .Lcbc_enc_unaligned
 599         sub             $inner, 16*2, $inner
 600
 601         fmovd           %f0, %f4
 602         faesencx        %f2, %f10, %f0
 603         faesencx        %f4, %f12, %f2
 604         ldd             [$end + 16], %f10       ! round[last-1]
 605         ldd             [$end + 24], %f12
 606
 607         fmovd           %f0, %f4
 608         faesencx        %f2, %f6, %f0
 609         faesencx        %f4, %f8, %f2
 610
 611         ldd             [%sp + LOCALS + 0], $in0
 612         ldd             [%sp + LOCALS + 8], $in1
 613
 614         fmovd           %f0, %f4
 615         faesencx        %f2, %f10, %f0
 616         faesencx        %f4, %f12, %f2
 617         ldd             [$key + 16], %f10       ! round[1]
 618         ldd             [$key + 24], %f12
 619
 620         fxor            $r0hi, $in0, $in0       ! inp^=round[0]
 621         fxor            $r0lo, $in1, $in1
 622
 623         fmovd           %f0, %f4
 624         faesenclx       %f2, $rlhi, %f0
 625         faesenclx       %f4, $rllo, %f2
 626
 627         fshiftorx       $outhead, %f0, $fshift, %f6
 628         fshiftorx       %f0, %f2, $fshift, %f8
 629         std             %f6, [$out + 0]
 630         std             %f8, [$out + 8]
 631         add             $out, 16, $out
 632
 633         brnz,a          $len, .Loop_cbc_enc_unaligned_out
 634         sub             $len, 1, $len
 635
 636 .Lcbc_enc_unaligned_out_done:
 637         fshiftorx       %f2, %f2, $fshift, %f8
 638         stda            %f8, [$out + $mask]0xc0 ! partial store
 639
 640         st              %f0, [$ivp + 0]         ! output ivec
 641         st              %f1, [$ivp + 4]
 642         st              %f2, [$ivp + 8]
 643         st              %f3, [$ivp + 12]
 644
 645         ret
 646         restore
 647
 648 .align  32
 649 .Lcbc_decrypt:
 650         fshiftorx       $in0, $in1, $fshift, $in0
 651         fshiftorx       $in1, $intail, $fshift, $in1
 652         fmovd           %f0, $iv0
 653         fmovd           %f2, $iv1
 654
 655 .Loop_cbc_dec:
 656         fxor            $in0, $r0hi, %f0        ! inp^round[0]
 657         fxor            $in1, $r0lo, %f2
 658         ldd             [$key + 32], %f6        ! round[2]
 659         ldd             [$key + 40], %f8
 660         add             $key, 32, $end
 661         sub             $rounds, 16*6, $inner
 662
 663 .Lcbc_dec:
 664         fmovd           %f0, %f4
 665         faesdecx        %f2, %f10, %f0
 666         faesdecx        %f4, %f12, %f2
 667         ldd             [$end + 16], %f10
 668         ldd             [$end + 24], %f12
 669         add             $end, 32, $end
 670
 671         fmovd           %f0, %f4
 672         faesdecx        %f2, %f6, %f0
 673         faesdecx        %f4, %f8, %f2
 674         ldd             [$end + 0], %f6
 675         ldd             [$end + 8], %f8
 676
 677         brnz,a          $inner, .Lcbc_dec
 678         sub             $inner, 16*2, $inner
 679
 680         fmovd           %f0, %f4
 681         faesdecx        %f2, %f10, %f0
 682         faesdecx        %f4, %f12, %f2
 683         ldd             [$end + 16], %f10       ! round[last-1]
 684         ldd             [$end + 24], %f12
 685
 686         fmovd           %f0, %f4
 687         faesdecx        %f2, %f6, %f0
 688         faesdecx        %f4, %f8, %f2
 689         fxor            $iv0, $rlhi, %f6        ! ivec^round[last]
 690         fxor            $iv1, $rllo, %f8
 691         fmovd           $in0, $iv0
 692         fmovd           $in1, $iv1
 693
 694         movrz           $len, 0, $inc
 695         fmovd           $intail, $in0
 696         ldd             [$inp - 8], $in1        ! load next input block
 697         ldda            [$inp]0x82, $intail     ! non-faulting load
 698         add             $inp, $inc, $inp        ! inp+=16
 699
 700         fmovd           %f0, %f4
 701         faesdecx        %f2, %f10, %f0
 702         faesdecx        %f4, %f12, %f2
 703         ldd             [$key + 16], %f10       ! round[1]
 704         ldd             [$key + 24], %f12
 705
 706         fshiftorx       $in0, $in1, $fshift, $in0
 707         fshiftorx       $in1, $intail, $fshift, $in1
 708
 709         fmovd           %f0, %f4
 710         faesdeclx       %f2, %f6, %f0
 711         faesdeclx       %f4, %f8, %f2
 712
 713         brnz,pn         $oalign, .Lcbc_dec_unaligned_out
 714         nop
 715
 716         std             %f0, [$out + 0]
 717         std             %f2, [$out + 8]
 718         add             $out, 16, $out
 719
 720         brnz,a          $len, .Loop_cbc_dec
 721         sub             $len, 1, $len
 722
 723         st              $iv0,    [$ivp + 0]     ! output ivec
 724         st              $iv0#lo, [$ivp + 4]
 725         st              $iv1,    [$ivp + 8]
 726         st              $iv1#lo, [$ivp + 12]
 727
 728         ret
 729         restore
 730
 731 .align  32
 732 .Lcbc_dec_unaligned_out:
 733         ldd             [%o7 + $mask], $fshift  ! shift right params
 734         mov             0xff, $mask
 735         srl             $mask, $oalign, $mask
 736         sub             %g0, $ileft, $iright
 737
 738         fshiftorx       %f0, %f0, $fshift, %f6
 739         fshiftorx       %f0, %f2, $fshift, %f8
 740
 741         stda            %f6, [$out + $mask]0xc0 ! partial store
 742         orn             %g0, $mask, $mask
 743         std             %f8, [$out + 8]
 744         add             $out, 16, $out
 745         brz             $len, .Lcbc_dec_unaligned_out_done
 746         sub             $len, 1, $len
 747         b               .Loop_cbc_dec_unaligned_out
 748         nop
 749
 750 .align  32
 751 .Loop_cbc_dec_unaligned_out:
 752         fmovd           %f2, $outhead
 753         fxor            $in0, $r0hi, %f0        ! inp^round[0]
 754         fxor            $in1, $r0lo, %f2
 755         ldd             [$key + 32], %f6        ! round[2]
 756         ldd             [$key + 40], %f8
 757
 758         fmovd           %f0, %f4
 759         faesdecx        %f2, %f10, %f0
 760         faesdecx        %f4, %f12, %f2
 761         ldd             [$key + 48], %f10       ! round[3]
 762         ldd             [$key + 56], %f12
 763
 764         ldx             [$inp - 16], %o0
 765         ldx             [$inp - 8], %o1
 766         brz             $ileft, .Lcbc_dec_aligned_inp
 767         movrz           $len, 0, $inc
 768
 769         ldx             [$inp], %o2
 770         sllx            %o0, $ileft, %o0
 771         srlx            %o1, $iright, %g1
 772         sllx            %o1, $ileft, %o1
 773         or              %g1, %o0, %o0
 774         srlx            %o2, $iright, %o2
 775         or              %o2, %o1, %o1
 776
 777 .Lcbc_dec_aligned_inp:
 778         fmovd           %f0, %f4
 779         faesdecx        %f2, %f6, %f0
 780         faesdecx        %f4, %f8, %f2
 781         ldd             [$key + 64], %f6        ! round[4]
 782         ldd             [$key + 72], %f8
 783         add             $key, 64, $end
 784         sub             $rounds, 16*8, $inner
 785
 786         stx             %o0, [%sp + LOCALS + 0]
 787         stx             %o1, [%sp + LOCALS + 8]
 788         add             $inp, $inc, $inp        ! inp+=16
 789         nop
 790
 791 .Lcbc_dec_unaligned:
 792         fmovd           %f0, %f4
 793         faesdecx        %f2, %f10, %f0
 794         faesdecx        %f4, %f12, %f2
 795         ldd             [$end + 16], %f10
 796         ldd             [$end + 24], %f12
 797         add             $end, 32, $end
 798
 799         fmovd           %f0, %f4
 800         faesdecx        %f2, %f6, %f0
 801         faesdecx        %f4, %f8, %f2
 802         ldd             [$end + 0], %f6
 803         ldd             [$end + 8], %f8
 804
 805         brnz,a          $inner, .Lcbc_dec_unaligned
 806         sub             $inner, 16*2, $inner
 807
 808         fmovd           %f0, %f4
 809         faesdecx        %f2, %f10, %f0
 810         faesdecx        %f4, %f12, %f2
 811         ldd             [$end + 16], %f10       ! round[last-1]
 812         ldd             [$end + 24], %f12
 813
 814         fmovd           %f0, %f4
 815         faesdecx        %f2, %f6, %f0
 816         faesdecx        %f4, %f8, %f2
 817
 818         fxor            $iv0, $rlhi, %f6        ! ivec^round[last]
 819         fxor            $iv1, $rllo, %f8
 820         fmovd           $in0, $iv0
 821         fmovd           $in1, $iv1
 822         ldd             [%sp + LOCALS + 0], $in0
 823         ldd             [%sp + LOCALS + 8], $in1
 824
 825         fmovd           %f0, %f4
 826         faesdecx        %f2, %f10, %f0
 827         faesdecx        %f4, %f12, %f2
 828         ldd             [$key + 16], %f10       ! round[1]
 829         ldd             [$key + 24], %f12
 830
 831         fmovd           %f0, %f4
 832         faesdeclx       %f2, %f6, %f0
 833         faesdeclx       %f4, %f8, %f2
 834
 835         fshiftorx       $outhead, %f0, $fshift, %f6
 836         fshiftorx       %f0, %f2, $fshift, %f8
 837         std             %f6, [$out + 0]
 838         std             %f8, [$out + 8]
 839         add             $out, 16, $out
 840
 841         brnz,a          $len, .Loop_cbc_dec_unaligned_out
 842         sub             $len, 1, $len
 843
 844 .Lcbc_dec_unaligned_out_done:
 845         fshiftorx       %f2, %f2, $fshift, %f8
 846         stda            %f8, [$out + $mask]0xc0 ! partial store
 847
 848         st              $iv0,    [$ivp + 0]     ! output ivec
 849         st              $iv0#lo, [$ivp + 4]
 850         st              $iv1,    [$ivp + 8]
 851         st              $iv1#lo, [$ivp + 12]
 852
 853         ret
 854         restore
 855 .type   aes_fx_cbc_encrypt,#function
 856 .size   aes_fx_cbc_encrypt,.-aes_fx_cbc_encrypt
 857 ___
 858 }
 859 {
     # Generator block for aes_fx_ctr32_encrypt_blocks. The heredoc below is
     # appended to $code with the register names declared here interpolated
     # into it; it also emits the .Linp_align/.Lout_align/.Lone data tables
     # and the identification string that follow the function.
 860 my ($inp,$out,$len,$key,$ivp) = map("%i$_",(0..5));	# bound to %i0..%i4; the sixth mapped name (%i5) is dropped
 861 my ($rounds,$inner,$end,$inc,$ialign,$oalign,$mask) = map("%l$_",(0..7));	# bound to %l0..%l6; %l7 is dropped
 862 my ($ctr0,$ctr1,$r0hi,$r0lo,$rlhi,$rllo,$in0,$in1,$intail,$outhead,$fshift)
 863    = map("%f$_",grep { !($_ & 1) } (16 .. 62));	# even-numbered %f16..%f36, in the order listed
     # $ileft reuses %l4 ($ialign) and $iright reuses %l5 ($oalign); the
     # "sub %g0, $ileft, $iright" in .Lctr32_unaligned_out therefore
     # overwrites $oalign.
 864 my ($ileft,$iright) = ($ialign, $oalign);
 865 my $one = "%f14";	# loaded with "ldd [%o7 + 128]" below and used as the fpadd32 operand that increments $ctr1
 866
 867 $code.=<<___;
 868 .globl  aes_fx_ctr32_encrypt_blocks
 869 .align  32
 870 aes_fx_ctr32_encrypt_blocks:
 871         save            %sp, -STACK_FRAME-16, %sp
 872         srln            $len, 0, $len
 873         and             $inp, 7, $ialign
 874         andn            $inp, 7, $inp
 875         brz,pn          $len, .Lctr32_no_data
 876         sll             $ialign, 3, $ileft
 877
 878 .Lpic:  call            .+8
 879         add             %o7, .Linp_align - .Lpic, %o7
 880
 881         ld              [$key + 240], $rounds
 882         and             $out, 7, $oalign
 883         ld              [$ivp +  0], $ctr0      ! load counter
 884         andn            $out, 7, $out
 885         ld              [$ivp +  4], $ctr0#lo
 886         sll             $oalign, 3, $mask
 887         ld              [$ivp +  8], $ctr1
 888         ld              [$ivp + 12], $ctr1#lo
 889         ldd             [%o7 + 128], $one
 890
 891         sll             $rounds, 4, $rounds
 892         add             $rounds, $key, $end
 893         ldd             [$key + 0], $r0hi       ! round[0]
 894         ldd             [$key + 8], $r0lo
 895
 896         add             $inp, 16, $inp
 897         sub             $len, 1, $len
 898         ldd             [$key + 16], %f10       ! round[1]
 899         ldd             [$key + 24], %f12
 900
 901         mov             16, $inc
 902         movrz           $len, 0, $inc
 903         ldd             [$end + 0], $rlhi       ! round[last]
 904         ldd             [$end + 8], $rllo
 905
 906         ldd             [%o7 + $ileft], $fshift ! shiftleft params
 907         add             %o7, 64, %o7
 908         ldd             [$inp - 16], $in0       ! load input
 909         ldd             [$inp -  8], $in1
 910         ldda            [$inp]0x82, $intail     ! non-faulting load
 911         add             $inp, $inc, $inp        ! inp+=16
 912
 913         fshiftorx       $in0, $in1, $fshift, $in0
 914         fshiftorx       $in1, $intail, $fshift, $in1
 915
 916 .Loop_ctr32:
 917         fxor            $ctr0, $r0hi, %f0       ! counter^round[0]
 918         fxor            $ctr1, $r0lo, %f2
 919         ldd             [$key + 32], %f6        ! round[2]
 920         ldd             [$key + 40], %f8
 921         add             $key, 32, $end
 922         sub             $rounds, 16*6, $inner
 923
 924 .Lctr32_enc:
 925         fmovd           %f0, %f4
 926         faesencx        %f2, %f10, %f0
 927         faesencx        %f4, %f12, %f2
 928         ldd             [$end + 16], %f10
 929         ldd             [$end + 24], %f12
 930         add             $end, 32, $end
 931
 932         fmovd           %f0, %f4
 933         faesencx        %f2, %f6, %f0
 934         faesencx        %f4, %f8, %f2
 935         ldd             [$end + 0], %f6
 936         ldd             [$end + 8], %f8
 937
 938         brnz,a          $inner, .Lctr32_enc
 939         sub             $inner, 16*2, $inner
 940
 941         fmovd           %f0, %f4
 942         faesencx        %f2, %f10, %f0
 943         faesencx        %f4, %f12, %f2
 944         ldd             [$end + 16], %f10       ! round[last-1]
 945         ldd             [$end + 24], %f12
 946
 947         fmovd           %f0, %f4
 948         faesencx        %f2, %f6, %f0
 949         faesencx        %f4, %f8, %f2
 950         fxor            $in0, $rlhi, %f6        ! inp^round[last]
 951         fxor            $in1, $rllo, %f8
 952
 953         movrz           $len, 0, $inc
 954         fmovd           $intail, $in0
 955         ldd             [$inp - 8], $in1        ! load next input block
 956         ldda            [$inp]0x82, $intail     ! non-faulting load
 957         add             $inp, $inc, $inp        ! inp+=16
 958
 959         fmovd           %f0, %f4
 960         faesencx        %f2, %f10, %f0
 961         faesencx        %f4, %f12, %f2
 962         ldd             [$key + 16], %f10       ! round[1]
 963         ldd             [$key + 24], %f12
 964
 965         fshiftorx       $in0, $in1, $fshift, $in0
 966         fshiftorx       $in1, $intail, $fshift, $in1
 967         fpadd32         $ctr1, $one, $ctr1      ! increment counter
 968
 969         fmovd           %f0, %f4
 970         faesenclx       %f2, %f6, %f0
 971         faesenclx       %f4, %f8, %f2
 972
 973         brnz,pn         $oalign, .Lctr32_unaligned_out
 974         nop
 975
 976         std             %f0, [$out + 0]
 977         std             %f2, [$out + 8]
 978         add             $out, 16, $out
 979
 980         brnz,a          $len, .Loop_ctr32
 981         sub             $len, 1, $len
 982
 983 .Lctr32_no_data:
 984         ret
 985         restore
 986
 987 .align  32
 988 .Lctr32_unaligned_out:
 989         ldd             [%o7 + $mask], $fshift  ! shift right params
 990         mov             0xff, $mask
 991         srl             $mask, $oalign, $mask
 992         sub             %g0, $ileft, $iright
 993
 994         fshiftorx       %f0, %f0, $fshift, %f6
 995         fshiftorx       %f0, %f2, $fshift, %f8
 996
 997         stda            %f6, [$out + $mask]0xc0 ! partial store
 998         orn             %g0, $mask, $mask
 999         std             %f8, [$out + 8]
1000         add             $out, 16, $out
1001         brz             $len, .Lctr32_unaligned_out_done
1002         sub             $len, 1, $len
1003         b               .Loop_ctr32_unaligned_out
1004         nop
1005
1006 .align  32
1007 .Loop_ctr32_unaligned_out:
1008         fmovd           %f2, $outhead
1009         fxor            $ctr0, $r0hi, %f0       ! counter^round[0]
1010         fxor            $ctr1, $r0lo, %f2
1011         ldd             [$key + 32], %f6        ! round[2]
1012         ldd             [$key + 40], %f8
1013
1014         fmovd           %f0, %f4
1015         faesencx        %f2, %f10, %f0
1016         faesencx        %f4, %f12, %f2
1017         ldd             [$key + 48], %f10       ! round[3]
1018         ldd             [$key + 56], %f12
1019
1020         ldx             [$inp - 16], %o0
1021         ldx             [$inp -  8], %o1
1022         brz             $ileft, .Lctr32_aligned_inp
1023         movrz           $len, 0, $inc
1024
1025         ldx             [$inp], %o2
1026         sllx            %o0, $ileft, %o0
1027         srlx            %o1, $iright, %g1
1028         sllx            %o1, $ileft, %o1
1029         or              %g1, %o0, %o0
1030         srlx            %o2, $iright, %o2
1031         or              %o2, %o1, %o1
1032
1033 .Lctr32_aligned_inp:
1034         fmovd           %f0, %f4
1035         faesencx        %f2, %f6, %f0
1036         faesencx        %f4, %f8, %f2
1037         ldd             [$key + 64], %f6        ! round[4]
1038         ldd             [$key + 72], %f8
1039         add             $key, 64, $end
1040         sub             $rounds, 16*8, $inner
1041
1042         stx             %o0, [%sp + LOCALS + 0]
1043         stx             %o1, [%sp + LOCALS + 8]
1044         add             $inp, $inc, $inp        ! inp+=16
1045         nop
1046
1047 .Lctr32_enc_unaligned:
1048         fmovd           %f0, %f4
1049         faesencx        %f2, %f10, %f0
1050         faesencx        %f4, %f12, %f2
1051         ldd             [$end + 16], %f10
1052         ldd             [$end + 24], %f12
1053         add             $end, 32, $end
1054
1055         fmovd           %f0, %f4
1056         faesencx        %f2, %f6, %f0
1057         faesencx        %f4, %f8, %f2
1058         ldd             [$end + 0], %f6
1059         ldd             [$end + 8], %f8
1060
1061         brnz,a          $inner, .Lctr32_enc_unaligned
1062         sub             $inner, 16*2, $inner
1063
1064         fmovd           %f0, %f4
1065         faesencx        %f2, %f10, %f0
1066         faesencx        %f4, %f12, %f2
1067         ldd             [$end + 16], %f10       ! round[last-1]
1068         ldd             [$end + 24], %f12
1069         fpadd32         $ctr1, $one, $ctr1      ! increment counter
1070
1071         fmovd           %f0, %f4
1072         faesencx        %f2, %f6, %f0
1073         faesencx        %f4, %f8, %f2
1074         fxor            $in0, $rlhi, %f6        ! inp^round[last]
1075         fxor            $in1, $rllo, %f8
1076         ldd             [%sp + LOCALS + 0], $in0
1077         ldd             [%sp + LOCALS + 8], $in1
1078
1079         fmovd           %f0, %f4
1080         faesencx        %f2, %f10, %f0
1081         faesencx        %f4, %f12, %f2
1082         ldd             [$key + 16], %f10       ! round[1]
1083         ldd             [$key + 24], %f12
1084
1085         fmovd           %f0, %f4
1086         faesenclx       %f2, %f6, %f0
1087         faesenclx       %f4, %f8, %f2
1088
1089         fshiftorx       $outhead, %f0, $fshift, %f6
1090         fshiftorx       %f0, %f2, $fshift, %f8
1091         std             %f6, [$out + 0]
1092         std             %f8, [$out + 8]
1093         add             $out, 16, $out
1094
1095         brnz,a          $len, .Loop_ctr32_unaligned_out
1096         sub             $len, 1, $len
1097
1098 .Lctr32_unaligned_out_done:
1099         fshiftorx       %f2, %f2, $fshift, %f8
1100         stda            %f8, [$out + $mask]0xc0 ! partial store
1101
1102         ret
1103         restore
1104 .type   aes_fx_ctr32_encrypt_blocks,#function
1105 .size   aes_fx_ctr32_encrypt_blocks,.-aes_fx_ctr32_encrypt_blocks
1106
1107 .align  32
1108 .Linp_align:            ! fshiftorx parameters for left shift toward %rs1
1109         .byte   0, 0, 64,  0,   0, 64,  0, -64
1110         .byte   0, 0, 56,  8,   0, 56,  8, -56
1111         .byte   0, 0, 48, 16,   0, 48, 16, -48
1112         .byte   0, 0, 40, 24,   0, 40, 24, -40
1113         .byte   0, 0, 32, 32,   0, 32, 32, -32
1114         .byte   0, 0, 24, 40,   0, 24, 40, -24
1115         .byte   0, 0, 16, 48,   0, 16, 48, -16
1116         .byte   0, 0,  8, 56,   0,  8, 56, -8
1117 .Lout_align:            ! fshiftorx parameters for right shift toward %rs2
1118         .byte   0, 0,  0, 64,   0,  0, 64,   0
1119         .byte   0, 0,  8, 56,   0,  8, 56,  -8
1120         .byte   0, 0, 16, 48,   0, 16, 48, -16
1121         .byte   0, 0, 24, 40,   0, 24, 40, -24
1122         .byte   0, 0, 32, 32,   0, 32, 32, -32
1123         .byte   0, 0, 40, 24,   0, 40, 24, -40
1124         .byte   0, 0, 48, 16,   0, 48, 16, -48
1125         .byte   0, 0, 56,  8,   0, 56,  8, -56
1126 .Lone:
1127         .word   0, 1
1128 .asciz  "AES for Fujitsu SPARC64 X, CRYPTOGAMS by <appro\@openssl.org>"
1129 .align  4
1130 ___
1131 }
# The helpers below hand-assemble VIS instructions into raw .word
# directives. That way the module can be built without enabling VIS
# extensions on the compiler command line (e.g. -xarch=v9 rather than
# -xarch=v9a), which keeps open the option of a "universal" binary in
# which the program detects at run-time whether the CPU is VIS capable.
#
# unvis(mnemonic, rs1, rs2, rd)
#   Encodes a three-operand VIS instruction whose operands are all %f
#   registers. Returns a ".word 0x........ !<original text>" line, or
#   the plain "mnemonic\trs1,rs2,rd" text when the mnemonic is not in
#   the table or an operand cannot be encoded.
sub unvis {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_for = (
        "faligndata" => 0x048,
        "bshuffle"   => 0x04c,
        "fpadd32"    => 0x052,
        "fxor"       => 0x06c,
        "fsrc2"      => 0x078,
    );

    my $opf = $opf_for{$mnemonic};
    return $text unless $opf;

    my @field;
    for my $operand ($rs1, $rs2, $rd) {
        # anything that is not an %f register is left to the assembler
        return $text unless $operand =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            # odd-numbered registers from %f32 upwards are not encoded
            return $text if $num & 1;
            # re-encode for upper double register addressing:
            # bit 5 of the register number is folded into bit 0
            $num = ($num | $num >> 5) & 31;
        }
        push @field, $num;
    }
    my ($src1, $src2, $dst) = @field;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $dst << 25 | $src1 << 14 | $opf << 5 | $src2,
                   $text;
}
1166
# unvis3(mnemonic, rs1, rs2, rd)
#   Encodes alignaddr, bmask or alignaddrl with integer-register
#   operands (%gN, %oN, %lN, %iN) as a raw .word directive, keeping the
#   original instruction text as a trailing "!" comment. Any other
#   mnemonic, or an operand that is not one of those register forms,
#   yields the untouched "mnemonic\trs1,rs2,rd" text.
sub unvis3 {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_for = (
        "alignaddr"  => 0x018,
        "bmask"      => 0x019,
        "alignaddrl" => 0x01a,
    );
    # first register number of each integer register group
    my %base_of = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my $opf = $opf_for{$mnemonic};
    return $text unless $opf;

    my @regno;
    for my $operand ($rs1, $rs2, $rd) {
        return $text unless $operand =~ /%([goli])([0-9])/;
        push @regno, $base_of{$1} + $2;
    }
    my ($src1, $src2, $dst) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $dst << 25 | $src1 << 14 | $opf << 5 | $src2,
                   $text;
}
1190
# unfx(mnemonic, rs1, rs2, rd)
#   Encodes one of the faes*x instructions listed in the table below as
#   a raw .word directive with the original text as a trailing "!"
#   comment. rs1 and rd must be %f registers. rs2 is either an
#   even-numbered %f register or a literal; a literal starting with "0"
#   is converted with oct(), so "0x.." forms are accepted. Unknown
#   mnemonics and unencodable rs1/rd operands return the plain
#   "mnemonic\trs1,rs2,rd" text.
sub unfx {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $text = "$mnemonic\t$rs1,$rs2,$rd";

    my %opf_for = (
        "faesencx"  => 0x90,
        "faesdecx"  => 0x91,
        "faesenclx" => 0x92,
        "faesdeclx" => 0x93,
        "faeskeyx"  => 0x94,
    );

    my $opf = $opf_for{$mnemonic};
    return $text unless defined $opf;

    # second source: fold an even %f register number into its 5-bit
    # encoding, otherwise keep the operand text as given
    if ($rs2 =~ /%f([0-6]*[02468])/) {
        $rs2 = ($1 | $1 >> 5) & 31;
    }
    if ($rs2 =~ /^0/) {
        $rs2 = oct($rs2);
    }

    my @field;
    for my $operand ($rs1, $rd) {
        return $text unless $operand =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            # odd-numbered registers from %f32 upwards are not encoded
            return $text if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num >> 5) & 31;
        }
        push @field, $num;
    }
    my ($src1, $dst) = @field;

    return sprintf ".word\t0x%08x !%s",
                   2 << 30 | $dst << 25 | 0x36 << 19 | $src1 << 14 | $opf << 5 | $rs2,
                   $text;
}
1223
# unfx3src(mnemonic, rs1, rs2, rs3, rd)
#   Encodes the three-source fshiftorx instruction as a raw .word
#   directive with the original text as a trailing "!" comment. All four
#   operands must be %f registers; otherwise, or for any other mnemonic,
#   the plain "mnemonic\trs1,rs2,rs3,rd" text is returned.
sub unfx3src {
    my ($mnemonic, $rs1, $rs2, $rs3, $rd) = @_;
    my $text = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my %opf_for = ( "fshiftorx" => 0x0b );

    my $opf = $opf_for{$mnemonic};
    return $text unless defined $opf;

    my @field;
    for my $operand ($rs1, $rs2, $rs3, $rd) {
        return $text unless $operand =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            # odd-numbered registers from %f32 upwards are not encoded
            return $text if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num >> 5) & 31;
        }
        push @field, $num;
    }
    my ($src1, $src2, $src3, $dst) = @field;

    return sprintf ".word\t0x%08x !%s",
                   2 << 30 | $dst << 25 | 0x37 << 19 | $src1 << 14 | $src3 << 9 | $opf << 5 | $src2,
                   $text;
}
1249
# Post-process the accumulated perlasm text one line at a time and write
# the result to STDOUT (redirected to the output file at the top of the
# script):
#   - `...` spans are replaced by the value of eval'ing their contents;
#   - "%fN#lo" is rewritten to "%f<N+1>";
#   - the first pattern in the "or" chain that matches hands the
#     instruction to unfx/unfx3src/unvis/unvis3, which return either a
#     raw .word encoding or the instruction text unchanged.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/%f([0-9]+)#lo/sprintf "%%f%d",$1+1/ge;

    s/\b(faes[^x]{3,4}x)\s+(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                unfx($1,$2,$3,$4)
     /ge or
    s/\b([f][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                unfx3src($1,$2,$3,$4,$5)
     /ge or
    s/\b([fb][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                unvis($1,$2,$3,$4)
     /ge or
    s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                unvis3($1,$2,$3,$4)
     /ge;
    print $_,"\n";
}

# Output is buffered, so a failed write (full disk, I/O error) may only
# be reported when the handle is closed. Abort with a non-zero status
# instead of silently leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";