crypto/modes/asm/ghash-sparcv9.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # March 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
  15 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
  16 # and are expressed in cycles per processed byte, less is better:
  17 #
  18 #               gcc 3.3.x       cc 5.2          this assembler
  19 #
  20 # 32-bit build  81.4            43.3            12.6    (+546%/+244%)
  21 # 64-bit build  20.2            21.2            12.6    (+60%/+68%)
  22 #
  23 # Here is data collected on UltraSPARC T1 system running Linux:
  24 #
  25 #               gcc 4.4.1                       this assembler
  26 #
  27 # 32-bit build  566                             50      (+1000%)
  28 # 64-bit build  56                              50      (+12%)
  29 #
  30 # I don't quite understand why difference between 32-bit and 64-bit
  31 # compiler-generated code is so big. Compilers *were* instructed to
  32 # generate code for UltraSPARC and should have used 64-bit registers
  33 # for Z vector (see C code) even in 32-bit build... Oh well, it only
  34 # means more impressive improvement coefficients for this assembler
  35 # module;-) Loops are aggressively modulo-scheduled with respect to
  36 # references to input data and Z.hi updates to achieve 12 cycles
  37 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
  38 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
  39 #
  40 # October 2012
  41 #
  42 # Add VIS3 lookup-table-free implementation using polynomial
  43 # multiplication xmulx[hi] and extended addition addxc[cc]
  44 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
  45 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
  46 # saturates at ~15.5x single-process result on 8-core processor,
  47 # or ~20.5GBps per 2.85GHz socket.
  48
  49 $output=pop;
  50 open STDOUT,">$output";
  51
  52 # Symbolic frame size and stack bias; the generated code refers to them
  53 # by name.
  54 ($frame,$bias)=("STACK_FRAME","STACK_BIAS");
  55
  56 # Register names for the "4-bit" routines, handed out bank by bank.
  57
  58 # %o0-%o5: 64-bit values
  59 ($Zhi,$Zlo,$Thi,$Tlo,$rem,$tmp)=
  60         map("%o$_",(0..5));
  61
  62 # %l0-%l7: small values and pointers
  63 ($nhi,$nlo,$xi0,$xi1,$rem_4bit,$remi,$Htblo,$cnt)=
  64         map("%l$_",(0..7));
  65
  66 # %i0-%i3: input argument block
  67 ($Xi,$Htbl,$inp,$len)=
  68         map("%i$_",(0..3));
  75
     # First chunk of generated code: the 16-entry rem_4bit table (used by
     # both "4-bit" routines) followed by gcm_ghash_4bit. Arguments arrive in
     # $Xi, $Htbl, $inp and $len (%i0-%i3). The address of rem_4bit is formed
     # relative to label 1 from the %o7 value left by "call .+8". Every pass
     # of .Louter xors one 16-byte input block into Xi byte by byte, starting
     # at byte 15, looks both nibbles of each byte up in Htbl, folds the
     # nibble shifted out of Z.lo back in through rem_4bit and stores the
     # 128-bit result to Xi. It ends when $inp reaches the end pointer
     # computed on entry as $inp+$len.
     # Expressions in backticks are evaluated by the post-processing loop at
     # the end of this script.
  76 $code.=<<___;
  77 #include "sparc_arch.h"
  78
  79 #ifdef  __arch64__
  80 .register       %g2,#scratch
  81 .register       %g3,#scratch
  82 #endif
  83
  84 .section        ".text",#alloc,#execinstr
  85
  86 .align  64
  87 rem_4bit:
  88         .long   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  89         .long   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  90         .long   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  91         .long   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  92 .type   rem_4bit,#object
  93 .size   rem_4bit,(.-rem_4bit)
  94
  95 .globl  gcm_ghash_4bit
  96 .align  32
  97 gcm_ghash_4bit:
  98         save    %sp,-$frame,%sp
  99         ldub    [$inp+15],$nlo
 100         ldub    [$Xi+15],$xi0
 101         ldub    [$Xi+14],$xi1
 102         add     $len,$inp,$len
 103         add     $Htbl,8,$Htblo
 104
 105 1:      call    .+8
 106         add     %o7,rem_4bit-1b,$rem_4bit
 107
 108 .Louter:
 109         xor     $xi0,$nlo,$nlo
 110         and     $nlo,0xf0,$nhi
 111         and     $nlo,0x0f,$nlo
 112         sll     $nlo,4,$nlo
 113         ldx     [$Htblo+$nlo],$Zlo
 114         ldx     [$Htbl+$nlo],$Zhi
 115
 116         ldub    [$inp+14],$nlo
 117
 118         ldx     [$Htblo+$nhi],$Tlo
 119         and     $Zlo,0xf,$remi
 120         ldx     [$Htbl+$nhi],$Thi
 121         sll     $remi,3,$remi
 122         ldx     [$rem_4bit+$remi],$rem
 123         srlx    $Zlo,4,$Zlo
 124         mov     13,$cnt
 125         sllx    $Zhi,60,$tmp
 126         xor     $Tlo,$Zlo,$Zlo
 127         srlx    $Zhi,4,$Zhi
 128         xor     $Zlo,$tmp,$Zlo
 129
 130         xor     $xi1,$nlo,$nlo
 131         and     $Zlo,0xf,$remi
 132         and     $nlo,0xf0,$nhi
 133         and     $nlo,0x0f,$nlo
 134         ba      .Lghash_inner
 135         sll     $nlo,4,$nlo
 136 .align  32
 137 .Lghash_inner:
 138         ldx     [$Htblo+$nlo],$Tlo
 139         sll     $remi,3,$remi
 140         xor     $Thi,$Zhi,$Zhi
 141         ldx     [$Htbl+$nlo],$Thi
 142         srlx    $Zlo,4,$Zlo
 143         xor     $rem,$Zhi,$Zhi
 144         ldx     [$rem_4bit+$remi],$rem
 145         sllx    $Zhi,60,$tmp
 146         xor     $Tlo,$Zlo,$Zlo
 147         ldub    [$inp+$cnt],$nlo
 148         srlx    $Zhi,4,$Zhi
 149         xor     $Zlo,$tmp,$Zlo
 150         ldub    [$Xi+$cnt],$xi1
 151         xor     $Thi,$Zhi,$Zhi
 152         and     $Zlo,0xf,$remi
 153
 154         ldx     [$Htblo+$nhi],$Tlo
 155         sll     $remi,3,$remi
 156         xor     $rem,$Zhi,$Zhi
 157         ldx     [$Htbl+$nhi],$Thi
 158         srlx    $Zlo,4,$Zlo
 159         ldx     [$rem_4bit+$remi],$rem
 160         sllx    $Zhi,60,$tmp
 161         xor     $xi1,$nlo,$nlo
 162         srlx    $Zhi,4,$Zhi
 163         and     $nlo,0xf0,$nhi
 164         addcc   $cnt,-1,$cnt
 165         xor     $Zlo,$tmp,$Zlo
 166         and     $nlo,0x0f,$nlo
 167         xor     $Tlo,$Zlo,$Zlo
 168         sll     $nlo,4,$nlo
 169         blu     .Lghash_inner
 170         and     $Zlo,0xf,$remi
 171
 172         ldx     [$Htblo+$nlo],$Tlo
 173         sll     $remi,3,$remi
 174         xor     $Thi,$Zhi,$Zhi
 175         ldx     [$Htbl+$nlo],$Thi
 176         srlx    $Zlo,4,$Zlo
 177         xor     $rem,$Zhi,$Zhi
 178         ldx     [$rem_4bit+$remi],$rem
 179         sllx    $Zhi,60,$tmp
 180         xor     $Tlo,$Zlo,$Zlo
 181         srlx    $Zhi,4,$Zhi
 182         xor     $Zlo,$tmp,$Zlo
 183         xor     $Thi,$Zhi,$Zhi
 184
 185         add     $inp,16,$inp
 186         cmp     $inp,$len
 187         be,pn   SIZE_T_CC,.Ldone
 188         and     $Zlo,0xf,$remi
 189
 190         ldx     [$Htblo+$nhi],$Tlo
 191         sll     $remi,3,$remi
 192         xor     $rem,$Zhi,$Zhi
 193         ldx     [$Htbl+$nhi],$Thi
 194         srlx    $Zlo,4,$Zlo
 195         ldx     [$rem_4bit+$remi],$rem
 196         sllx    $Zhi,60,$tmp
 197         xor     $Tlo,$Zlo,$Zlo
 198         ldub    [$inp+15],$nlo
 199         srlx    $Zhi,4,$Zhi
 200         xor     $Zlo,$tmp,$Zlo
 201         xor     $Thi,$Zhi,$Zhi
 202         stx     $Zlo,[$Xi+8]
 203         xor     $rem,$Zhi,$Zhi
 204         stx     $Zhi,[$Xi]
 205         srl     $Zlo,8,$xi1
 206         and     $Zlo,0xff,$xi0
 207         ba      .Louter
 208         and     $xi1,0xff,$xi1
 209 .align  32
 210 .Ldone:
 211         ldx     [$Htblo+$nhi],$Tlo
 212         sll     $remi,3,$remi
 213         xor     $rem,$Zhi,$Zhi
 214         ldx     [$Htbl+$nhi],$Thi
 215         srlx    $Zlo,4,$Zlo
 216         ldx     [$rem_4bit+$remi],$rem
 217         sllx    $Zhi,60,$tmp
 218         xor     $Tlo,$Zlo,$Zlo
 219         srlx    $Zhi,4,$Zhi
 220         xor     $Zlo,$tmp,$Zlo
 221         xor     $Thi,$Zhi,$Zhi
 222         stx     $Zlo,[$Xi+8]
 223         xor     $rem,$Zhi,$Zhi
 224         stx     $Zhi,[$Xi]
 225
 226         ret
 227         restore
 228 .type   gcm_ghash_4bit,#function
 229 .size   gcm_ghash_4bit,(.-gcm_ghash_4bit)
 230 ___
 231
     # gcm_gmult_4bit takes neither an input pointer nor a length. Undefining
     # the names makes them interpolate as empty strings below - presumably
     # so that an accidental reference stands out; confirm the intent.
 232 undef $inp;
 233 undef $len;
 234
     # Second chunk of generated code: gcm_gmult_4bit, which takes $Xi and
     # $Htbl (%i0, %i1). It runs the same nibble-wise table lookup as
     # gcm_ghash_4bit over the 16 bytes of Xi alone (nothing is xor-ed in),
     # once, and stores the 128-bit result back to Xi.
 235 $code.=<<___;
 236 .globl  gcm_gmult_4bit
 237 .align  32
 238 gcm_gmult_4bit:
 239         save    %sp,-$frame,%sp
 240         ldub    [$Xi+15],$nlo
 241         add     $Htbl,8,$Htblo
 242
 243 1:      call    .+8
 244         add     %o7,rem_4bit-1b,$rem_4bit
 245
 246         and     $nlo,0xf0,$nhi
 247         and     $nlo,0x0f,$nlo
 248         sll     $nlo,4,$nlo
 249         ldx     [$Htblo+$nlo],$Zlo
 250         ldx     [$Htbl+$nlo],$Zhi
 251
 252         ldub    [$Xi+14],$nlo
 253
 254         ldx     [$Htblo+$nhi],$Tlo
 255         and     $Zlo,0xf,$remi
 256         ldx     [$Htbl+$nhi],$Thi
 257         sll     $remi,3,$remi
 258         ldx     [$rem_4bit+$remi],$rem
 259         srlx    $Zlo,4,$Zlo
 260         mov     13,$cnt
 261         sllx    $Zhi,60,$tmp
 262         xor     $Tlo,$Zlo,$Zlo
 263         srlx    $Zhi,4,$Zhi
 264         xor     $Zlo,$tmp,$Zlo
 265
 266         and     $Zlo,0xf,$remi
 267         and     $nlo,0xf0,$nhi
 268         and     $nlo,0x0f,$nlo
 269         ba      .Lgmult_inner
 270         sll     $nlo,4,$nlo
 271 .align  32
 272 .Lgmult_inner:
 273         ldx     [$Htblo+$nlo],$Tlo
 274         sll     $remi,3,$remi
 275         xor     $Thi,$Zhi,$Zhi
 276         ldx     [$Htbl+$nlo],$Thi
 277         srlx    $Zlo,4,$Zlo
 278         xor     $rem,$Zhi,$Zhi
 279         ldx     [$rem_4bit+$remi],$rem
 280         sllx    $Zhi,60,$tmp
 281         xor     $Tlo,$Zlo,$Zlo
 282         ldub    [$Xi+$cnt],$nlo
 283         srlx    $Zhi,4,$Zhi
 284         xor     $Zlo,$tmp,$Zlo
 285         xor     $Thi,$Zhi,$Zhi
 286         and     $Zlo,0xf,$remi
 287
 288         ldx     [$Htblo+$nhi],$Tlo
 289         sll     $remi,3,$remi
 290         xor     $rem,$Zhi,$Zhi
 291         ldx     [$Htbl+$nhi],$Thi
 292         srlx    $Zlo,4,$Zlo
 293         ldx     [$rem_4bit+$remi],$rem
 294         sllx    $Zhi,60,$tmp
 295         srlx    $Zhi,4,$Zhi
 296         and     $nlo,0xf0,$nhi
 297         addcc   $cnt,-1,$cnt
 298         xor     $Zlo,$tmp,$Zlo
 299         and     $nlo,0x0f,$nlo
 300         xor     $Tlo,$Zlo,$Zlo
 301         sll     $nlo,4,$nlo
 302         blu     .Lgmult_inner
 303         and     $Zlo,0xf,$remi
 304
 305         ldx     [$Htblo+$nlo],$Tlo
 306         sll     $remi,3,$remi
 307         xor     $Thi,$Zhi,$Zhi
 308         ldx     [$Htbl+$nlo],$Thi
 309         srlx    $Zlo,4,$Zlo
 310         xor     $rem,$Zhi,$Zhi
 311         ldx     [$rem_4bit+$remi],$rem
 312         sllx    $Zhi,60,$tmp
 313         xor     $Tlo,$Zlo,$Zlo
 314         srlx    $Zhi,4,$Zhi
 315         xor     $Zlo,$tmp,$Zlo
 316         xor     $Thi,$Zhi,$Zhi
 317         and     $Zlo,0xf,$remi
 318
 319         ldx     [$Htblo+$nhi],$Tlo
 320         sll     $remi,3,$remi
 321         xor     $rem,$Zhi,$Zhi
 322         ldx     [$Htbl+$nhi],$Thi
 323         srlx    $Zlo,4,$Zlo
 324         ldx     [$rem_4bit+$remi],$rem
 325         sllx    $Zhi,60,$tmp
 326         xor     $Tlo,$Zlo,$Zlo
 327         srlx    $Zhi,4,$Zhi
 328         xor     $Zlo,$tmp,$Zlo
 329         xor     $Thi,$Zhi,$Zhi
 330         stx     $Zlo,[$Xi+8]
 331         xor     $rem,$Zhi,$Zhi
 332         stx     $Zhi,[$Xi]
 333
 334         ret
 335         restore
 336 .type   gcm_gmult_4bit,#function
 337 .size   gcm_gmult_4bit,(.-gcm_gmult_4bit)
 338 ___
 339 \f
 340 {{{
 341 # Straightforward 128x128-bit multiplication using the Karatsuba algorithm
 342 # followed by a pair of 64-bit reductions [with a shortcut in the first
 343 # one, which made it possible to break the dependency between reductions
 344 # and remove one multiplication from the critical path]. While it might be
 345 # suboptimal with regard to the sheer number of multiplications, other
 346 # methods [such as aggregate reduction] would require more 64-bit
 347 # registers, which we don't have in a 32-bit application context.
 348
 349 ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
 350
 351 ($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
 352         (map("%o$_",(0..5,7)),map("%g$_",(1..5)));
 353
 354 ($shl,$shr)=map("%l$_",(0..7));
 355
 356 # For details regarding "twisted H" see ghash-x86.pl.
 357 $code.=<<___;
 358 .globl  gcm_init_vis3
 359 .align  32
 360 gcm_init_vis3:
 361         save    %sp,-$frame,%sp
 362
 363         ldx     [%i1+0],$Hhi
 364         ldx     [%i1+8],$Hlo
 365         mov     0xE1,$Xhi
 366         mov     1,$Xlo
 367         sllx    $Xhi,57,$Xhi
 368         srax    $Hhi,63,$C0             ! broadcast carry
 369         addcc   $Hlo,$Hlo,$Hlo          ! H<<=1
 370         addxc   $Hhi,$Hhi,$Hhi
 371         and     $C0,$Xlo,$Xlo
 372         and     $C0,$Xhi,$Xhi
 373         xor     $Xlo,$Hlo,$Hlo
 374         xor     $Xhi,$Hhi,$Hhi
 375         stx     $Hlo,[%i0+8]            ! save twisted H
 376         stx     $Hhi,[%i0+0]
 377
 378         sethi   %hi(0xA0406080),$V
 379         sethi   %hi(0x20C0E000),%l0
 380         or      $V,%lo(0xA0406080),$V
 381         or      %l0,%lo(0x20C0E000),%l0
 382         sllx    $V,32,$V
 383         or      %l0,$V,$V               ! (0xE0·i)&0xff=0xA040608020C0E000
 384         stx     $V,[%i0+16]
 385
 386         ret
 387         restore
 388 .type   gcm_init_vis3,#function
 389 .size   gcm_init_vis3,.-gcm_init_vis3
 390
 391 .globl  gcm_gmult_vis3
 392 .align  32
 393 gcm_gmult_vis3:
 394         save    %sp,-$frame,%sp
 395
 396         ldx     [$Xip+8],$Xlo           ! load Xi
 397         ldx     [$Xip+0],$Xhi
 398         ldx     [$Htable+8],$Hlo        ! load twisted H
 399         ldx     [$Htable+0],$Hhi
 400
 401         mov     0xE1,%l7
 402         sllx    %l7,57,$xE1             ! 57 is not a typo
 403         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 404
 405         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 406         xmulx   $Xlo,$Hlo,$C0
 407         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 408         xmulx   $C2,$Hhl,$C1
 409         xmulxhi $Xlo,$Hlo,$Xlo
 410         xmulxhi $C2,$Hhl,$C2
 411         xmulxhi $Xhi,$Hhi,$C3
 412         xmulx   $Xhi,$Hhi,$Xhi
 413
 414         sll     $C0,3,$sqr
 415         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 416         xor     $C0,$sqr,$sqr
 417         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 418
 419         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 420         xor     $Xlo,$C2,$C2
 421          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 422         xor     $C3,$C2,$C2
 423         xor     $Xlo,$C1,$C1
 424         xor     $Xhi,$C2,$C2
 425         xor     $Xhi,$C1,$C1
 426
 427         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 428          xor    $C0,$C2,$C2
 429         xmulx   $C1,$xE1,$C0
 430          xor    $C1,$C3,$C3
 431         xmulxhi $C1,$xE1,$C1
 432
 433         xor     $Xlo,$C2,$C2
 434         xor     $C0,$C2,$C2
 435         xor     $C1,$C3,$C3
 436
 437         stx     $C2,[$Xip+8]            ! save Xi
 438         stx     $C3,[$Xip+0]
 439
 440         ret
 441         restore
 442 .type   gcm_gmult_vis3,#function
 443 .size   gcm_gmult_vis3,.-gcm_gmult_vis3
 444
 445 .globl  gcm_ghash_vis3
 446 .align  32
 447 gcm_ghash_vis3:
 448         save    %sp,-$frame,%sp
 449
 450         ldx     [$Xip+8],$C2            ! load Xi
 451         ldx     [$Xip+0],$C3
 452         ldx     [$Htable+8],$Hlo        ! load twisted H
 453         ldx     [$Htable+0],$Hhi
 454
 455         mov     0xE1,%l7
 456         sllx    %l7,57,$xE1             ! 57 is not a typo
 457         ldx     [$Htable+16],$V         ! (0xE0·i)&0xff=0xA040608020C0E000
 458
 459         and     $inp,7,$shl
 460         andn    $inp,7,$inp
 461         sll     $shl,3,$shl
 462         prefetch [$inp+63], 20
 463         sub     %g0,$shl,$shr
 464
 465         xor     $Hhi,$Hlo,$Hhl          ! Karatsuba pre-processing
 466 .Loop:
 467         ldx     [$inp+8],$Xlo
 468         brz,pt  $shl,1f
 469         ldx     [$inp+0],$Xhi
 470
 471         ldx     [$inp+16],$C1           ! align data
 472         srlx    $Xlo,$shr,$C0
 473         sllx    $Xlo,$shl,$Xlo
 474         sllx    $Xhi,$shl,$Xhi
 475         srlx    $C1,$shr,$C1
 476         or      $C0,$Xhi,$Xhi
 477         or      $C1,$Xlo,$Xlo
 478 1:
 479         add     $inp,16,$inp
 480         sub     $len,16,$len
 481         xor     $C2,$Xlo,$Xlo
 482         xor     $C3,$Xhi,$Xhi
 483         prefetch [$inp+63], 20
 484
 485         xmulx   $Xlo,$Hlo,$C0
 486         xor     $Xlo,$Xhi,$C2           ! Karatsuba pre-processing
 487         xmulx   $C2,$Hhl,$C1
 488         xmulxhi $Xlo,$Hlo,$Xlo
 489         xmulxhi $C2,$Hhl,$C2
 490         xmulxhi $Xhi,$Hhi,$C3
 491         xmulx   $Xhi,$Hhi,$Xhi
 492
 493         sll     $C0,3,$sqr
 494         srlx    $V,$sqr,$sqr            ! ·0xE0 [implicit &(7<<3)]
 495         xor     $C0,$sqr,$sqr
 496         sllx    $sqr,57,$sqr            ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
 497
 498         xor     $C0,$C1,$C1             ! Karatsuba post-processing
 499         xor     $Xlo,$C2,$C2
 500          xor    $sqr,$Xlo,$Xlo          ! real destination is $C1
 501         xor     $C3,$C2,$C2
 502         xor     $Xlo,$C1,$C1
 503         xor     $Xhi,$C2,$C2
 504         xor     $Xhi,$C1,$C1
 505
 506         xmulxhi $C0,$xE1,$Xlo           ! ·0xE1<<1<<56
 507          xor    $C0,$C2,$C2
 508         xmulx   $C1,$xE1,$C0
 509          xor    $C1,$C3,$C3
 510         xmulxhi $C1,$xE1,$C1
 511
 512         xor     $Xlo,$C2,$C2
 513         xor     $C0,$C2,$C2
 514         brnz,pt $len,.Loop
 515         xor     $C1,$C3,$C3
 516
 517         stx     $C2,[$Xip+8]            ! save Xi
 518         stx     $C3,[$Xip+0]
 519
 520         ret
 521         restore
 522 .type   gcm_ghash_vis3,#function
 523 .size   gcm_ghash_vis3,.-gcm_ghash_vis3
 524 ___
 525 }}}
     # Last chunk of generated code: an identification string, followed by
     # re-alignment to a 4-byte boundary.
 526 $code.=<<___;
 527 .asciz  "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 528 .align  4
 529 ___
 530
 531 \f
 532 # The purpose of this subroutine is to explicitly encode VIS instructions,
 533 # so that one can compile the module without having to specify VIS
 534 # extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 535 # The idea is to keep open the option of producing a "universal" binary and
 536 # to let the programmer detect at run-time whether the CPU is VIS capable.
 537 sub unvis3 {
 538 # Encode one three-register instruction as a raw .word (original text kept
 539 # as a trailing comment); returns it as plain text if not recognized.
 540 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 541 my $asis = "$mnemonic\t$rs1,$rs2,$rd";
 542 my %bank = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 543 my %visopf = (  "addxc"         => 0x011,
 544                 "addxccc"       => 0x013,
 545                 "xmulx"         => 0x115,
 546                 "xmulxhi"       => 0x116        );
 547 my $opf = $visopf{$mnemonic};
 548 my @regno;
 549
 550     return $asis unless ($opf);         # not a mnemonic we encode
 551
 552     # turn %g/%o/%l/%i names into register numbers (bank base + index)
 553     foreach my $operand ($rs1,$rs2,$rd) {
 554         return $asis unless ($operand =~ /%([goli])([0-9])/);
 555         push @regno,$bank{$1}+$2;
 556     }
 557     ($rs1,$rs2,$rd) = @regno;
 558
 559     return sprintf ".word\t0x%08x !%s",0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,$asis;
 560 }
 561
 562 foreach (split("\n",$code)) {
 563         s/\`([^\`]*)\`/eval $1/ge;
 564
 565         s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 566                 &unvis3($1,$2,$3,$4)
 567          /ge;
 568
 569         print $_,"\n";
 570 }
 571
 572 close STDOUT;