2 # Copyright 2010-2020 The OpenSSL Project Authors. All Rights Reserved.
4 # Licensed under the Apache License 2.0 (the "License"). You may not use
5 # this file except in compliance with the License. You can obtain a copy
6 # in the file LICENSE in the source distribution or at
7 # https://www.openssl.org/source/license.html
10 # ====================================================================
11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
12 # project. The module is, however, dual licensed under OpenSSL and
13 # CRYPTOGAMS licenses depending on where you obtain it. For further
14 # details see http://www.openssl.org/~appro/cryptogams/.
15 # ====================================================================
19 # The module implements "4-bit" GCM GHASH function and underlying
20 # single multiplication operation in GF(2^128). "4-bit" means that it
# uses a 256-byte per-key table [+128 bytes of shared table]. Performance
22 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
23 # and are expressed in cycles per processed byte, less is better:
25 # gcc 3.3.x cc 5.2 this assembler
27 # 32-bit build 81.4 43.3 12.6 (+546%/+244%)
28 # 64-bit build 20.2 21.2 12.6 (+60%/+68%)
30 # Here is data collected on UltraSPARC T1 system running Linux:
32 # gcc 4.4.1 this assembler
34 # 32-bit build 566 50 (+1000%)
35 # 64-bit build 56 50 (+12%)
# I don't quite understand why the difference between 32-bit and 64-bit
38 # compiler-generated code is so big. Compilers *were* instructed to
39 # generate code for UltraSPARC and should have used 64-bit registers
40 # for Z vector (see C code) even in 32-bit build... Oh well, it only
41 # means more impressive improvement coefficients for this assembler
# module;-) Loops are aggressively modulo-scheduled with respect to
43 # references to input data and Z.hi updates to achieve 12 cycles
44 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
45 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
49 # Add VIS3 lookup-table-free implementation using polynomial
50 # multiplication xmulx[hi] and extended addition addxc[cc]
51 # instructions. 4.52/7.63x improvement on T3/T4 or in absolute
52 # terms 7.90/2.14 cycles per byte. On T4 multi-process benchmark
53 # saturates at ~15.5x single-process result on 8-core processor,
54 # or ~20.5GBps per 2.85GHz socket.
# Optional first command-line argument names the output file.  The
# generated assembler is printed to STDOUT, so redirect STDOUT there
# when an argument is given.  Use 3-argument open with an error check
# instead of the original unchecked 2-argument open (which reported
# nothing on failure and interpolated the filename into the mode).
$output = pop;
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
# Perl variables naming the SPARC registers referenced by the 4-bit
# GHASH assembler templates below; the strings are interpolated
# directly into the emitted code.
$Zhi="%o0"; # 64-bit values
$nhi="%l0"; # small values and pointers
$Xi="%i0"; # input argument block
84 # define __ASSEMBLER__ 1
86 #include "crypto/sparc_arch.h"
89 .register %g2,#scratch
90 .register %g3,#scratch
93 .section ".text",#alloc,#execinstr
97 .long `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
98 .long `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
99 .long `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
100 .long `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
101 .type rem_4bit,#object
102 .size rem_4bit,(.-rem_4bit)
104 .globl gcm_ghash_4bit
115 add %o7,rem_4bit-1b,$rem_4bit
122 ldx [$Htblo+$nlo],$Zlo
123 ldx [$Htbl+$nlo],$Zhi
127 ldx [$Htblo+$nhi],$Tlo
129 ldx [$Htbl+$nhi],$Thi
131 ldx [$rem_4bit+$remi],$rem
147 ldx [$Htblo+$nlo],$Tlo
150 ldx [$Htbl+$nlo],$Thi
153 ldx [$rem_4bit+$remi],$rem
156 ldub [$inp+$cnt],$nlo
163 ldx [$Htblo+$nhi],$Tlo
166 ldx [$Htbl+$nhi],$Thi
168 ldx [$rem_4bit+$remi],$rem
181 ldx [$Htblo+$nlo],$Tlo
184 ldx [$Htbl+$nlo],$Thi
187 ldx [$rem_4bit+$remi],$rem
196 be,pn SIZE_T_CC,.Ldone
199 ldx [$Htblo+$nhi],$Tlo
202 ldx [$Htbl+$nhi],$Thi
204 ldx [$rem_4bit+$remi],$rem
220 ldx [$Htblo+$nhi],$Tlo
223 ldx [$Htbl+$nhi],$Thi
225 ldx [$rem_4bit+$remi],$rem
237 .type gcm_ghash_4bit,#function
238 .size gcm_ghash_4bit,(.-gcm_ghash_4bit)
245 .globl gcm_gmult_4bit
253 add %o7,rem_4bit-1b,$rem_4bit
258 ldx [$Htblo+$nlo],$Zlo
259 ldx [$Htbl+$nlo],$Zhi
263 ldx [$Htblo+$nhi],$Tlo
265 ldx [$Htbl+$nhi],$Thi
267 ldx [$rem_4bit+$remi],$rem
282 ldx [$Htblo+$nlo],$Tlo
285 ldx [$Htbl+$nlo],$Thi
288 ldx [$rem_4bit+$remi],$rem
297 ldx [$Htblo+$nhi],$Tlo
300 ldx [$Htbl+$nhi],$Thi
302 ldx [$rem_4bit+$remi],$rem
314 ldx [$Htblo+$nlo],$Tlo
317 ldx [$Htbl+$nlo],$Thi
320 ldx [$rem_4bit+$remi],$rem
328 ldx [$Htblo+$nhi],$Tlo
331 ldx [$Htbl+$nhi],$Thi
333 ldx [$rem_4bit+$remi],$rem
345 .type gcm_gmult_4bit,#function
346 .size gcm_gmult_4bit,(.-gcm_gmult_4bit)
350 # Straightforward 128x128-bit multiplication using Karatsuba algorithm
351 # followed by pair of 64-bit reductions [with a shortcut in first one,
# which allowed breaking the dependency between reductions and removing one
353 # multiplication from critical path]. While it might be suboptimal
354 # with regard to sheer number of multiplications, other methods [such
355 # as aggregate reduction] would require more 64-bit registers, which
356 # we don't have in 32-bit application context.
# Register assignments for the VIS3 code path.  Input arguments arrive
# in the %i registers; working values use %o0-%o5/%o7 and %g1-%g5
# (needed because the Karatsuba arrangement wants many 64-bit
# temporaries); $shl/$shr take local registers.  Note map over (0..7)
# yields eight names but only the first two are consumed here.
($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
($Hhl,$Hlo,$Hhi,$Xlo,$Xhi,$xE1,$sqr, $C0,$C1,$C2,$C3,$V)=
	(map("%o$_",(0..5,7)),map("%g$_",(1..5)));
($shl,$shr)=map("%l$_",(0..7));
365 # For details regarding "twisted H" see ghash-x86.pl.
377 srax $Hhi,63,$C0 ! broadcast carry
378 addcc $Hlo,$Hlo,$Hlo ! H<<=1
384 stx $Hlo,[%i0+8] ! save twisted H
387 sethi %hi(0xA0406080),$V
388 sethi %hi(0x20C0E000),%l0
389 or $V,%lo(0xA0406080),$V
390 or %l0,%lo(0x20C0E000),%l0
392 or %l0,$V,$V ! (0xE0·i)&0xff=0xA040608020C0E000
397 .type gcm_init_vis3,#function
398 .size gcm_init_vis3,.-gcm_init_vis3
400 .globl gcm_gmult_vis3
405 ldx [$Xip+8],$Xlo ! load Xi
407 ldx [$Htable+8],$Hlo ! load twisted H
411 sllx %l7,57,$xE1 ! 57 is not a typo
412 ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
414 xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
416 xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
418 xmulxhi $Xlo,$Hlo,$Xlo
420 xmulxhi $Xhi,$Hhi,$C3
424 srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
426 sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
428 xor $C0,$C1,$C1 ! Karatsuba post-processing
430 xor $sqr,$Xlo,$Xlo ! real destination is $C1
436 xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
446 stx $C2,[$Xip+8] ! save Xi
451 .type gcm_gmult_vis3,#function
452 .size gcm_gmult_vis3,.-gcm_gmult_vis3
454 .globl gcm_ghash_vis3
459 srln $len,0,$len ! needed on v8+, "nop" on v9
461 ldx [$Xip+8],$C2 ! load Xi
463 ldx [$Htable+8],$Hlo ! load twisted H
467 sllx %l7,57,$xE1 ! 57 is not a typo
468 ldx [$Htable+16],$V ! (0xE0·i)&0xff=0xA040608020C0E000
473 prefetch [$inp+63], 20
476 xor $Hhi,$Hlo,$Hhl ! Karatsuba pre-processing
482 ldx [$inp+16],$C1 ! align data
494 prefetch [$inp+63], 20
497 xor $Xlo,$Xhi,$C2 ! Karatsuba pre-processing
499 xmulxhi $Xlo,$Hlo,$Xlo
501 xmulxhi $Xhi,$Hhi,$C3
505 srlx $V,$sqr,$sqr ! ·0xE0 [implicit &(7<<3)]
507 sllx $sqr,57,$sqr ! ($C0·0xE1)<<1<<56 [implicit &0x7f]
509 xor $C0,$C1,$C1 ! Karatsuba post-processing
511 xor $sqr,$Xlo,$Xlo ! real destination is $C1
517 xmulxhi $C0,$xE1,$Xlo ! ·0xE1<<1<<56
528 stx $C2,[$Xip+8] ! save Xi
533 .type gcm_ghash_vis3,#function
534 .size gcm_ghash_vis3,.-gcm_ghash_vis3
538 .asciz "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
543 # Purpose of these subroutines is to explicitly encode VIS instructions,
544 # so that one can compile the module without having to specify VIS
545 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option of producing a "universal" binary and let
547 # programmer detect if current CPU is VIS capable at run-time.
549 my ($mnemonic,$rs1,$rs2,$rd)=@_;
550 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
552 my %visopf = ( "addxc" => 0x011,
555 "xmulxhi" => 0x116 );
557 $ref = "$mnemonic\t$rs1,$rs2,$rd";
559 if ($opf=$visopf{$mnemonic}) {
560 foreach ($rs1,$rs2,$rd) {
561 return $ref if (!/%([goli])([0-9])/);
565 return sprintf ".word\t0x%08x !%s",
566 0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
573 foreach (split("\n",$code)) {
574 s/\`([^\`]*)\`/eval $1/ge;
576 s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
583 close STDOUT or die "error closing STDOUT: $!";