crypto/modes/asm/ghash-sparcv9.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # March 2010
  11 #
  12 # The module implements "4-bit" GCM GHASH function and underlying
  13 # single multiplication operation in GF(2^128). "4-bit" means that it
  14 # uses 256 bytes per-key table [+128 bytes shared table]. Performance
  15 # results are for streamed GHASH subroutine on UltraSPARC pre-Tx CPU
  16 # and are expressed in cycles per processed byte, less is better:
  17 #
  18 #               gcc 3.3.x       cc 5.2          this assembler
  19 #
  20 # 32-bit build  81.4            43.3            12.6    (+546%/+244%)
  21 # 64-bit build  20.2            21.2            12.6    (+60%/+68%)
  22 #
  23 # Here is data collected on UltraSPARC T1 system running Linux:
  24 #
  25 #               gcc 4.4.1                       this assembler
  26 #
  27 # 32-bit build  566                             50      (+1000%)
  28 # 64-bit build  56                              50      (+12%)
  29 #
  30 # I don't quite understand why difference between 32-bit and 64-bit
  31 # compiler-generated code is so big. Compilers *were* instructed to
  32 # generate code for UltraSPARC and should have used 64-bit registers
  33 # for Z vector (see C code) even in 32-bit build... Oh well, it only
  34 # means more impressive improvement coefficients for this assembler
  35 # module;-) Loops are aggressively modulo-scheduled in respect to
  36 # references to input data and Z.hi updates to achieve 12 cycles
  37 # timing. To anchor to something else, sha1-sparcv9.pl spends 11.6
  38 # cycles to process one byte on UltraSPARC pre-Tx CPU and ~24 on T1.
  39 #
  40 # October 2012
  41 #
  42 # Add VIS3 lookup-table-free implementation using polynomial
  43 # multiplication xmulx[hi] and extended addition addxc[cc]
  44 # instructions. 3.96/6.26x improvement on T3/T4 or in absolute
  45 # terms 9.02/2.61 cycles per byte.
  46
  47 $bits=32;
  48 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
  49 if ($bits==64)  { $bias=2047; $frame=192; }
  50 else            { $bias=0;    $frame=112; }
  51
  52 $output=shift;
  53 open STDOUT,">$output";
  54
  55 $Zhi="%o0";     # 64-bit values
  56 $Zlo="%o1";
  57 $Thi="%o2";
  58 $Tlo="%o3";
  59 $rem="%o4";
  60 $tmp="%o5";
  61
  62 $nhi="%l0";     # small values and pointers
  63 $nlo="%l1";
  64 $xi0="%l2";
  65 $xi1="%l3";
  66 $rem_4bit="%l4";
  67 $remi="%l5";
  68 $Htblo="%l6";
  69 $cnt="%l7";
  70
  71 $Xi="%i0";      # input argument block
  72 $Htbl="%i1";
  73 $inp="%i2";
  74 $len="%i3";
  75
  76 $code.=<<___ if ($bits==64);
  77 .register       %g2,#scratch
  78 .register       %g3,#scratch
  79 ___
  80 $code.=<<___;
  81 .section        ".text",#alloc,#execinstr
  82
  83 .align  64
  84 rem_4bit:
  85         .long   `0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
  86         .long   `0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
  87         .long   `0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
  88         .long   `0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
  89 .type   rem_4bit,#object
  90 .size   rem_4bit,(.-rem_4bit)
  91
  92 .globl  gcm_ghash_4bit
  93 .align  32
  94 gcm_ghash_4bit:
  95         save    %sp,-$frame,%sp
  96         ldub    [$inp+15],$nlo
  97         ldub    [$Xi+15],$xi0
  98         ldub    [$Xi+14],$xi1
  99         add     $len,$inp,$len
 100         add     $Htbl,8,$Htblo
 101
 102 1:      call    .+8
 103         add     %o7,rem_4bit-1b,$rem_4bit
 104
 105 .Louter:
 106         xor     $xi0,$nlo,$nlo
 107         and     $nlo,0xf0,$nhi
 108         and     $nlo,0x0f,$nlo
 109         sll     $nlo,4,$nlo
 110         ldx     [$Htblo+$nlo],$Zlo
 111         ldx     [$Htbl+$nlo],$Zhi
 112
 113         ldub    [$inp+14],$nlo
 114
 115         ldx     [$Htblo+$nhi],$Tlo
 116         and     $Zlo,0xf,$remi
 117         ldx     [$Htbl+$nhi],$Thi
 118         sll     $remi,3,$remi
 119         ldx     [$rem_4bit+$remi],$rem
 120         srlx    $Zlo,4,$Zlo
 121         mov     13,$cnt
 122         sllx    $Zhi,60,$tmp
 123         xor     $Tlo,$Zlo,$Zlo
 124         srlx    $Zhi,4,$Zhi
 125         xor     $Zlo,$tmp,$Zlo
 126
 127         xor     $xi1,$nlo,$nlo
 128         and     $Zlo,0xf,$remi
 129         and     $nlo,0xf0,$nhi
 130         and     $nlo,0x0f,$nlo
 131         ba      .Lghash_inner
 132         sll     $nlo,4,$nlo
 133 .align  32
 134 .Lghash_inner:
 135         ldx     [$Htblo+$nlo],$Tlo
 136         sll     $remi,3,$remi
 137         xor     $Thi,$Zhi,$Zhi
 138         ldx     [$Htbl+$nlo],$Thi
 139         srlx    $Zlo,4,$Zlo
 140         xor     $rem,$Zhi,$Zhi
 141         ldx     [$rem_4bit+$remi],$rem
 142         sllx    $Zhi,60,$tmp
 143         xor     $Tlo,$Zlo,$Zlo
 144         ldub    [$inp+$cnt],$nlo
 145         srlx    $Zhi,4,$Zhi
 146         xor     $Zlo,$tmp,$Zlo
 147         ldub    [$Xi+$cnt],$xi1
 148         xor     $Thi,$Zhi,$Zhi
 149         and     $Zlo,0xf,$remi
 150
 151         ldx     [$Htblo+$nhi],$Tlo
 152         sll     $remi,3,$remi
 153         xor     $rem,$Zhi,$Zhi
 154         ldx     [$Htbl+$nhi],$Thi
 155         srlx    $Zlo,4,$Zlo
 156         ldx     [$rem_4bit+$remi],$rem
 157         sllx    $Zhi,60,$tmp
 158         xor     $xi1,$nlo,$nlo
 159         srlx    $Zhi,4,$Zhi
 160         and     $nlo,0xf0,$nhi
 161         addcc   $cnt,-1,$cnt
 162         xor     $Zlo,$tmp,$Zlo
 163         and     $nlo,0x0f,$nlo
 164         xor     $Tlo,$Zlo,$Zlo
 165         sll     $nlo,4,$nlo
 166         blu     .Lghash_inner
 167         and     $Zlo,0xf,$remi
 168
 169         ldx     [$Htblo+$nlo],$Tlo
 170         sll     $remi,3,$remi
 171         xor     $Thi,$Zhi,$Zhi
 172         ldx     [$Htbl+$nlo],$Thi
 173         srlx    $Zlo,4,$Zlo
 174         xor     $rem,$Zhi,$Zhi
 175         ldx     [$rem_4bit+$remi],$rem
 176         sllx    $Zhi,60,$tmp
 177         xor     $Tlo,$Zlo,$Zlo
 178         srlx    $Zhi,4,$Zhi
 179         xor     $Zlo,$tmp,$Zlo
 180         xor     $Thi,$Zhi,$Zhi
 181
 182         add     $inp,16,$inp
 183         cmp     $inp,$len
 184         be,pn   `$bits==64?"%xcc":"%icc"`,.Ldone
 185         and     $Zlo,0xf,$remi
 186
 187         ldx     [$Htblo+$nhi],$Tlo
 188         sll     $remi,3,$remi
 189         xor     $rem,$Zhi,$Zhi
 190         ldx     [$Htbl+$nhi],$Thi
 191         srlx    $Zlo,4,$Zlo
 192         ldx     [$rem_4bit+$remi],$rem
 193         sllx    $Zhi,60,$tmp
 194         xor     $Tlo,$Zlo,$Zlo
 195         ldub    [$inp+15],$nlo
 196         srlx    $Zhi,4,$Zhi
 197         xor     $Zlo,$tmp,$Zlo
 198         xor     $Thi,$Zhi,$Zhi
 199         stx     $Zlo,[$Xi+8]
 200         xor     $rem,$Zhi,$Zhi
 201         stx     $Zhi,[$Xi]
 202         srl     $Zlo,8,$xi1
 203         and     $Zlo,0xff,$xi0
 204         ba      .Louter
 205         and     $xi1,0xff,$xi1
 206 .align  32
 207 .Ldone:
 208         ldx     [$Htblo+$nhi],$Tlo
 209         sll     $remi,3,$remi
 210         xor     $rem,$Zhi,$Zhi
 211         ldx     [$Htbl+$nhi],$Thi
 212         srlx    $Zlo,4,$Zlo
 213         ldx     [$rem_4bit+$remi],$rem
 214         sllx    $Zhi,60,$tmp
 215         xor     $Tlo,$Zlo,$Zlo
 216         srlx    $Zhi,4,$Zhi
 217         xor     $Zlo,$tmp,$Zlo
 218         xor     $Thi,$Zhi,$Zhi
 219         stx     $Zlo,[$Xi+8]
 220         xor     $rem,$Zhi,$Zhi
 221         stx     $Zhi,[$Xi]
 222
 223         ret
 224         restore
 225 .type   gcm_ghash_4bit,#function
 226 .size   gcm_ghash_4bit,(.-gcm_ghash_4bit)
 227 ___
 228
 229 undef $inp;
 230 undef $len;
 231
 232 $code.=<<___;
 233 .globl  gcm_gmult_4bit
 234 .align  32
 235 gcm_gmult_4bit:
 236         save    %sp,-$frame,%sp
 237         ldub    [$Xi+15],$nlo
 238         add     $Htbl,8,$Htblo
 239
 240 1:      call    .+8
 241         add     %o7,rem_4bit-1b,$rem_4bit
 242
 243         and     $nlo,0xf0,$nhi
 244         and     $nlo,0x0f,$nlo
 245         sll     $nlo,4,$nlo
 246         ldx     [$Htblo+$nlo],$Zlo
 247         ldx     [$Htbl+$nlo],$Zhi
 248
 249         ldub    [$Xi+14],$nlo
 250
 251         ldx     [$Htblo+$nhi],$Tlo
 252         and     $Zlo,0xf,$remi
 253         ldx     [$Htbl+$nhi],$Thi
 254         sll     $remi,3,$remi
 255         ldx     [$rem_4bit+$remi],$rem
 256         srlx    $Zlo,4,$Zlo
 257         mov     13,$cnt
 258         sllx    $Zhi,60,$tmp
 259         xor     $Tlo,$Zlo,$Zlo
 260         srlx    $Zhi,4,$Zhi
 261         xor     $Zlo,$tmp,$Zlo
 262
 263         and     $Zlo,0xf,$remi
 264         and     $nlo,0xf0,$nhi
 265         and     $nlo,0x0f,$nlo
 266         ba      .Lgmult_inner
 267         sll     $nlo,4,$nlo
 268 .align  32
 269 .Lgmult_inner:
 270         ldx     [$Htblo+$nlo],$Tlo
 271         sll     $remi,3,$remi
 272         xor     $Thi,$Zhi,$Zhi
 273         ldx     [$Htbl+$nlo],$Thi
 274         srlx    $Zlo,4,$Zlo
 275         xor     $rem,$Zhi,$Zhi
 276         ldx     [$rem_4bit+$remi],$rem
 277         sllx    $Zhi,60,$tmp
 278         xor     $Tlo,$Zlo,$Zlo
 279         ldub    [$Xi+$cnt],$nlo
 280         srlx    $Zhi,4,$Zhi
 281         xor     $Zlo,$tmp,$Zlo
 282         xor     $Thi,$Zhi,$Zhi
 283         and     $Zlo,0xf,$remi
 284
 285         ldx     [$Htblo+$nhi],$Tlo
 286         sll     $remi,3,$remi
 287         xor     $rem,$Zhi,$Zhi
 288         ldx     [$Htbl+$nhi],$Thi
 289         srlx    $Zlo,4,$Zlo
 290         ldx     [$rem_4bit+$remi],$rem
 291         sllx    $Zhi,60,$tmp
 292         srlx    $Zhi,4,$Zhi
 293         and     $nlo,0xf0,$nhi
 294         addcc   $cnt,-1,$cnt
 295         xor     $Zlo,$tmp,$Zlo
 296         and     $nlo,0x0f,$nlo
 297         xor     $Tlo,$Zlo,$Zlo
 298         sll     $nlo,4,$nlo
 299         blu     .Lgmult_inner
 300         and     $Zlo,0xf,$remi
 301
 302         ldx     [$Htblo+$nlo],$Tlo
 303         sll     $remi,3,$remi
 304         xor     $Thi,$Zhi,$Zhi
 305         ldx     [$Htbl+$nlo],$Thi
 306         srlx    $Zlo,4,$Zlo
 307         xor     $rem,$Zhi,$Zhi
 308         ldx     [$rem_4bit+$remi],$rem
 309         sllx    $Zhi,60,$tmp
 310         xor     $Tlo,$Zlo,$Zlo
 311         srlx    $Zhi,4,$Zhi
 312         xor     $Zlo,$tmp,$Zlo
 313         xor     $Thi,$Zhi,$Zhi
 314         and     $Zlo,0xf,$remi
 315
 316         ldx     [$Htblo+$nhi],$Tlo
 317         sll     $remi,3,$remi
 318         xor     $rem,$Zhi,$Zhi
 319         ldx     [$Htbl+$nhi],$Thi
 320         srlx    $Zlo,4,$Zlo
 321         ldx     [$rem_4bit+$remi],$rem
 322         sllx    $Zhi,60,$tmp
 323         xor     $Tlo,$Zlo,$Zlo
 324         srlx    $Zhi,4,$Zhi
 325         xor     $Zlo,$tmp,$Zlo
 326         xor     $Thi,$Zhi,$Zhi
 327         stx     $Zlo,[$Xi+8]
 328         xor     $rem,$Zhi,$Zhi
 329         stx     $Zhi,[$Xi]
 330
 331         ret
 332         restore
 333 .type   gcm_gmult_4bit,#function
 334 .size   gcm_gmult_4bit,(.-gcm_gmult_4bit)
 335 ___
 336 \f
 337 {{{
 338 # Straightforward 64-bits-at-a-time approach with pair of 128x64-bit
 339 # multiplications followed by 64-bit reductions. While it might be
 340 # suboptimal with regard to sheer amount of multiplications, other
 341 # methods would require larger amount of 64-bit registers, which we
 342 # don't have in 32-bit application. Also, they [alternative methods
 343 # such as aggregated reduction] kind of thrive on fast 128-bit SIMD
 344 # instructions and these are not option on SPARC...
 345
 346 ($Xip,$Htable,$inp,$len)=map("%i$_",(0..3));
 347
 348 ($xE1,$Hhi,$Hlo,$Rhi,$Rlo,$M0hi,$M0lo,$M1hi,$M1lo,$Zhi,$Zlo,$X)=
 349         (map("%g$_",(1..5)),map("%o$_",(0..5,7)));
 350 ($shl,$shr)=map("%l$_",(0..7));
 351
 352 $code.=<<___;
 353 .globl  gcm_gmult_vis3
 354 .align  32
 355 gcm_gmult_vis3:
 356         save    %sp,-$frame,%sp
 357
 358         ldx     [$Xip+8],$X             ! load X.lo
 359         ldx     [$Htable-8], $Hlo       ! load H
 360         ldx     [$Htable-16],$Hhi
 361         mov     0xE1,$xE1
 362         sllx    $xE1,57,$xE1
 363
 364         xmulx   $X,$Hlo,$M0lo           ! H·X.lo
 365         xmulxhi $X,$Hlo,$M0hi
 366         xmulx   $X,$Hhi,$M1lo
 367         xmulxhi $X,$Hhi,$M1hi
 368         ldx     [$Xip+0],$X             ! load X.hi
 369
 370         addcc   $M0lo,$M0lo,$M0lo       ! (H·X.lo)<<1
 371         xor     $M0hi,$M1lo,$M1lo
 372
 373         xmulx   $xE1,$M0lo,$Rlo         ! res=Z.lo·(0xE1<<57)
 374         xmulxhi $xE1,$M0lo,$Rhi
 375
 376         addxccc $M1lo,$M1lo,$Zlo        ! Z=((H·X.lo)<<1)>>64
 377         addxc   $M1hi,$M1hi,$Zhi
 378         xor     $M0lo,$Zhi,$Zhi         ! overflow bit from 0xE1<<57
 379
 380         xmulx   $X,$Hlo,$M0lo           ! H·X.hi
 381         xmulxhi $X,$Hlo,$M0hi
 382         xmulx   $X,$Hhi,$M1lo
 383         xmulxhi $X,$Hhi,$M1hi
 384
 385         xor     $Rlo,$Zlo,$Zlo          ! Z^=res
 386         xor     $Rhi,$Zhi,$Zhi
 387
 388         addcc   $M0lo,$M0lo,$M0lo       ! (H·X.lo)<<1
 389         xor     $Zlo, $M0lo,$M0lo
 390         xor     $M0hi,$M1lo,$M1lo
 391
 392         xmulx   $xE1,$M0lo,$Rlo         ! res=Z.lo·(0xE1<<57)
 393         xmulxhi $xE1,$M0lo,$Rhi
 394
 395         addxccc $M1lo,$M1lo,$M1lo
 396         addxc   $M1hi,$M1hi,$M1hi
 397
 398         xor     $M1lo,$Zhi,$Zlo         ! Z=(Z^(H·X.hi)<<1)>>64
 399         xor     $M0lo,$M1hi,$Zhi        ! overflow bit from 0xE1<<57
 400
 401         xor     $Rlo,$Zlo,$Zlo          ! Z^=res
 402         xor     $Rhi,$Zhi,$Zhi
 403
 404         stx     $Zlo,[$Xip+8]           ! save Xi
 405         stx     $Zhi,[$Xip+0]
 406
 407         ret
 408         restore
 409 .type   gcm_gmult_vis3,#function
 410 .size   gcm_gmult_vis3,.-gcm_gmult_vis3
 411
 412 .globl  gcm_ghash_vis3
 413 .align  32
 414 gcm_ghash_vis3:
 415         save    %sp,-$frame,%sp
 416
 417         ldx     [$Xip+0],$Zhi           ! load X.hi
 418         ldx     [$Xip+8],$Zlo           ! load X.lo
 419         and     $inp,7,$shl
 420         andn    $inp,7,$inp
 421         ldx     [$Htable-8], $Hlo       ! load H
 422         ldx     [$Htable-16],$Hhi
 423         sll     $shl,3,$shl
 424         prefetch [$inp+63], 20
 425         mov     0xE1,$xE1
 426         sub     %g0,$shl,$shr
 427         sllx    $xE1,57,$xE1
 428
 429 .Loop:
 430         ldx     [$inp+8],$Rlo           ! load *inp
 431         brz,pt  $shl,1f
 432         ldx     [$inp+0],$Rhi
 433
 434         ldx     [$inp+16],$X            ! align data
 435         srlx    $Rlo,$shr,$M0lo
 436         sllx    $Rlo,$shl,$Rlo
 437         sllx    $Rhi,$shl,$Rhi
 438         srlx    $X,$shr,$X
 439         or      $M0lo,$Rhi,$Rhi
 440         or      $X,$Rlo,$Rlo
 441
 442 1:
 443         add     $inp,16,$inp
 444         sub     $len,16,$len
 445         xor     $Rlo,$Zlo,$X
 446         prefetch [$inp+63], 20
 447
 448         xmulx   $X,$Hlo,$M0lo           ! H·X.lo
 449         xmulxhi $X,$Hlo,$M0hi
 450         xmulx   $X,$Hhi,$M1lo
 451         xmulxhi $X,$Hhi,$M1hi
 452         xor     $Rhi,$Zhi,$X
 453
 454         addcc   $M0lo,$M0lo,$M0lo       ! (H·X.lo)<<1
 455         xor     $M0hi,$M1lo,$M1lo
 456
 457         xmulx   $xE1,$M0lo,$Rlo         ! res=Z.lo·(0xE1<<57)
 458         xmulxhi $xE1,$M0lo,$Rhi
 459
 460         addxccc $M1lo,$M1lo,$Zlo        ! Z=((H·X.lo)<<1)>>64
 461         addxc   $M1hi,$M1hi,$Zhi
 462         xor     $M0lo,$Zhi,$Zhi         ! overflow bit from 0xE1<<57
 463
 464         xmulx   $X,$Hlo,$M0lo           ! H·X.hi
 465         xmulxhi $X,$Hlo,$M0hi
 466         xmulx   $X,$Hhi,$M1lo
 467         xmulxhi $X,$Hhi,$M1hi
 468
 469         xor     $Rlo,$Zlo,$Zlo          ! Z^=res
 470         xor     $Rhi,$Zhi,$Zhi
 471
 472         addcc   $M0lo,$M0lo,$M0lo       ! (H·X.lo)<<1
 473         xor     $Zlo, $M0lo,$M0lo
 474         xor     $M0hi,$M1lo,$M1lo
 475
 476         xmulx   $xE1,$M0lo,$Rlo         ! res=Z.lo·(0xE1<<57)
 477         xmulxhi $xE1,$M0lo,$Rhi
 478
 479         addxccc $M1lo,$M1lo,$M1lo
 480         addxc   $M1hi,$M1hi,$M1hi
 481
 482         xor     $M1lo,$Zhi,$Zlo         ! Z=(Z^(H·X.hi)<<1)>>64
 483         xor     $M0lo,$M1hi,$Zhi        ! overflow bit from 0xE1<<57
 484
 485         xor     $Rlo,$Zlo,$Zlo          ! Z^=res
 486         brnz,pt $len,.Loop
 487         xor     $Rhi,$Zhi,$Zhi
 488
 489         stx     $Zlo,[$Xip+8]           ! save Xi
 490         stx     $Zhi,[$Xip+0]
 491
 492         ret
 493         restore
 494 .type   gcm_ghash_vis3,#function
 495 .size   gcm_ghash_vis3,.-gcm_ghash_vis3
 496 ___
 497 }}}
 498 $code.=<<___;
 499 .asciz  "GHASH for SPARCv9/VIS3, CRYPTOGAMS by <appro\@openssl.org>"
 500 .align  4
 501 ___
 502
 503 \f
 504 # Purpose of these subroutines is to explicitly encode VIS instructions,
 505 # so that one can compile the module without having to specify VIS
 506 # extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 507 # Idea is to reserve for option to produce "universal" binary and let
 508 # programmer detect if current CPU is VIS capable at run-time.
 509 sub unvis3 {
 510 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 511 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 512 my ($ref,$opf);
 513 my %visopf = (  "addxc"         => 0x011,
 514                 "addxccc"       => 0x013,
 515                 "xmulx"         => 0x115,
 516                 "xmulxhi"       => 0x116        );
 517
 518     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 519
 520     if ($opf=$visopf{$mnemonic}) {
 521         foreach ($rs1,$rs2,$rd) {
 522             return $ref if (!/%([goli])([0-9])/);
 523             $_=$bias{$1}+$2;
 524         }
 525
 526         return  sprintf ".word\t0x%08x !%s",
 527                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 528                         $ref;
 529     } else {
 530         return $ref;
 531     }
 532 }
 533
 534 foreach (split("\n",$code)) {
 535         s/\`([^\`]*)\`/eval $1/ge;
 536
 537         s/\b(xmulx[hi]*|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 538                 &unvis3($1,$2,$3,$4)
 539          /ge;
 540
 541         print $_,"\n";
 542 }
 543
 544 close STDOUT;