crypto/poly1305/asm/poly1305-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the OpenSSL license (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9 #
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 # ====================================================================
  16 #
  17 # This module implements Poly1305 hash for SPARCv9, vanilla, as well
  18 # as VIS3 and FMA extensions.
  19 #
  20 # May, August 2015
  21 #
  22 # Numbers are cycles per processed byte with poly1305_blocks alone.
  23 #
  24 #                       IALU(*)         FMA
  25 #
  26 # UltraSPARC III        12.3(**)
  27 # SPARC T3              7.92
  28 # SPARC T4              1.70(***)       6.55
  29 # SPARC64 X             5.60            3.64
  30 #
   31 # (*)   Comparison to compiler-generated code is really problematic,
   32 #       because the latter's performance varies too much depending on
   33 #       too many variables. For example, one can measure from 5x to 15x
   34 #       improvement on T4 for gcc-4.6. In the T4 case that is a bit
   35 #       unfair, because the compiler doesn't use VIS3, but given the
   36 #       same initial conditions the coefficient varies from 3x to 9x.
  37 # (**)  Pre-III performance should be even worse; floating-point
  38 #       performance for UltraSPARC I-IV on the other hand is reported
  39 #       to be 4.25 for hand-coded assembly, but they are just too old
  40 #       to care about.
  41 # (***) Multi-process benchmark saturates at ~12.5x single-process
  42 #       result on 8-core processor, or ~21GBps per 2.85GHz socket.
  43
  44 my $output = pop;
  45 open STDOUT,">$output";
  46
  47 my ($ctx,$inp,$len,$padbit,$shl,$shr)   = map("%i$_",(0..5));
  48 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)    = map("%l$_",(0..7));
  49 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)       = map("%o$_",(0..5,7));
  50 my ($d0,$d1,$d2,$d3)                    = map("%g$_",(1..4));
  51
  52 my $output = pop;
  53 open STDOUT,">$stdout";
  54
      # Integer-only code path: assembler preamble, poly1305_init and
      # poly1305_blocks.  The register names interpolated below ($ctx, $inp,
      # ...) come from the my(...) = map(...) lists earlier in the script.
      #
      # Preamble: STPTR/SIZE_T select 64- or 32-bit pointer stores depending
      # on __arch64__; LOCALS is STACK_BIAS+STACK_FRAME.
      #
      # poly1305_init (%i0 = context, %i1 = key, %i2 = location that receives
      # two function pointers):
      #   * reads OPENSSL_sparcv9cap_P; when FMADD is set and VIS3 is not, it
      #     branches to .Lpoly1305_init_fma (emitted by a later heredoc),
      #     which continues in the register window opened here;
      #   * otherwise zeroes 24 bytes of hash state at ctx+0 and returns 0 if
      #     the key pointer is zero;
      #   * reads 16 key bytes little-endian from a possibly unaligned address
      #     (the pointer is rounded down to 8; a third doubleword is fetched
      #     and the pieces are shifted together only when misaligned), masks
      #     them with 0x0ffffffc0fffffff / 0x0ffffffc0ffffffc and stores the
      #     result at ctx+32 and ctx+40;
      #   * returns 0 when VIS3 is absent; with VIS3 it stores the addresses
      #     of poly1305_blocks_vis3 and poly1305_emit through %i2 and
      #     returns 1.
      #
      # poly1305_blocks (%i0 = context, %i1 = input, %i2 = length in bytes,
      # %i3 = pad bit):
      #   * the length is rounded down to a multiple of 16; nothing is done
      #     when that leaves 0;
      #   * key and hash are reloaded as 32-bit words; the word at offset +4
      #     of each 8-byte slot is used as the lower limb (r0, r2, h0, h2);
      #   * s1..s3 = r + (r >> 2) for r1..r3;
      #   * every .Loop iteration consumes 16 input bytes: the block and the
      #     pad bit are added to h0..h4, twenty umul products are summed in
      #     d0..d3, carries are propagated, and the part of h4 above its two
      #     low bits is folded back into h0 ("final reduction step");
      #   * d1/d2 serve both as the +8/+16 load offsets and as accumulators,
      #     which is why they are re-initialised with "set" inside the loop;
      #   * an instruction written directly after a branch, call or ret sits
      #     in its delay slot, so the statement order must not be changed.
   55 $code.=<<___;
   56 #include "sparc_arch.h"
   57
   58 #ifdef  __arch64__
   59 .register       %g2,#scratch
   60 .register       %g3,#scratch
   61 # define        STPTR   stx
   62 # define        SIZE_T  8
   63 #else
   64 # define        STPTR   st
   65 # define        SIZE_T  4
   66 #endif
   67 #define LOCALS  (STACK_BIAS+STACK_FRAME)
   68
   69 .section        ".text",#alloc,#execinstr
   70
   71 #ifdef __PIC__
   72 SPARC_PIC_THUNK(%g1)
   73 #endif
   74
   75 .globl  poly1305_init
   76 .align  32
   77 poly1305_init:
   78         save    %sp,-STACK_FRAME-16,%sp
   79         nop
   80
   81         SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
   82         ld      [%g1],%g1
   83
   84         and     %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
   85         cmp     %g1,SPARCV9_FMADD
   86         be      .Lpoly1305_init_fma
   87         nop
   88
   89         stx     %g0,[$ctx+0]
   90         stx     %g0,[$ctx+8]            ! zero hash value
   91         brz,pn  $inp,.Lno_key
   92         stx     %g0,[$ctx+16]
   93
   94         and     $inp,7,$shr             ! alignment factor
   95         andn    $inp,7,$inp
   96         sll     $shr,3,$shr             ! *8
   97         neg     $shr,$shl
   98
   99         sethi   %hi(0x0ffffffc),$t0
  100         set     8,$h1
  101         or      $t0,%lo(0x0ffffffc),$t0
  102         set     16,$h2
  103         sllx    $t0,32,$t1
  104         or      $t0,$t1,$t1             ! 0x0ffffffc0ffffffc
  105         or      $t1,3,$t0               ! 0x0ffffffc0fffffff
  106
  107         ldxa    [$inp+%g0]0x88,$h0      ! load little-endian key
  108         brz,pt  $shr,.Lkey_aligned
  109         ldxa    [$inp+$h1]0x88,$h1
  110
  111         ldxa    [$inp+$h2]0x88,$h2
  112         srlx    $h0,$shr,$h0
  113         sllx    $h1,$shl,$t2
  114         srlx    $h1,$shr,$h1
  115         or      $t2,$h0,$h0
  116         sllx    $h2,$shl,$h2
  117         or      $h2,$h1,$h1
  118
  119 .Lkey_aligned:
  120         and     $t0,$h0,$h0
  121         and     $t1,$h1,$h1
  122         stx     $h0,[$ctx+32+0]         ! store key
  123         stx     $h1,[$ctx+32+8]
  124
  125         andcc   %g1,SPARCV9_VIS3,%g0
  126         be      .Lno_key
  127         nop
  128
  129 1:      call    .+8
  130         add     %o7,poly1305_blocks_vis3-1b,%o7
  131
  132         add     %o7,poly1305_emit-poly1305_blocks_vis3,%o5
  133         STPTR   %o7,[%i2]
  134         STPTR   %o5,[%i2+SIZE_T]
  135
  136         ret
  137         restore %g0,1,%o0               ! return 1
  138
  139 .Lno_key:
  140         ret
  141         restore %g0,%g0,%o0             ! return 0
  142 .size   poly1305_init,.-poly1305_init
  143
  144 .globl  poly1305_blocks
  145 .align  32
  146 poly1305_blocks:
  147         save    %sp,-STACK_FRAME,%sp
  148         andn    $len,15,$len
  149
  150         brz,pn  $len,.Lno_data
  151         nop
  152
  153         ld      [$ctx+32+0],$r1         ! load key
  154         ld      [$ctx+32+4],$r0
  155         ld      [$ctx+32+8],$r3
  156         ld      [$ctx+32+12],$r2
  157
  158         ld      [$ctx+0],$h1            ! load hash value
  159         ld      [$ctx+4],$h0
  160         ld      [$ctx+8],$h3
  161         ld      [$ctx+12],$h2
  162         ld      [$ctx+16],$h4
  163
  164         and     $inp,7,$shr             ! alignment factor
  165         andn    $inp,7,$inp
  166         set     8,$d1
  167         sll     $shr,3,$shr             ! *8
  168         set     16,$d2
  169         neg     $shr,$shl
  170
  171         srl     $r1,2,$s1
  172         srl     $r2,2,$s2
  173         add     $r1,$s1,$s1
  174         srl     $r3,2,$s3
  175         add     $r2,$s2,$s2
  176         add     $r3,$s3,$s3
  177
  178 .Loop:
  179         ldxa    [$inp+%g0]0x88,$d0      ! load little-endian input
  180         brz,pt  $shr,.Linp_aligned
  181         ldxa    [$inp+$d1]0x88,$d1
  182
  183         ldxa    [$inp+$d2]0x88,$d2
  184         srlx    $d0,$shr,$d0
  185         sllx    $d1,$shl,$t1
  186         srlx    $d1,$shr,$d1
  187         or      $t1,$d0,$d0
  188         sllx    $d2,$shl,$d2
  189         or      $d2,$d1,$d1
  190
  191 .Linp_aligned:
  192         srlx    $d0,32,$t0
  193         addcc   $d0,$h0,$h0             ! accumulate input
  194         srlx    $d1,32,$t1
  195         addccc  $t0,$h1,$h1
  196         addccc  $d1,$h2,$h2
  197         addccc  $t1,$h3,$h3
  198         addc    $padbit,$h4,$h4
  199
  200         umul    $r0,$h0,$d0
  201         umul    $r1,$h0,$d1
  202         umul    $r2,$h0,$d2
  203         umul    $r3,$h0,$d3
  204          sub    $len,16,$len
  205          add    $inp,16,$inp
  206
  207         umul    $s3,$h1,$t0
  208         umul    $r0,$h1,$t1
  209         umul    $r1,$h1,$t2
  210         add     $t0,$d0,$d0
  211         add     $t1,$d1,$d1
  212         umul    $r2,$h1,$t0
  213         add     $t2,$d2,$d2
  214         add     $t0,$d3,$d3
  215
  216         umul    $s2,$h2,$t1
  217         umul    $s3,$h2,$t2
  218         umul    $r0,$h2,$t0
  219         add     $t1,$d0,$d0
  220         add     $t2,$d1,$d1
  221         umul    $r1,$h2,$t1
  222         add     $t0,$d2,$d2
  223         add     $t1,$d3,$d3
  224
  225         umul    $s1,$h3,$t2
  226         umul    $s2,$h3,$t0
  227         umul    $s3,$h3,$t1
  228         add     $t2,$d0,$d0
  229         add     $t0,$d1,$d1
  230         umul    $r0,$h3,$t2
  231         add     $t1,$d2,$d2
  232         add     $t2,$d3,$d3
  233
  234         umul    $s1,$h4,$t0
  235         umul    $s2,$h4,$t1
  236         umul    $s3,$h4,$t2
  237         umul    $r0,$h4,$h4
  238         add     $t0,$d1,$d1
  239         add     $t1,$d2,$d2
  240         srlx    $d0,32,$h1
  241         add     $t2,$d3,$d3
  242         srlx    $d1,32,$h2
  243
  244         addcc   $d1,$h1,$h1
  245         srlx    $d2,32,$h3
  246          set    8,$d1
  247         addccc  $d2,$h2,$h2
  248         srlx    $d3,32,$t0
  249          set    16,$d2
  250         addccc  $d3,$h3,$h3
  251         addc    $t0,$h4,$h4
  252
  253         srl     $h4,2,$t0               ! final reduction step
  254         andn    $h4,3,$t1
  255         and     $h4,3,$h4
  256         add     $t1,$t0,$t0
  257
  258         addcc   $t0,$d0,$h0
  259         addccc  %g0,$h1,$h1
  260         addccc  %g0,$h2,$h2
  261         addccc  %g0,$h3,$h3
  262         brnz,pt $len,.Loop
  263         addc    %g0,$h4,$h4
  264
  265         st      $h1,[$ctx+0]            ! store hash value
  266         st      $h0,[$ctx+4]
  267         st      $h3,[$ctx+8]
  268         st      $h2,[$ctx+12]
  269         st      $h4,[$ctx+16]
  270
  271 .Lno_data:
  272         ret
  273         restore
  274 .size   poly1305_blocks,.-poly1305_blocks
  275 ___
  276 ########################################################################
  277 # VIS3 has umulxhi and addxc...
      #
      # poly1305_blocks_vis3 takes the same arguments as poly1305_blocks
      # (%i0 = context, %i1 = input, %i2 = length in bytes, %i3 = pad bit) but
      # keeps the state in 64-bit limbs: hash H0,H1 (ldx) plus H2 (32-bit ld),
      # key R0,R1, and S1 = R1 + (R1 >> 2).  Products are formed from
      # mulx/umulxhi pairs and summed with addcc/addxc/addxccc.
      #
      # The names declared below are lexical to this brace block; they reuse
      # %o and %g registers that the integer path also names.  $r1 and $r2
      # from the enclosing scope are used as the +8/+16 load offsets, and
      # $ctx/$inp/$len/$padbit/$shl/$shr are the enclosing scope's as well.
      #
      # The early exit branches to .Lno_data, the label emitted at the end of
      # poly1305_blocks.  "b .Loop_vis3" targets the label that immediately
      # follows it; the add in its delay slot completes S1.  As elsewhere,
      # the instruction after each branch/ret is a delay slot, so the order
      # of statements is significant.
  278 {
  279 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
  280 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
  281
  282 $code.=<<___;
  283 .align  32
  284 poly1305_blocks_vis3:
  285         save    %sp,-STACK_FRAME,%sp
  286         andn    $len,15,$len
  287
  288         brz,pn  $len,.Lno_data
  289         nop
  290
  291         ldx     [$ctx+32+0],$R0         ! load key
  292         ldx     [$ctx+32+8],$R1
  293
  294         ldx     [$ctx+0],$H0            ! load hash value
  295         ldx     [$ctx+8],$H1
  296         ld      [$ctx+16],$H2
  297
  298         and     $inp,7,$shr             ! alignment factor
  299         andn    $inp,7,$inp
  300         set     8,$r1
  301         sll     $shr,3,$shr             ! *8
  302         set     16,$r2
  303         neg     $shr,$shl
  304
  305         srlx    $R1,2,$S1
  306         b       .Loop_vis3
  307         add     $R1,$S1,$S1
  308
  309 .Loop_vis3:
  310         ldxa    [$inp+%g0]0x88,$D0      ! load little-endian input
  311         brz,pt  $shr,.Linp_aligned_vis3
  312         ldxa    [$inp+$r1]0x88,$D1
  313
  314         ldxa    [$inp+$r2]0x88,$D2
  315         srlx    $D0,$shr,$D0
  316         sllx    $D1,$shl,$T1
  317         srlx    $D1,$shr,$D1
  318         or      $T1,$D0,$D0
  319         sllx    $D2,$shl,$D2
  320         or      $D2,$D1,$D1
  321
  322 .Linp_aligned_vis3:
  323         addcc   $D0,$H0,$H0             ! accumulate input
  324          sub    $len,16,$len
  325         addxccc $D1,$H1,$H1
  326          add    $inp,16,$inp
  327
  328         mulx    $R0,$H0,$D0             ! r0*h0
  329         addxc   $padbit,$H2,$H2
  330         umulxhi $R0,$H0,$D1
  331         mulx    $S1,$H1,$T0             ! s1*h1
  332         umulxhi $S1,$H1,$T1
  333         addcc   $T0,$D0,$D0
  334         mulx    $R1,$H0,$T0             ! r1*h0
  335         addxc   $T1,$D1,$D1
  336         umulxhi $R1,$H0,$D2
  337         addcc   $T0,$D1,$D1
  338         mulx    $R0,$H1,$T0             ! r0*h1
  339         addxc   %g0,$D2,$D2
  340         umulxhi $R0,$H1,$T1
  341         addcc   $T0,$D1,$D1
  342         mulx    $S1,$H2,$T0             ! s1*h2
  343         addxc   $T1,$D2,$D2
  344         mulx    $R0,$H2,$T1             ! r0*h2
  345         addcc   $T0,$D1,$D1
  346         addxc   $T1,$D2,$D2
  347
  348         srlx    $D2,2,$T0               ! final reduction step
  349         andn    $D2,3,$T1
  350         and     $D2,3,$H2
  351         add     $T1,$T0,$T0
  352
  353         addcc   $T0,$D0,$H0
  354         addxccc %g0,$D1,$H1
  355         brnz,pt $len,.Loop_vis3
  356         addxc   %g0,$H2,$H2
  357
  358         stx     $H0,[$ctx+0]            ! store hash value
  359         stx     $H1,[$ctx+8]
  360         st      $H2,[$ctx+16]
  361
  362         ret
  363         restore
  364 .size   poly1305_blocks_vis3,.-poly1305_blocks_vis3
  365 ___
  366 }
      # poly1305_emit (%i0 = context, %i1 = output buffer, %i2 = nonce).
      # $mac and $nonce are simply new names for $inp (%i1) and $len (%i2).
      #   * loads h0..h4 using the same word order poly1305_blocks stores;
      #   * computes h + 5 into r0..r3 / h4 with a carry chain and tests bit 2
      #     of the top word; when it is set, movnz replaces h0..h3 with the
      #     h + 5 words;
      #   * adds the four 32-bit nonce words with a carry chain (the carry out
      #     of the last word is dropped);
      #   * writes the 16 result bytes one at a time, least significant byte
      #     of h0 first.
      # Each nonce load overwrites an r register right after the movnz that
      # last read it, so the interleaved instruction order is significant.
  367 my ($mac,$nonce) = ($inp,$len);
  368
  369 $code.=<<___;
  370 .globl  poly1305_emit
  371 .align  32
  372 poly1305_emit:
  373         save    %sp,-STACK_FRAME,%sp
  374
  375         ld      [$ctx+0],$h1            ! load hash value
  376         ld      [$ctx+4],$h0
  377         ld      [$ctx+8],$h3
  378         ld      [$ctx+12],$h2
  379         ld      [$ctx+16],$h4
  380
  381         addcc   $h0,5,$r0               ! compare to modulus
  382         addccc  $h1,0,$r1
  383         addccc  $h2,0,$r2
  384         addccc  $h3,0,$r3
  385         addc    $h4,0,$h4
  386         andcc   $h4,4,%g0               ! did it carry/borrow?
  387
  388         movnz   %icc,$r0,$h0
  389         ld      [$nonce+0],$r0          ! load nonce
  390         movnz   %icc,$r1,$h1
  391         ld      [$nonce+4],$r1
  392         movnz   %icc,$r2,$h2
  393         ld      [$nonce+8],$r2
  394         movnz   %icc,$r3,$h3
  395         ld      [$nonce+12],$r3
  396
  397         addcc   $r0,$h0,$h0             ! accumulate nonce
  398         addccc  $r1,$h1,$h1
  399         addccc  $r2,$h2,$h2
  400         addc    $r3,$h3,$h3
  401
  402         srl     $h0,8,$r0
  403         stb     $h0,[$mac+0]            ! store little-endian result
  404         srl     $h0,16,$r1
  405         stb     $r0,[$mac+1]
  406         srl     $h0,24,$r2
  407         stb     $r1,[$mac+2]
  408         stb     $r2,[$mac+3]
  409
  410         srl     $h1,8,$r0
  411         stb     $h1,[$mac+4]
  412         srl     $h1,16,$r1
  413         stb     $r0,[$mac+5]
  414         srl     $h1,24,$r2
  415         stb     $r1,[$mac+6]
  416         stb     $r2,[$mac+7]
  417
  418         srl     $h2,8,$r0
  419         stb     $h2,[$mac+8]
  420         srl     $h2,16,$r1
  421         stb     $r0,[$mac+9]
  422         srl     $h2,24,$r2
  423         stb     $r1,[$mac+10]
  424         stb     $r2,[$mac+11]
  425
  426         srl     $h3,8,$r0
  427         stb     $h3,[$mac+12]
  428         srl     $h3,16,$r1
  429         stb     $r0,[$mac+13]
  430         srl     $h3,24,$r2
  431         stb     $r1,[$mac+14]
  432         stb     $r2,[$mac+15]
  433
  434         ret
  435         restore
  436 .size   poly1305_emit,.-poly1305_emit
  437 ___
 438
 439 {
      # Register allocation for the FMA code path.  These lexicals live in the
      # enclosing brace block and shadow the outer names: $ctx, $inp, $len and
      # $padbit are again %i0..%i3, while $shr and $shl move to local
      # registers (see below).
  440 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
      # Integer scratch for input words: %o0..%o4.
  441 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
      # Only the first four of the eight generated names are kept, so
      # $i1,$step,$shr,$shl are %l0..%l3.
  442 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
      # $i2 is a second name for the register held by $step (%l1).
  443 my $i2=$step;
  444
      # Thirty-two even-numbered floating-point register names %f0,%f2,...,
      # %f62: "%f".2*$_ concatenates "%f" with 2*$_ because * binds tighter
      # than the string concatenation operator.
  445 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
  446     $two0,$two32,$two64,$two96,$two130,$five_two130,
  447     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
  448     $s2lo,$s2hi,$s3lo,$s3hi,
  449     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
  450 # borrowings
      # The names below do not get registers of their own; each one is an
      # alias for a c0..c3 register, so r3/s1, the x inputs and the y inputs
      # overlap the carry temporaries (and partly each other: $y0/$y1 share
      # registers with $s1lo/$s1hi, $y2/$y3 with $x3/$x2).  Note the order of
      # the last pair in the $y list: $y2 is $c3hi and $y3 is $c3lo.
  451 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
  452 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
  453 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
 454
      # FMA (floating-point) code path: poly1305_init_fma and
      # poly1305_blocks_fma.  Both find the constant table .Lconsts_fma
      # (defined outside this heredoc) PC-relatively through "call .+8" and
      # index it in 8-byte slots.  Expressions in backticks, e.g. `8*4+4`,
      # are left in the text here; presumably a later substitution in the
      # script evaluates them - confirm against the end of the file.
      #
      # poly1305_init_fma / .Lpoly1305_init_fma
      #   * the poly1305_init_fma entry only opens a frame and falls through
      #     to .Lpoly1305_init_fma, which is also where poly1305_init branches
      #     after its own identical "save";
      #   * table slots 0..3 are stored at ctx+0..31 as the initial hash
      #     ("biased 0"); a zero key pointer then returns 0;
      #   * %fsr is saved at LOCALS and replaced by table slot 6 while the
      #     arithmetic runs (NOTE(review): presumably this selects the
      #     rounding mode the hi/lo splitting relies on - confirm against the
      #     table contents);
      #   * the 16 key bytes are read little-endian (unaligned input is
      #     realigned by shifting), masked with 0x0fffffff / 0x0ffffffc, and
      #     written to the word at offset +4 of each of the four "template"
      #     doubles at ctx+8*4..8*7; subtracting the slot 0..3 constants then
      #     yields r0..r3;
      #   * s1..s3 are r1..r3 multiplied by the slot-5 constant; every r value
      #     (slots 7..10) and s value (slots 11..13) is split into a hi and a
      #     lo part by adding and then subtracting a constant;
      #   * results are stored as r0lo,r0hi,...,r3hi at ctx+8*4..8*11 and
      #     s1lo,s1hi,...,s3hi at ctx+8*12..8*17;
      #   * the addresses of poly1305_blocks_fma and poly1305_emit_fma are
      #     stored through %i2 and 1 is returned.
      #
      # poly1305_blocks_fma (%i0 = context, %i1 = input, %i2 = length in
      # bytes, %i3 = pad bit)
      #   * the frame has 48 extra bytes: LOCALS+8*0..8*3 hold the four-double
      #     input "template", LOCALS+8*4 the saved %fsr;
      #   * the length is turned into a block count (len >> 4); a count of 0
      #     returns through .Labort;
      #   * the pad bit is OR-ed into the word at offset +0 of the fourth
      #     template double, together with (1023+52+96)<<20;
      #   * the loop is software-pipelined ("modulo-scheduled input load"):
      #     the next block is loaded, realigned and written into the template
      #     while the current one is being multiplied.  movrz sets $step to 0
      #     once $len is 0, so the input pointer stops advancing;
      #   * $in4 carries the pre-shifted leftover doubleword from one unaligned
      #     load to the next;
      #   * s1 and r3 are reloaded from the context inside the loop because
      #     their registers are shared with the c0/c1 carry temporaries;
      #   * "bcc SIZE_T_CC" closes the loop on the carry produced by the
      #     previous subcc; SIZE_T_CC is not defined in this heredoc
      #     (presumably it comes from sparc_arch.h - confirm);
      #   * after the loop the "base 2^48 -> base 2^32" carry step is applied
      #     once more, the four limbs are re-biased with slots 0..3, %fsr is
      #     restored and the hash is stored at ctx+8*0..8*3.
      # Instructions following a branch, call or ret occupy its delay slot,
      # and the floating-point sequence depends on exact operation order, so
      # nothing inside the heredoc should be reordered.
  455 $code.=<<___;
  456 .align  32
  457 poly1305_init_fma:
  458         save    %sp,-STACK_FRAME-16,%sp
  459         nop
  460
  461 .Lpoly1305_init_fma:
  462 1:      call    .+8
  463         add     %o7,.Lconsts_fma-1b,%o7
  464
  465         ldd     [%o7+8*0],$two0                 ! load constants
  466         ldd     [%o7+8*1],$two32
  467         ldd     [%o7+8*2],$two64
  468         ldd     [%o7+8*3],$two96
  469         ldd     [%o7+8*5],$five_two130
  470
  471         std     $two0,[$ctx+8*0]                ! initial hash value, biased 0
  472         std     $two32,[$ctx+8*1]
  473         std     $two64,[$ctx+8*2]
  474         std     $two96,[$ctx+8*3]
  475
  476         brz,pn  $inp,.Lno_key_fma
  477         nop
  478
  479         stx     %fsr,[%sp+LOCALS]               ! save original %fsr
  480         ldx     [%o7+8*6],%fsr                  ! load new %fsr
  481
  482         std     $two0,[$ctx+8*4]                ! key "template"
  483         std     $two32,[$ctx+8*5]
  484         std     $two64,[$ctx+8*6]
  485         std     $two96,[$ctx+8*7]
  486
  487         and     $inp,7,$shr
  488         andn    $inp,7,$inp                     ! align pointer
  489         mov     8,$i1
  490         sll     $shr,3,$shr
  491         mov     16,$i2
  492         neg     $shr,$shl
  493
  494         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian key
  495         ldxa    [$inp+$i1]0x88,$in2
  496
  497         brz     $shr,.Lkey_aligned_fma
  498         sethi   %hi(0xf0000000),$i1             !   0xf0000000
  499
  500         ldxa    [$inp+$i2]0x88,$in4
  501
  502         srlx    $in0,$shr,$in0                  ! align data
  503         sllx    $in2,$shl,$in1
  504         srlx    $in2,$shr,$in2
  505         or      $in1,$in0,$in0
  506         sllx    $in4,$shl,$in3
  507         or      $in3,$in2,$in2
  508
  509 .Lkey_aligned_fma:
  510         or      $i1,3,$i2                       !   0xf0000003
  511         srlx    $in0,32,$in1
  512         andn    $in0,$i1,$in0                   ! &=0x0fffffff
  513         andn    $in1,$i2,$in1                   ! &=0x0ffffffc
  514         srlx    $in2,32,$in3
  515         andn    $in2,$i2,$in2
  516         andn    $in3,$i2,$in3
  517
  518         st      $in0,[$ctx+`8*4+4`]             ! fill "template"
  519         st      $in1,[$ctx+`8*5+4`]
  520         st      $in2,[$ctx+`8*6+4`]
  521         st      $in3,[$ctx+`8*7+4`]
  522
  523         ldd     [$ctx+8*4],$h0lo                ! load [biased] key
  524         ldd     [$ctx+8*5],$h1lo
  525         ldd     [$ctx+8*6],$h2lo
  526         ldd     [$ctx+8*7],$h3lo
  527
  528         fsubd   $h0lo,$two0, $h0lo              ! r0
  529          ldd    [%o7+8*7],$two0                 ! more constants
  530         fsubd   $h1lo,$two32,$h1lo              ! r1
  531          ldd    [%o7+8*8],$two32
  532         fsubd   $h2lo,$two64,$h2lo              ! r2
  533          ldd    [%o7+8*9],$two64
  534         fsubd   $h3lo,$two96,$h3lo              ! r3
  535          ldd    [%o7+8*10],$two96
  536
  537         fmuld   $five_two130,$h1lo,$s1lo        ! s1
  538         fmuld   $five_two130,$h2lo,$s2lo        ! s2
  539         fmuld   $five_two130,$h3lo,$s3lo        ! s3
  540
  541         faddd   $h0lo,$two0, $h0hi
  542         faddd   $h1lo,$two32,$h1hi
  543         faddd   $h2lo,$two64,$h2hi
  544         faddd   $h3lo,$two96,$h3hi
  545
  546         fsubd   $h0hi,$two0, $h0hi
  547          ldd    [%o7+8*11],$two0                ! more constants
  548         fsubd   $h1hi,$two32,$h1hi
  549          ldd    [%o7+8*12],$two32
  550         fsubd   $h2hi,$two64,$h2hi
  551          ldd    [%o7+8*13],$two64
  552         fsubd   $h3hi,$two96,$h3hi
  553
  554         fsubd   $h0lo,$h0hi,$h0lo
  555          std    $h0hi,[$ctx+8*5]                ! r0hi
  556         fsubd   $h1lo,$h1hi,$h1lo
  557          std    $h1hi,[$ctx+8*7]                ! r1hi
  558         fsubd   $h2lo,$h2hi,$h2lo
  559          std    $h2hi,[$ctx+8*9]                ! r2hi
  560         fsubd   $h3lo,$h3hi,$h3lo
  561          std    $h3hi,[$ctx+8*11]               ! r3hi
  562
  563         faddd   $s1lo,$two0, $s1hi
  564         faddd   $s2lo,$two32,$s2hi
  565         faddd   $s3lo,$two64,$s3hi
  566
  567         fsubd   $s1hi,$two0, $s1hi
  568         fsubd   $s2hi,$two32,$s2hi
  569         fsubd   $s3hi,$two64,$s3hi
  570
  571         fsubd   $s1lo,$s1hi,$s1lo
  572         fsubd   $s2lo,$s2hi,$s2lo
  573         fsubd   $s3lo,$s3hi,$s3lo
  574
  575         ldx     [%sp+LOCALS],%fsr               ! restore %fsr
  576
  577         std     $h0lo,[$ctx+8*4]                ! r0lo
  578         std     $h1lo,[$ctx+8*6]                ! r1lo
  579         std     $h2lo,[$ctx+8*8]                ! r2lo
  580         std     $h3lo,[$ctx+8*10]               ! r3lo
  581
  582         std     $s1hi,[$ctx+8*13]
  583         std     $s2hi,[$ctx+8*15]
  584         std     $s3hi,[$ctx+8*17]
  585
  586         std     $s1lo,[$ctx+8*12]
  587         std     $s2lo,[$ctx+8*14]
  588         std     $s3lo,[$ctx+8*16]
  589
  590         add     %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
  591         add     %o7,poly1305_emit_fma-.Lconsts_fma,%o1
  592         STPTR   %o0,[%i2]
  593         STPTR   %o1,[%i2+SIZE_T]
  594
  595         ret
  596         restore %g0,1,%o0                       ! return 1
  597
  598 .Lno_key_fma:
  599         ret
  600         restore %g0,%g0,%o0                     ! return 0
  601 .size   poly1305_init_fma,.-poly1305_init_fma
  602
  603 .align  32
  604 poly1305_blocks_fma:
  605         save    %sp,-STACK_FRAME-48,%sp
  606         srlx    $len,4,$len
  607
  608         brz,pn  $len,.Labort
  609         sub     $len,1,$len
  610
  611 1:      call    .+8
  612         add     %o7,.Lconsts_fma-1b,%o7
  613
  614         ldd     [%o7+8*0],$two0                 ! load constants
  615         ldd     [%o7+8*1],$two32
  616         ldd     [%o7+8*2],$two64
  617         ldd     [%o7+8*3],$two96
  618         ldd     [%o7+8*4],$two130
  619         ldd     [%o7+8*5],$five_two130
  620
  621         ldd     [$ctx+8*0],$h0lo                ! load [biased] hash value
  622         ldd     [$ctx+8*1],$h1lo
  623         ldd     [$ctx+8*2],$h2lo
  624         ldd     [$ctx+8*3],$h3lo
  625
  626         std     $two0,[%sp+LOCALS+8*0]          ! input "template"
  627         sethi   %hi((1023+52+96)<<20),$in3
  628         std     $two32,[%sp+LOCALS+8*1]
  629         or      $padbit,$in3,$in3
  630         std     $two64,[%sp+LOCALS+8*2]
  631         st      $in3,[%sp+LOCALS+8*3]
  632
  633         and     $inp,7,$shr
  634         andn    $inp,7,$inp                     ! align pointer
  635         mov     8,$i1
  636         sll     $shr,3,$shr
  637         mov     16,$step
  638         neg     $shr,$shl
  639
  640         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian input
  641         brz     $shr,.Linp_aligned_fma
  642         ldxa    [$inp+$i1]0x88,$in2
  643
  644         ldxa    [$inp+$step]0x88,$in4
  645         add     $inp,8,$inp
  646
  647         srlx    $in0,$shr,$in0                  ! align data
  648         sllx    $in2,$shl,$in1
  649         srlx    $in2,$shr,$in2
  650         or      $in1,$in0,$in0
  651         sllx    $in4,$shl,$in3
  652         srlx    $in4,$shr,$in4                  ! pre-shift
  653         or      $in3,$in2,$in2
  654
  655 .Linp_aligned_fma:
  656         srlx    $in0,32,$in1
  657         movrz   $len,0,$step
  658         srlx    $in2,32,$in3
  659         add     $step,$inp,$inp                 ! conditional advance
  660
  661         st      $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
  662         st      $in1,[%sp+LOCALS+8*1+4]
  663         st      $in2,[%sp+LOCALS+8*2+4]
  664         st      $in3,[%sp+LOCALS+8*3+4]
  665
  666         ldd     [$ctx+8*4],$r0lo                ! load key
  667         ldd     [$ctx+8*5],$r0hi
  668         ldd     [$ctx+8*6],$r1lo
  669         ldd     [$ctx+8*7],$r1hi
  670         ldd     [$ctx+8*8],$r2lo
  671         ldd     [$ctx+8*9],$r2hi
  672         ldd     [$ctx+8*10],$r3lo
  673         ldd     [$ctx+8*11],$r3hi
  674         ldd     [$ctx+8*12],$s1lo
  675         ldd     [$ctx+8*13],$s1hi
  676         ldd     [$ctx+8*14],$s2lo
  677         ldd     [$ctx+8*15],$s2hi
  678         ldd     [$ctx+8*16],$s3lo
  679         ldd     [$ctx+8*17],$s3hi
  680
  681         stx     %fsr,[%sp+LOCALS+8*4]           ! save original %fsr
  682         ldx     [%o7+8*6],%fsr                  ! load new %fsr
  683
  684         subcc   $len,1,$len
  685         movrz   $len,0,$step
  686
  687         ldd     [%sp+LOCALS+8*0],$x0            ! load biased input
  688         ldd     [%sp+LOCALS+8*1],$x1
  689         ldd     [%sp+LOCALS+8*2],$x2
  690         ldd     [%sp+LOCALS+8*3],$x3
  691
  692         fsubd   $h0lo,$two0, $h0lo              ! de-bias hash value
  693         fsubd   $h1lo,$two32,$h1lo
  694          ldxa   [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
  695         fsubd   $h2lo,$two64,$h2lo
  696         fsubd   $h3lo,$two96,$h3lo
  697          ldxa   [$inp+$i1]0x88,$in2
  698
  699         fsubd   $x0,$two0, $x0                  ! de-bias input
  700         fsubd   $x1,$two32,$x1
  701         fsubd   $x2,$two64,$x2
  702         fsubd   $x3,$two96,$x3
  703
  704         brz     $shr,.Linp_aligned_fma2
  705         add     $step,$inp,$inp                 ! conditional advance
  706
  707         sllx    $in0,$shl,$in1                  ! align data
  708         srlx    $in0,$shr,$in3
  709         or      $in1,$in4,$in0
  710         sllx    $in2,$shl,$in1
  711         srlx    $in2,$shr,$in4                  ! pre-shift
  712         or      $in3,$in1,$in2
  713 .Linp_aligned_fma2:
  714         srlx    $in0,32,$in1
  715         srlx    $in2,32,$in3
  716
  717         faddd   $h0lo,$x0,$x0                   ! accumulate input
  718          stw    $in0,[%sp+LOCALS+8*0+4]
  719         faddd   $h1lo,$x1,$x1
  720          stw    $in1,[%sp+LOCALS+8*1+4]
  721         faddd   $h2lo,$x2,$x2
  722          stw    $in2,[%sp+LOCALS+8*2+4]
  723         faddd   $h3lo,$x3,$x3
  724          stw    $in3,[%sp+LOCALS+8*3+4]
  725
  726         b       .Lentry_fma
  727         nop
  728
  729 .align  16
  730 .Loop_fma:
  731         ldxa    [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
  732         ldxa    [$inp+$i1]0x88,$in2
  733         movrz   $len,0,$step
  734
  735         faddd   $y0,$h0lo,$h0lo                 ! accumulate input
  736         faddd   $y1,$h0hi,$h0hi
  737         faddd   $y2,$h2lo,$h2lo
  738         faddd   $y3,$h2hi,$h2hi
  739
  740         brz,pn  $shr,.Linp_aligned_fma3
  741         add     $step,$inp,$inp                 ! conditional advance
  742
  743         sllx    $in0,$shl,$in1                  ! align data
  744         srlx    $in0,$shr,$in3
  745         or      $in1,$in4,$in0
  746         sllx    $in2,$shl,$in1
  747         srlx    $in2,$shr,$in4                  ! pre-shift
  748         or      $in3,$in1,$in2
  749
  750 .Linp_aligned_fma3:
  751         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
  752         faddd   $two64,$h1lo,$c1lo
  753          srlx   $in0,32,$in1
  754         faddd   $two64,$h1hi,$c1hi
  755          srlx   $in2,32,$in3
  756         faddd   $two130,$h3lo,$c3lo
  757          st     $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
  758         faddd   $two130,$h3hi,$c3hi
  759          st     $in1,[%sp+LOCALS+8*1+4]
  760         faddd   $two32,$h0lo,$c0lo
  761          st     $in2,[%sp+LOCALS+8*2+4]
  762         faddd   $two32,$h0hi,$c0hi
  763          st     $in3,[%sp+LOCALS+8*3+4]
  764         faddd   $two96,$h2lo,$c2lo
  765         faddd   $two96,$h2hi,$c2hi
  766
  767         fsubd   $c1lo,$two64,$c1lo
  768         fsubd   $c1hi,$two64,$c1hi
  769         fsubd   $c3lo,$two130,$c3lo
  770         fsubd   $c3hi,$two130,$c3hi
  771         fsubd   $c0lo,$two32,$c0lo
  772         fsubd   $c0hi,$two32,$c0hi
  773         fsubd   $c2lo,$two96,$c2lo
  774         fsubd   $c2hi,$two96,$c2hi
  775
  776         fsubd   $h1lo,$c1lo,$h1lo
  777         fsubd   $h1hi,$c1hi,$h1hi
  778         fsubd   $h3lo,$c3lo,$h3lo
  779         fsubd   $h3hi,$c3hi,$h3hi
  780         fsubd   $h2lo,$c2lo,$h2lo
  781         fsubd   $h2hi,$c2hi,$h2hi
  782         fsubd   $h0lo,$c0lo,$h0lo
  783         fsubd   $h0hi,$c0hi,$h0hi
  784
  785         faddd   $h1lo,$c0lo,$h1lo
  786         faddd   $h1hi,$c0hi,$h1hi
  787         faddd   $h3lo,$c2lo,$h3lo
  788         faddd   $h3hi,$c2hi,$h3hi
  789         faddd   $h2lo,$c1lo,$h2lo
  790         faddd   $h2hi,$c1hi,$h2hi
  791         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
  792         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
  793
  794         faddd   $h1lo,$h1hi,$x1
  795          ldd    [$ctx+8*12],$s1lo               ! reload constants
  796         faddd   $h3lo,$h3hi,$x3
  797          ldd    [$ctx+8*13],$s1hi
  798         faddd   $h2lo,$h2hi,$x2
  799          ldd    [$ctx+8*10],$r3lo
  800         faddd   $h0lo,$h0hi,$x0
  801          ldd    [$ctx+8*11],$r3hi
  802
  803 .Lentry_fma:
  804         fmuld   $x1,$s3lo,$h0lo
  805         fmuld   $x1,$s3hi,$h0hi
  806         fmuld   $x1,$r1lo,$h2lo
  807         fmuld   $x1,$r1hi,$h2hi
  808         fmuld   $x1,$r0lo,$h1lo
  809         fmuld   $x1,$r0hi,$h1hi
  810         fmuld   $x1,$r2lo,$h3lo
  811         fmuld   $x1,$r2hi,$h3hi
  812
  813         fmaddd  $x3,$s1lo,$h0lo,$h0lo
  814         fmaddd  $x3,$s1hi,$h0hi,$h0hi
  815         fmaddd  $x3,$s3lo,$h2lo,$h2lo
  816         fmaddd  $x3,$s3hi,$h2hi,$h2hi
  817         fmaddd  $x3,$s2lo,$h1lo,$h1lo
  818         fmaddd  $x3,$s2hi,$h1hi,$h1hi
  819         fmaddd  $x3,$r0lo,$h3lo,$h3lo
  820         fmaddd  $x3,$r0hi,$h3hi,$h3hi
  821
  822         fmaddd  $x2,$s2lo,$h0lo,$h0lo
  823         fmaddd  $x2,$s2hi,$h0hi,$h0hi
  824         fmaddd  $x2,$r0lo,$h2lo,$h2lo
  825         fmaddd  $x2,$r0hi,$h2hi,$h2hi
  826         fmaddd  $x2,$s3lo,$h1lo,$h1lo
  827          ldd    [%sp+LOCALS+8*0],$y0            ! load [biased] input
  828         fmaddd  $x2,$s3hi,$h1hi,$h1hi
  829          ldd    [%sp+LOCALS+8*1],$y1
  830         fmaddd  $x2,$r1lo,$h3lo,$h3lo
  831          ldd    [%sp+LOCALS+8*2],$y2
  832         fmaddd  $x2,$r1hi,$h3hi,$h3hi
  833          ldd    [%sp+LOCALS+8*3],$y3
  834
  835         fmaddd  $x0,$r0lo,$h0lo,$h0lo
  836          fsubd  $y0,$two0, $y0                  ! de-bias input
  837         fmaddd  $x0,$r0hi,$h0hi,$h0hi
  838          fsubd  $y1,$two32,$y1
  839         fmaddd  $x0,$r2lo,$h2lo,$h2lo
  840          fsubd  $y2,$two64,$y2
  841         fmaddd  $x0,$r2hi,$h2hi,$h2hi
  842          fsubd  $y3,$two96,$y3
  843         fmaddd  $x0,$r1lo,$h1lo,$h1lo
  844         fmaddd  $x0,$r1hi,$h1hi,$h1hi
  845         fmaddd  $x0,$r3lo,$h3lo,$h3lo
  846         fmaddd  $x0,$r3hi,$h3hi,$h3hi
  847
  848         bcc     SIZE_T_CC,.Loop_fma
  849         subcc   $len,1,$len
  850
  851         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
  852         faddd   $h0lo,$two32,$c0lo
  853         faddd   $h0hi,$two32,$c0hi
  854         faddd   $h2lo,$two96,$c2lo
  855         faddd   $h2hi,$two96,$c2hi
  856         faddd   $h1lo,$two64,$c1lo
  857         faddd   $h1hi,$two64,$c1hi
  858         faddd   $h3lo,$two130,$c3lo
  859         faddd   $h3hi,$two130,$c3hi
  860
  861         fsubd   $c0lo,$two32,$c0lo
  862         fsubd   $c0hi,$two32,$c0hi
  863         fsubd   $c2lo,$two96,$c2lo
  864         fsubd   $c2hi,$two96,$c2hi
  865         fsubd   $c1lo,$two64,$c1lo
  866         fsubd   $c1hi,$two64,$c1hi
  867         fsubd   $c3lo,$two130,$c3lo
  868         fsubd   $c3hi,$two130,$c3hi
  869
  870         fsubd   $h1lo,$c1lo,$h1lo
  871         fsubd   $h1hi,$c1hi,$h1hi
  872         fsubd   $h3lo,$c3lo,$h3lo
  873         fsubd   $h3hi,$c3hi,$h3hi
  874         fsubd   $h2lo,$c2lo,$h2lo
  875         fsubd   $h2hi,$c2hi,$h2hi
  876         fsubd   $h0lo,$c0lo,$h0lo
  877         fsubd   $h0hi,$c0hi,$h0hi
  878
  879         faddd   $h1lo,$c0lo,$h1lo
  880         faddd   $h1hi,$c0hi,$h1hi
  881         faddd   $h3lo,$c2lo,$h3lo
  882         faddd   $h3hi,$c2hi,$h3hi
  883         faddd   $h2lo,$c1lo,$h2lo
  884         faddd   $h2hi,$c1hi,$h2hi
  885         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
  886         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
  887
  888         faddd   $h1lo,$h1hi,$x1
  889         faddd   $h3lo,$h3hi,$x3
  890         faddd   $h2lo,$h2hi,$x2
  891         faddd   $h0lo,$h0hi,$x0
  892
  893         faddd   $x1,$two32,$x1                  ! bias
  894         faddd   $x3,$two96,$x3
  895         faddd   $x2,$two64,$x2
  896         faddd   $x0,$two0, $x0
  897
  898         ldx     [%sp+LOCALS+8*4],%fsr           ! restore saved %fsr
  899
  900         std     $x1,[$ctx+8*1]                  ! store [biased] hash value
  901         std     $x3,[$ctx+8*3]
  902         std     $x2,[$ctx+8*2]
  903         std     $x0,[$ctx+8*0]
  904
  905 .Labort:
  906         ret
  907         restore
  908 .size   poly1305_blocks_fma,.-poly1305_blocks_fma
  909 ___
 910 {
     # Generate the poly1305_emit_fma routine and append it to $code.
     # Going by the inline comments of the emitted assembly, the routine
     # loads the hash value from $ctx, masks off the exponent bits,
     # completes the partial reduction, compares the result to the modulus,
     # adds the nonce and stores the 16-byte result to $mac one byte at a
     # time in little-endian order.
     #
     # $mac and $nonce are just new names for $inp and $len, i.e. they
     # denote the same registers.  $inp, $len, $ctx and $padbit are defined
     # outside this block; $padbit is only used as a scratch register here.
     # NOTE(review): presumably $inp/$len are the second and third argument
     # registers -- confirm against their definition earlier in the file.
 911 my ($mac,$nonce)=($inp,$len);
 912
     # map("%l$_",(0..5)) yields six names, so $h0..$h4 receive %l0..%l4
     # and $d0 receives %l5; $d1,$d2,$d3,$mask receive %o0..%o3.  The fifth
     # %o name (%o4) has no variable left to land in and is dropped.
 913 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
 914    ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
 915
     # Everything up to the ___ terminator is emitted text: the heredoc
     # interpolates the register variables above, and "!" starts an
     # assembler comment.
 916 $code.=<<___;
 917 .align  32
 918 poly1305_emit_fma:
 919         save    %sp,-STACK_FRAME,%sp
 920
 921         ld      [$ctx+8*0+0],$d0                ! load hash
 922         ld      [$ctx+8*0+4],$h0
 923         ld      [$ctx+8*1+0],$d1
 924         ld      [$ctx+8*1+4],$h1
 925         ld      [$ctx+8*2+0],$d2
 926         ld      [$ctx+8*2+4],$h2
 927         ld      [$ctx+8*3+0],$d3
 928         ld      [$ctx+8*3+4],$h3
 929
 930         sethi   %hi(0xfff00000),$mask
 931         andn    $d0,$mask,$d0                   ! mask exponent
 932         andn    $d1,$mask,$d1
 933         andn    $d2,$mask,$d2
 934         andn    $d3,$mask,$d3                   ! can be partially reduced...
 935         mov     3,$mask
 936
 937         srl     $d3,2,$padbit                   ! ... so reduce
 938         and     $d3,$mask,$h4
 939         andn    $d3,$mask,$d3
 940         add     $padbit,$d3,$d3
 941
 942         addcc   $d3,$h0,$h0
 943         addccc  $d0,$h1,$h1
 944         addccc  $d1,$h2,$h2
 945         addccc  $d2,$h3,$h3
 946         addc    %g0,$h4,$h4
 947
 948         addcc   $h0,5,$d0                       ! compare to modulus
 949         addccc  $h1,0,$d1
 950         addccc  $h2,0,$d2
 951         addccc  $h3,0,$d3
 952         addc    $h4,0,$mask
 953
 954         srl     $mask,2,$mask                   ! did it carry/borrow?
 955         neg     $mask,$mask
 956         sra     $mask,31,$mask                  ! mask
 957
 958         andn    $h0,$mask,$h0
 959         and     $d0,$mask,$d0
 960         andn    $h1,$mask,$h1
 961         and     $d1,$mask,$d1
 962         or      $d0,$h0,$h0
 963         ld      [$nonce+0],$d0                  ! load nonce
 964         andn    $h2,$mask,$h2
 965         and     $d2,$mask,$d2
 966         or      $d1,$h1,$h1
 967         ld      [$nonce+4],$d1
 968         andn    $h3,$mask,$h3
 969         and     $d3,$mask,$d3
 970         or      $d2,$h2,$h2
 971         ld      [$nonce+8],$d2
 972         or      $d3,$h3,$h3
 973         ld      [$nonce+12],$d3
 974
 975         addcc   $d0,$h0,$h0                     ! accumulate nonce
 976         addccc  $d1,$h1,$h1
 977         addccc  $d2,$h2,$h2
 978         addc    $d3,$h3,$h3
 979
 980         stb     $h0,[$mac+0]                    ! write little-endian result
 981         srl     $h0,8,$h0
 982         stb     $h1,[$mac+4]
 983         srl     $h1,8,$h1
 984         stb     $h2,[$mac+8]
 985         srl     $h2,8,$h2
 986         stb     $h3,[$mac+12]
 987         srl     $h3,8,$h3
 988
 989         stb     $h0,[$mac+1]
 990         srl     $h0,8,$h0
 991         stb     $h1,[$mac+5]
 992         srl     $h1,8,$h1
 993         stb     $h2,[$mac+9]
 994         srl     $h2,8,$h2
 995         stb     $h3,[$mac+13]
 996         srl     $h3,8,$h3
 997
 998         stb     $h0,[$mac+2]
 999         srl     $h0,8,$h0
1000         stb     $h1,[$mac+6]
1001         srl     $h1,8,$h1
1002         stb     $h2,[$mac+10]
1003         srl     $h2,8,$h2
1004         stb     $h3,[$mac+14]
1005         srl     $h3,8,$h3
1006
1007         stb     $h0,[$mac+3]
1008         stb     $h1,[$mac+7]
1009         stb     $h2,[$mac+11]
1010         stb     $h3,[$mac+15]
1011
1012         ret
1013         restore
1014 .size   poly1305_emit_fma,.-poly1305_emit_fma
1015 ___
1016 }
1017
# Append the .Lconsts_fma data table and the module's identification string
# to $code.  Apart from the %fsr entry, each ".word hi,lo" pair is the bit
# pattern of one double-precision constant (high word first); the trailing
# "!" comments give its value as a power of two.
# NOTE(review): the code that reads this table lies outside this excerpt
# and presumably addresses the entries by offset from .Lconsts_fma, so the
# order and count of entries are likely significant -- confirm before
# rearranging anything.
# The "\@" in the .asciz line stops Perl from interpolating "@openssl" as
# an array inside the heredoc; the emitted text contains a plain "@".
1018 $code.=<<___;
1019 .align  64
1020 .Lconsts_fma:
1021 .word   0x43300000,0x00000000           ! 2^(52+0)
1022 .word   0x45300000,0x00000000           ! 2^(52+32)
1023 .word   0x47300000,0x00000000           ! 2^(52+64)
1024 .word   0x49300000,0x00000000           ! 2^(52+96)
1025 .word   0x4b500000,0x00000000           ! 2^(52+130)
1026
1027 .word   0x37f40000,0x00000000           ! 5/2^130
1028 .word   0,1<<30                         ! fsr: truncate, no exceptions
1029
1030 .word   0x44300000,0x00000000           ! 2^(52+16+0)
1031 .word   0x46300000,0x00000000           ! 2^(52+16+32)
1032 .word   0x48300000,0x00000000           ! 2^(52+16+64)
1033 .word   0x4a300000,0x00000000           ! 2^(52+16+96)
1034 .word   0x3e300000,0x00000000           ! 2^(52+16+0-96)
1035 .word   0x40300000,0x00000000           ! 2^(52+16+32-96)
1036 .word   0x42300000,0x00000000           ! 2^(52+16+64-96)
1037 .asciz  "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1038 .align  4
1039 ___
1040 }
1041 \f
1042 # The purpose of these subroutines is to explicitly encode VIS3 and FMA
1043 # instructions, so that one can compile the module without having to
1044 # specify those extensions on the compiler command line, e.g. -xarch=v9
1045 # vs. -xarch=v9a. The idea is to keep the option of producing a "universal"
1046 # binary and let the programmer detect CPU capabilities at run-time.
# unvis3($mnemonic, $rs1, $rs2, $rd)
#
# Hand-encode one VIS3 integer instruction as a raw ".word" directive.
#
#   $mnemonic       - instruction name; only addxc, addxccc and umulxhi
#                     are encoded
#   $rs1, $rs2, $rd - integer register operands such as "%g1" or "%o5"
#
# Returns ".word\t0x<opcode> !<instruction>" when the mnemonic is known and
# all three operands are well-formed %g/%o/%l/%i registers numbered 0-7.
# In every other case the instruction is handed back untouched as
# "<mnemonic>\t<rs1>,<rs2>,<rd>", leaving it to the assembler to accept or
# reject it.
sub unvis3 {
    my ($mnemonic,$rs1,$rs2,$rd)=@_;

    # Register window offsets: globals, outs, locals, ins.
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
    my %visopf = (  "addxc"     => 0x011,
                    "addxccc"   => 0x013,
                    "umulxhi"   => 0x016 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref if (!$opf);

    my @regno;
    foreach my $reg ($rs1,$rs2,$rd) {
        # Anchored and limited to 0-7 on purpose: an unanchored [0-9]
        # match would turn "%o8" into register 16 (%l0) and "%g10" into
        # %g1, silently emitting an instruction on the wrong register.
        return $ref if ($reg !~ /\A%([goli])([0-7])\z/);
        push @regno, $bias{$1}+$2;
    }
    my ($n1,$n2,$nd) = @regno;

    # Field layout: rd at bit 25, rs1 at bit 14, opf at bit 5, rs2 at bit 0.
    return sprintf ".word\t0x%08x !%s",
                   0x81b00000|$nd<<25|$n1<<14|$opf<<5|$n2,
                   $ref;
}
1070
# unfma($mnemonic, $rs1, $rs2, $rs3, $rd)
#
# Hand-encode one fused multiply-add instruction as a raw ".word" directive.
#
#   $mnemonic             - instruction name; fmadds, fmaddd, fmsubs and
#                           fmsubd are encoded
#   $rs1,$rs2,$rs3,$rd    - floating-point register operands such as "%f4"
#
# Returns ".word\t0x<opcode> !<instruction>" when the mnemonic is known and
# every operand is a well-formed %f register.  Registers %f32 and above
# must be even-numbered and are folded into the 5-bit field by moving bit 5
# of the register number into bit 0.  In every other case the instruction
# is handed back untouched as "<mnemonic>\t<rs1>,<rs2>,<rs3>,<rd>", leaving
# it to the assembler to accept or reject it.
sub unfma {
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;

    my %fmaopf = (  "fmadds"    => 0x1,
                    "fmaddd"    => 0x2,
                    "fmsubs"    => 0x5,
                    "fmsubd"    => 0x6 );

    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref if (!$opf);

    my @field;
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
        # Anchored so that "%f100" is not silently read as "%f10".
        return $ref if ($reg !~ /\A%f([0-9]{1,2})\z/);
        my $num = $1 + 0;

        # Numbers from 64 up would otherwise be folded by the "& 31" below
        # onto a low register (64 -> 2), encoding the wrong operand.
        return $ref if ($num >= 64);

        if ($num >= 32) {
            return $ref if ($num & 1);
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        push @field, $num;
    }
    my ($f1,$f2,$f3,$fd) = @field;

    # Field layout: rd at bit 25, rs1 at bit 14, rs3 at bit 9, opf at
    # bit 5, rs2 at bit 0.
    return sprintf ".word\t0x%08x !%s",
                   0x81b80000|$fd<<25|$f1<<14|$f3<<9|$opf<<5|$f2,
                   $ref;
}
1099
# Post-process the accumulated assembly text line by line and write it to
# STDOUT:
#   1. every `...` span is replaced by the result of evaluating its content
#      as Perl;
#   2. umulxhi/addxc* instructions on %g/%o/%l/%i registers are passed to
#      unvis3(); only if no such instruction was found on the line are
#      fmadds/fmaddd instructions on %f registers passed to unfma().
# $_ is deliberately kept as the loop variable so that evaluated snippets
# see the same topic variable as before.
foreach (split("\n",$code)) {
    s/\`([^\`]*)\`/eval $1/ge;

    s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
        unvis3($1,$2,$3,$4)
     /ge    or
    s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
        unfma($1,$2,$3,$4,$5)
     /ge;

    print $_,"\n";
}

# Output is buffered, so a failed write (full disk, closed pipe) may only
# be reported here.  Fail loudly rather than leave a truncated assembly
# file behind with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";