crypto/bn/asm/sparcv9a-mont.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. Rights for redistribution and usage in source and binary
   6 # forms are granted according to the OpenSSL license.
   7 # ====================================================================
   8
   9 # "Teaser" Montgomery multiplication module for UltraSPARC. Why FPU?
  10 # Because unlike integer multiplier, which simply stalls whole CPU,
  11 # FPU is fully pipelined and can effectively emit 48 bit partial
  12 # product every cycle. Why not blended SPARC v9? One can argue that
  13 # making this module dependent on UltraSPARC VIS extension limits its
  14 # binary compatibility. Very well may be, but the simple fact is that
  15 # there is no known SPARC v9 implementation, which does not implement
  16 # VIS. Even brand new Fujitsu's SPARC64 V is equipped with VIS unit.
  17
  18 # USI&II cores currently exhibit uniform 2x improvement [over pre-
  19 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
  20 # performance improves a few percent for shorter keys and worsens a few
  21 # percent for longer keys. This is because USIII integer multiplier
  22 # is >3x faster than USI&II one, which is harder to match [but see
  23 # TODO list below]. It should also be noted that SPARC64 V features
  24 # out-of-order execution, which *might* mean that integer multiplier
  25 # is pipelined, which in turn *might* be impossible to match...
  26 #
  27 # TODO:
  28 # - complete 32-bit adaptation (requires universal changes to
  29 #   BN_MONT_CTX and bn_mul_mont prototype, but nothing really
  30 #   unmanageable:-);
  31 # - modulo-schedule inner loop for better performance (on in-order
  32 #   execution core such as UltraSPARC this shall result in further
  33 #   noticeable(!) improvement);
  34 # - dedicated squaring procedure[?];
  35
  36 $fname="bn_mul_mont";   # symbol name of the generated routine
  37 $bits=32;               # 32-bit ABI unless one of the flags below says otherwise
  38 for (@ARGV) {           # arguments are expected to be the C compiler flags
  39         $bits=64    if (/\-m64/        || /\-xarch\=v9/);
             # VIS is selected by -mcpu=ultra* or by an -xarch value that carries
             # a suffix, e.g. v9a or v8plusa. The alternation must be a group:
             # spelled as the character class [9|8plus] it also accepted plain
             # -xarch=v8plus, because "8" followed by "p" satisfied [..]\S.
  40         $vis=1      if (/\-mcpu=ultra/ || /\-xarch\=v(?:9|8plus)\S/);
  41 }
  42
  43 if (!$vis || $bits==32) {       # 32-bit is not supported just yet...
     # Emit a stub that merely returns 0, so that the symbol still exists
     # but reports itself as not implemented, then stop.
  44 print<<___;
  45 .section        ".text",#alloc,#execinstr
  46 .global $fname
  47 $fname:
  48         retl
  49         xor     %o0,%o0,%o0     ! just signal "not implemented"
  50 .type   $fname,#function
  51 .size   $fname,(.-$fname)
  52 ___
     # Close explicitly so that a failed write is reported instead of
     # leaving a silently truncated assembler file behind.
     close STDOUT or die "error closing STDOUT: $!";
  53 exit;
  54 }
  55
  56 # Stack bias and minimal frame size are picked according to the ABI.
  57 ($bias,$frame) = ($bits==64)
  58         ? (2047,192)
  59         : (0,128);      # 96 rounded up to largest known cache-line
  60
  61 # Size of the scratch area addressed as %sp+bias+frame+N further down;
  62 # the generated code uses offsets 0 through 48 within it.
  63 $locals=64;
  64
  65 # To cater for both the 32- and the 64-bit ABI, integers wider than 32
  66 # bit live only in %g1-%g4 and %o0-%o5, whereas %l0-%l7 and %i0-%i5
  67 # carry nothing but pointers, indexes and other small values...
  68 # int bn_mul_mont(
  69 $rp='%i0';      # BN_ULONG *rp,
  70 $ap='%i1';      # const BN_ULONG *ap,
  71 $bp='%i2';      # const BN_ULONG *bp,
  72 $np='%i3';      # const BN_ULONG *np,
  73 $n0='%i4';      # BN_ULONG n0,
  74 $num='%i5';     # int num);
  75
  76 $tp='%l0';      # running pointer into the tp[] vector on the stack
  77 $ap_l='%l1';    # a[num] and n[num] are split into 32-bit halves, which
  78 $ap_h='%l2';    # are kept in these four vectors as double-precision FP
  79 $np_l='%l3';    # values. That spares the second loop a bunch of fxtods
  80 $np_h='%l4';    # and minimizes L1-cache aliasing...
  81 $i='%l5';       # outer index, counts from -num*8 up to zero
  82 $j='%l6';       # inner index, counts from -num*8 up to zero
  83 $mask='%l7';    # 16-bit mask, 0xffff
  84
  85 $n0='%g4';      # reassigned!!! the code copies %i4 here on entry
  86 $carry='%i4';   # reassigned!!! [only 1 bit is used]
  87
  88 # FP register naming chart
  89 #
  90 #     ..HILO
  91 #       dcba
  92 #   --------
  93 #        LOa
  94 #       LOb
  95 #      LOc
  96 #     LOd
  97 #      HIa
  98 #     HIb
  99 #    HIc
 100 #   HId
 101 #    ..a
 102 #   ..b
 103 ($ba,$bb,$bc,$bd)         = map("%f$_",(0,2,4,6));       # b[i] as 4x16-bit values
 104 ($na,$nb,$nc,$nd)         = map("%f$_",(8,10,12,14));    # the n0 multiple as 4x16-bit values
 105 ($alo,$alo_,$ahi,$ahi_)   = map("%f$_",(16..19));        # a[j] as pair of 32-bit words
 106 ($nlo,$nlo_,$nhi,$nhi_)   = map("%f$_",(20..23));        # n[j] as pair of 32-bit words
 107
 108 ($dota,$dotb)             = map("%f$_",(24,26));         # HIc and HId sums, added in at the next word
 109
 110 ($aloa,$alob,$aloc,$alod) = map("%f$_",(32,34,36,38));   # alo times ba..bd
 111 ($ahia,$ahib,$ahic,$ahid) = map("%f$_",(40,42,44,46));   # ahi times ba..bd
 112 ($nloa,$nlob,$nloc,$nlod) = map("%f$_",(48,50,52,54));   # nlo times na..nd
 113 ($nhia,$nhib,$nhic,$nhid) = map("%f$_",(56,58,60,62));   # nhi times na..nd
 114
 115 $ASI_FL16_P=0xD2;       # magic ASI value to engage 16-bit FP load
 116
 117 $code=<<___;    # assembler source of $fname proper; printed at the end of the script
 118 .ident          "UltraSPARC Montgomery multiply by <appro\@fy.chalmers.se>"
 119 .section        ".text",#alloc,#execinstr
 120
 121 .global $fname
 122 .align  32
 123 $fname:
             cmp     %o5,2                   ! num is still in %o5 before save; with
             bge,pt  %icc,.Lenter            ! num<2 the j index would reach .L1st as
             nop                             ! zero and that loop would never terminate
             retl
             xor     %o0,%o0,%o0             ! just signal "not implemented"
     .Lenter:
 124         save    %sp,-$frame,%sp
 125         sethi   %hi(0xffff),$mask
 126         sll     $num,3,$num             ! num*=8
 127         or      $mask,%lo(0xffff),$mask
 128         mov     %i4,$n0                 ! reassigned, remember?
 129         ! room for five vectors of num 64-bit slots: tp[] and [an]p_[lh]
 130         add     %sp,$bias,%o0           ! real top of stack
 131         sll     $num,2,%o1
 132         add     %o1,$num,%o1            ! %o1=num*5
 133         sub     %o0,%o1,%o0
 134         sub     %o0,$locals,%o0
 135         and     %o0,-2048,%o0           ! optimize TLB utilization
 136         sub     %o0,$bias,%sp           ! alloca
 137
 138         rd      %asi,%o7                ! caller's %asi, restored on exit
 139         add     %sp,$bias+$frame+$locals,$tp    ! tp[] sits right above the locals
 140         add     $tp,$num,$ap_l
 141         add     $ap_l,$num,$ap_l        ! [an]p_[lh] point at the vector ends !
 142         add     $ap_l,$num,$ap_h
 143         add     $ap_h,$num,$np_l
 144         add     $np_l,$num,$np_h
 145
 146         wr      %g0,$ASI_FL16_P,%asi    ! setup %asi for 16-bit FP loads
 147
 148         add     $rp,$num,$rp            ! readjust input pointers to point
 149         add     $ap,$num,$ap            ! at the ends too...
 150         add     $bp,$num,$bp
 151         add     $np,$num,$np
 152
 153         stx     %o7,[%sp+$bias+$frame+48]       ! park caller's %asi in the locals
 154 \f
 155         sub     %g0,$num,$i             ! i=-num
 156         sub     %g0,$num,$j             ! j=-num
 157
 158         add     $ap,$j,%o3
 159         add     $bp,$i,%o4
 160         ldx     [$bp+$i],%o0            ! bp[0]
 161         add     $np,$j,%o5
 162         add     %sp,$bias+$frame+0,%o7  ! scratch slot in the locals
 163         ldx     [$ap+$j],%o1            ! ap[0]
 164
 165         mulx    %o1,%o0,%o0             ! ap[0]*bp[0]
 166         mulx    $n0,%o0,%o0             ! ap[0]*bp[0]*n0
 167         stx     %o0,[%o7]
 168
 169         ld      [%o3+4],$alo_           ! load a[j] as pair of 32-bit words
 170         fxors   $alo,$alo,$alo
 171         ld      [%o3+0],$ahi_
 172         fxors   $ahi,$ahi,$ahi
 173         ld      [%o5+4],$nlo_           ! load n[j] as pair of 32-bit words
 174         fxors   $nlo,$nlo,$nlo
 175         ld      [%o5+0],$nhi_
 176         fxors   $nhi,$nhi,$nhi
 177
 178         ! transfer b[i] to FPU as 4x16-bit values
 179         ldda    [%o4+6]%asi,$ba
 180         fxtod   $alo,$alo
 181         ldda    [%o4+4]%asi,$bb
 182         fxtod   $ahi,$ahi
 183         ldda    [%o4+2]%asi,$bc
 184         fxtod   $nlo,$nlo
 185         ldda    [%o4+0]%asi,$bd
 186         fxtod   $nhi,$nhi
 187
 188         ! transfer ap[0]*b[0]*n0 to FPU as 4x16-bit values
 189         ldda    [%o7+6]%asi,$na
 190         fxtod   $ba,$ba
 191         ldda    [%o7+4]%asi,$nb
 192         fxtod   $bb,$bb
 193         ldda    [%o7+2]%asi,$nc
 194         fxtod   $bc,$bc
 195         ldda    [%o7+0]%asi,$nd
 196         fxtod   $bd,$bd
 197
 198         std     $alo,[$ap_l+$j]         ! save smashed ap[j] in double format
 199         fxtod   $na,$na
 200         std     $ahi,[$ap_h+$j]
 201         fxtod   $nb,$nb
 202         std     $nlo,[$np_l+$j]         ! save smashed np[j] in double format
 203         fxtod   $nc,$nc
 204         std     $nhi,[$np_h+$j]
 205         fxtod   $nd,$nd
 206         ! 32x16-bit partial products of a[j]*b[i] and n[j]*(ap[0]*bp[0]*n0)
 207         fmuld   $alo,$ba,$aloa
 208         fmuld   $nlo,$na,$nloa
 209         fmuld   $alo,$bb,$alob
 210         fmuld   $nlo,$nb,$nlob
 211         fmuld   $alo,$bc,$aloc
 212         fmuld   $nlo,$nc,$nloc
 213                 faddd   $aloa,$nloa,$nloa
 214         fmuld   $alo,$bd,$alod
 215         fmuld   $nlo,$nd,$nlod
 216                 faddd   $alob,$nlob,$nlob
 217         fmuld   $ahi,$ba,$ahia
 218         fmuld   $nhi,$na,$nhia
 219                 faddd   $aloc,$nloc,$nloc
 220         fmuld   $ahi,$bb,$ahib
 221         fmuld   $nhi,$nb,$nhib
 222                 faddd   $alod,$nlod,$nlod
 223         fmuld   $ahi,$bc,$ahic
 224         fmuld   $nhi,$nc,$nhic
 225                 faddd   $ahia,$nhia,$nhia
 226         fmuld   $ahi,$bd,$ahid
 227         fmuld   $nhi,$nd,$nhid
 228
 229         faddd   $ahib,$nhib,$nhib
 230         faddd   $ahic,$nhic,$dota       ! $nhic
 231         faddd   $ahid,$nhid,$dotb       ! $nhid
 232
 233         faddd   $nloc,$nhia,$nloc
 234         faddd   $nlod,$nhib,$nlod
 235         ! convert the four column sums back to 64-bit integers
 236         fdtox   $nloa,$nloa
 237         fdtox   $nlob,$nlob
 238         fdtox   $nloc,$nloc
 239         fdtox   $nlod,$nlod
 240         ! and hand them over to the integer unit through the locals
 241         std     $nloa,[%sp+$bias+$frame+0]
 242         std     $nlob,[%sp+$bias+$frame+8]
 243         std     $nloc,[%sp+$bias+$frame+16]
 244         std     $nlod,[%sp+$bias+$frame+24]
 245         ldx     [%sp+$bias+$frame+0],%o0
 246         ldx     [%sp+$bias+$frame+8],%o1
 247         ldx     [%sp+$bias+$frame+16],%o2
 248         ldx     [%sp+$bias+$frame+24],%o3
 249         ! pass whatever exceeds 16 bits on to the next column
 250         srlx    %o0,16,%o7
 251         add     %o7,%o1,%o1
 252         srlx    %o1,16,%o7
 253         add     %o7,%o2,%o2
 254         srlx    %o2,16,%o7
 255         add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 256         !and    %o0,$mask,%o0
 257         !and    %o1,$mask,%o1
 258         !and    %o2,$mask,%o2
 259         !sllx   %o1,16,%o1
 260         !sllx   %o2,32,%o2
 261         !sllx   %o3,48,%o7
 262         !or     %o1,%o0,%o0
 263         !or     %o2,%o0,%o0
 264         !or     %o7,%o0,%o0             ! 64-bit result
 265         srlx    %o3,16,%g1              ! 34-bit carry
 266 \f
 267         ba      .L1st
 268         add     $j,8,$j                 ! j++
 269 .align  32
 270 .L1st:
 271         add     $ap,$j,%o3
 272         add     $np,$j,%o4
 273         ld      [%o3+4],$alo_           ! load a[j] as pair of 32-bit words
 274         fxors   $alo,$alo,$alo
 275         ld      [%o3+0],$ahi_
 276         fxors   $ahi,$ahi,$ahi
 277         ld      [%o4+4],$nlo_           ! load n[j] as pair of 32-bit words
 278         fxors   $nlo,$nlo,$nlo
 279         ld      [%o4+0],$nhi_
 280         fxors   $nhi,$nhi,$nhi
 281
 282         fxtod   $alo,$alo
 283         fxtod   $ahi,$ahi
 284         fxtod   $nlo,$nlo
 285         fxtod   $nhi,$nhi
 286
 287         std     $alo,[$ap_l+$j]         ! save smashed ap[j] in double format
 288         fmuld   $alo,$ba,$aloa
 289         std     $ahi,[$ap_h+$j]
 290         fmuld   $nlo,$na,$nloa
 291         std     $nlo,[$np_l+$j]         ! save smashed np[j] in double format
 292         fmuld   $alo,$bb,$alob
 293         std     $nhi,[$np_h+$j]
 294         fmuld   $nlo,$nb,$nlob
 295         fmuld   $alo,$bc,$aloc
 296         fmuld   $nlo,$nc,$nloc
 297                 faddd   $aloa,$nloa,$nloa
 298         fmuld   $alo,$bd,$alod
 299         fmuld   $nlo,$nd,$nlod
 300                 faddd   $alob,$nlob,$nlob
 301         fmuld   $ahi,$ba,$ahia
 302         fmuld   $nhi,$na,$nhia
 303                 faddd   $aloc,$nloc,$nloc
 304         fmuld   $ahi,$bb,$ahib
 305         fmuld   $nhi,$nb,$nhib
 306                 faddd   $alod,$nlod,$nlod
 307         fmuld   $ahi,$bc,$ahic
 308         fmuld   $nhi,$nc,$nhic
 309                 faddd   $ahia,$nhia,$nhia
 310         fmuld   $ahi,$bd,$ahid
 311         fmuld   $nhi,$nd,$nhid
 312                 faddd   $ahib,$nhib,$nhib
 313         ! fold in the two sums left over from the previous word
 314         faddd   $dota,$nloa,$nloa
 315         faddd   $dotb,$nlob,$nlob
 316         faddd   $ahic,$nhic,$dota       ! $nhic
 317         faddd   $ahid,$nhid,$dotb       ! $nhid
 318
 319         faddd   $nloc,$nhia,$nloc
 320         faddd   $nlod,$nhib,$nlod
 321
 322         fdtox   $nloa,$nloa
 323         fdtox   $nlob,$nlob
 324         fdtox   $nloc,$nloc
 325         fdtox   $nlod,$nlod
 326
 327         std     $nloa,[%sp+$bias+$frame+0]
 328         std     $nlob,[%sp+$bias+$frame+8]
 329         std     $nloc,[%sp+$bias+$frame+16]
 330         std     $nlod,[%sp+$bias+$frame+24]
 331         ldx     [%sp+$bias+$frame+0],%o0
 332         ldx     [%sp+$bias+$frame+8],%o1
 333         ldx     [%sp+$bias+$frame+16],%o2
 334         ldx     [%sp+$bias+$frame+24],%o3
 335
 336         srlx    %o0,16,%o7
 337         add     %o7,%o1,%o1
 338         srlx    %o1,16,%o7
 339         add     %o7,%o2,%o2
 340         srlx    %o2,16,%o7
 341         add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 342         and     %o0,$mask,%o0
 343         and     %o1,$mask,%o1
 344         and     %o2,$mask,%o2
 345         sllx    %o1,16,%o1
 346         sllx    %o2,32,%o2
 347         sllx    %o3,48,%o7
 348         or      %o1,%o0,%o0
 349         or      %o2,%o0,%o0
 350         or      %o7,%o0,%o0             ! 64-bit result
 351         addcc   %g1,%o0,%o0
 352         srlx    %o3,16,%g1              ! 34-bit carry
 353         bcs,a   %xcc,.+8
 354         add     %g1,1,%g1               ! executed only if addcc carried out
 355
 356         stx     %o0,[$tp]               ! tp[j-1]=
 357         add     $j,8,$j
 358         brnz    $j,.L1st
 359         add     $tp,8,$tp               ! tp++
 360 \f
 361         fdtox   $dota,$dota             ! flush the two pending sums
 362         fdtox   $dotb,$dotb
 363         std     $dota,[%sp+$bias+$frame+32]
 364         std     $dotb,[%sp+$bias+$frame+40]
 365         ldx     [%sp+$bias+$frame+32],%o0
 366         ldx     [%sp+$bias+$frame+40],%o1
 367
 368         srlx    %o0,16,%o7
 369         add     %o7,%o1,%o1
 370         and     %o0,$mask,%o0
 371         sllx    %o1,16,%o7
 372         or      %o7,%o0,%o0
 373         addcc   %g1,%o0,%o0
 374         srlx    %o1,48,%g1
 375         bcs,a   %xcc,.+8
 376         add     %g1,1,%g1
 377
 378         mov     %g1,$carry
 379         stx     %o0,[$tp]               ! tp[num-1]=
 380 \f
 381         ba      .Louter
 382         add     $i,8,$i                 ! i++
 383 .align  32
 384 .Louter:
 385         sub     %g0,$num,$j             ! j=-num
 386         add     %sp,$bias+$frame+$locals,$tp    ! rewind tp
 387
 388         add     $bp,$i,%o4
 389         ldx     [$bp+$i],%o0            ! bp[i]
 390         add     %sp,$bias+$frame+0,%o7
 391         ldx     [$ap+$j],%o1            ! ap[0]
 392
 393         ldx     [$tp],%o2               ! tp[0]
 394         mulx    %o1,%o0,%o0
 395         addcc   %o2,%o0,%o0
 396         mulx    $n0,%o0,%o0             ! (ap[0]*bp[i]+t[0])*n0
 397         stx     %o0,[%o7]
 398
 399
 400         ! transfer b[i] to FPU as 4x16-bit values
 401         ldda    [%o4+6]%asi,$ba
 402         ldda    [%o4+4]%asi,$bb
 403         ldda    [%o4+2]%asi,$bc
 404         ldda    [%o4+0]%asi,$bd
 405
 406         ! transfer (ap[0]*b[i]+t[0])*n0 to FPU as 4x16-bit values
 407         ldda    [%o7+6]%asi,$na
 408         fxtod   $ba,$ba
 409         ldda    [%o7+4]%asi,$nb
 410         fxtod   $bb,$bb
 411         ldda    [%o7+2]%asi,$nc
 412         fxtod   $bc,$bc
 413         ldda    [%o7+0]%asi,$nd
 414         fxtod   $bd,$bd
 415         ldd     [$ap_l+$j],$alo         ! load a[j] in double format
 416         fxtod   $na,$na
 417         ldd     [$ap_h+$j],$ahi
 418         fxtod   $nb,$nb
 419         ldd     [$np_l+$j],$nlo         ! load n[j] in double format
 420         fxtod   $nc,$nc
 421         ldd     [$np_h+$j],$nhi
 422         fxtod   $nd,$nd
 423
 424         fmuld   $alo,$ba,$aloa
 425         fmuld   $nlo,$na,$nloa
 426         fmuld   $alo,$bb,$alob
 427         fmuld   $nlo,$nb,$nlob
 428         fmuld   $alo,$bc,$aloc
 429         fmuld   $nlo,$nc,$nloc
 430                 faddd   $aloa,$nloa,$nloa
 431         fmuld   $alo,$bd,$alod
 432         fmuld   $nlo,$nd,$nlod
 433                 faddd   $alob,$nlob,$nlob
 434         fmuld   $ahi,$ba,$ahia
 435         fmuld   $nhi,$na,$nhia
 436                 faddd   $aloc,$nloc,$nloc
 437         fmuld   $ahi,$bb,$ahib
 438         fmuld   $nhi,$nb,$nhib
 439                 faddd   $alod,$nlod,$nlod
 440         fmuld   $ahi,$bc,$ahic
 441         fmuld   $nhi,$nc,$nhic
 442                 faddd   $ahia,$nhia,$nhia
 443         fmuld   $ahi,$bd,$ahid
 444         fmuld   $nhi,$nd,$nhid
 445
 446         faddd   $ahib,$nhib,$nhib
 447         faddd   $ahic,$nhic,$dota       ! $nhic
 448         faddd   $ahid,$nhid,$dotb       ! $nhid
 449
 450         faddd   $nloc,$nhia,$nloc
 451         faddd   $nlod,$nhib,$nlod
 452
 453         fdtox   $nloa,$nloa
 454         fdtox   $nlob,$nlob
 455         fdtox   $nloc,$nloc
 456         fdtox   $nlod,$nlod
 457
 458         std     $nloa,[%sp+$bias+$frame+0]
 459         std     $nlob,[%sp+$bias+$frame+8]
 460         std     $nloc,[%sp+$bias+$frame+16]
 461         std     $nlod,[%sp+$bias+$frame+24]
 462         ldx     [%sp+$bias+$frame+0],%o0
 463         ldx     [%sp+$bias+$frame+8],%o1
 464         ldx     [%sp+$bias+$frame+16],%o2
 465         ldx     [%sp+$bias+$frame+24],%o3
 466
 467         srlx    %o0,16,%o7
 468         add     %o7,%o1,%o1
 469         srlx    %o1,16,%o7
 470         add     %o7,%o2,%o2
 471         srlx    %o2,16,%o7
 472         add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 473         ! why?
 474         and     %o0,$mask,%o0
 475         and     %o1,$mask,%o1
 476         and     %o2,$mask,%o2
 477         sllx    %o1,16,%o1
 478         sllx    %o2,32,%o2
 479         sllx    %o3,48,%o7
 480         or      %o1,%o0,%o0
 481         or      %o2,%o0,%o0
 482         or      %o7,%o0,%o0             ! 64-bit result
 483         ldx     [$tp],%o7
 484         addcc   %o7,%o0,%o0
 485         ! end-of-why?
 486         srlx    %o3,16,%g1              ! 34-bit carry
 487         bcs,a   %xcc,.+8
 488         add     %g1,1,%g1
 489 \f
 490         ba      .Linner
 491         add     $j,8,$j                 ! j++
 492 .align  32
 493 .Linner:
 494         ldd     [$ap_l+$j],$alo         ! load a[j] in double format
 495         ldd     [$ap_h+$j],$ahi
 496         ldd     [$np_l+$j],$nlo         ! load n[j] in double format
 497         ldd     [$np_h+$j],$nhi
 498
 499         fmuld   $alo,$ba,$aloa
 500         fmuld   $nlo,$na,$nloa
 501         fmuld   $alo,$bb,$alob
 502         fmuld   $nlo,$nb,$nlob
 503         fmuld   $alo,$bc,$aloc
 504         fmuld   $nlo,$nc,$nloc
 505                 faddd   $aloa,$nloa,$nloa
 506         fmuld   $alo,$bd,$alod
 507         fmuld   $nlo,$nd,$nlod
 508                 faddd   $alob,$nlob,$nlob
 509         fmuld   $ahi,$ba,$ahia
 510         fmuld   $nhi,$na,$nhia
 511                 faddd   $aloc,$nloc,$nloc
 512         fmuld   $ahi,$bb,$ahib
 513         fmuld   $nhi,$nb,$nhib
 514                 faddd   $alod,$nlod,$nlod
 515         fmuld   $ahi,$bc,$ahic
 516         fmuld   $nhi,$nc,$nhic
 517                 faddd   $ahia,$nhia,$nhia
 518         fmuld   $ahi,$bd,$ahid
 519         fmuld   $nhi,$nd,$nhid
 520
 521         faddd   $ahib,$nhib,$nhib
 522         faddd   $dota,$nloa,$nloa
 523         faddd   $dotb,$nlob,$nlob
 524         faddd   $ahic,$nhic,$dota       ! $nhic
 525         faddd   $ahid,$nhid,$dotb       ! $nhid
 526
 527         faddd   $nloc,$nhia,$nloc
 528         faddd   $nlod,$nhib,$nlod
 529
 530         fdtox   $nloa,$nloa
 531         fdtox   $nlob,$nlob
 532         fdtox   $nloc,$nloc
 533         fdtox   $nlod,$nlod
 534
 535         std     $nloa,[%sp+$bias+$frame+0]
 536         std     $nlob,[%sp+$bias+$frame+8]
 537         std     $nloc,[%sp+$bias+$frame+16]
 538         std     $nlod,[%sp+$bias+$frame+24]
 539         ldx     [%sp+$bias+$frame+0],%o0
 540         ldx     [%sp+$bias+$frame+8],%o1
 541         ldx     [%sp+$bias+$frame+16],%o2
 542         ldx     [%sp+$bias+$frame+24],%o3
 543
 544         srlx    %o0,16,%o7
 545         add     %o7,%o1,%o1
 546         srlx    %o1,16,%o7
 547         add     %o7,%o2,%o2
 548         srlx    %o2,16,%o7
 549         add     %o7,%o3,%o3             ! %o3.%o2[0..15].%o1[0..15].%o0[0..15]
 550         and     %o0,$mask,%o0
 551         and     %o1,$mask,%o1
 552         and     %o2,$mask,%o2
 553         sllx    %o1,16,%o1
 554         sllx    %o2,32,%o2
 555         sllx    %o3,48,%o7
 556         or      %o1,%o0,%o0
 557         or      %o2,%o0,%o0
 558         or      %o7,%o0,%o0             ! 64-bit result
 559         addcc   %g1,%o0,%o0
 560         srlx    %o3,16,%g1              ! 34-bit carry
 561         bcs,a   %xcc,.+8
 562         add     %g1,1,%g1
 563
 564         ldx     [$tp+8],%o7             ! tp[j]
 565         addcc   %o7,%o0,%o0
 566         bcs,a   %xcc,.+8
 567         add     %g1,1,%g1
 568
 569         stx     %o0,[$tp]               ! tp[j-1]
 570         add     $j,8,$j
 571         brnz    $j,.Linner
 572         add     $tp,8,$tp               ! tp++
 573 \f
 574         fdtox   $dota,$dota
 575         fdtox   $dotb,$dotb
 576         std     $dota,[%sp+$bias+$frame+32]
 577         std     $dotb,[%sp+$bias+$frame+40]
 578         ldx     [%sp+$bias+$frame+32],%o0
 579         ldx     [%sp+$bias+$frame+40],%o1
 580
 581         srlx    %o0,16,%o7
 582         add     %o7,%o1,%o1
 583         and     %o0,$mask,%o0
 584         sllx    %o1,16,%o7
 585         or      %o7,%o0,%o0
 586         addcc   %g1,%o0,%o0
 587         srlx    %o1,48,%g1
 588         bcs,a   %xcc,.+8
 589         add     %g1,1,%g1
 590
 591         addcc   $carry,%o0,%o0
 592         stx     %o0,[$tp]               ! tp[num-1]
 593         mov     %g1,$carry
 594         bcs,a   %xcc,.+8
 595         add     $carry,1,$carry
 596
 597         add     $i,8,$i                 ! i++
 598         brnz    $i,.Louter
 599         nop
 600 \f
 601         sub     %g0,$num,$j             ! j=-num
 602         add     $tp,8,$tp               ! adjust tp to point at the end
 603
 604         cmp     $carry,0                ! clears %icc.c
 605         bne,pn  %icc,.Lsub
 606         nop
 607
 608         ld      [$tp-8],%o0
 609         ld      [$np-8],%o1
 610         cmp     %o0,%o1                 ! upper 32 bits of tp[num-1] vs np[num-1]
 611         bcs,pt  %icc,.Lcopy             ! %icc.c is clean if not taken
 612         nop
 613
 614 .align  32,0x1000000
 615 .Lsub:
 616         ldd     [$tp+$j],%o0            ! 64-bit word as two 32-bit halves
 617         ldd     [$np+$j],%o2
 618         subccc  %o1,%o3,%o1             ! lower halves first,
 619         subccc  %o0,%o2,%o0             ! then upper ones with the borrow
 620         std     %o0,[$rp+$j]            ! rp[j]=tp[j]-np[j]
 621         add     $j,8,$j
 622         brnz    $j,.Lsub
 623         nop
 624         subccc  $carry,0,$carry         ! let the carry bit absorb the borrow
 625         bcc     %icc,.Lzap              ! no borrow left: rp[] is final
 626         sub     %g0,$num,$j
 627
 628 .align  16,0x1000000
 629 .Lcopy:
 630         ldx     [$tp+$j],%o0
 631         stx     %o0,[$rp+$j]            ! rp[j]=tp[j]
 632         add     $j,8,$j
 633         brnz    $j,.Lcopy
 634         nop
 635         ba      .Lzap
 636         sub     %g0,$num,$j
 637
 638 .align  32
 639 .Lzap:
 640         stx     %g0,[$tp+$j]            ! wipe tp[] and the FP copies of ap[], np[]
 641         stx     %g0,[$ap_l+$j]
 642         stx     %g0,[$ap_h+$j]
 643         stx     %g0,[$np_l+$j]
 644         stx     %g0,[$np_h+$j]
 645         add     $j,8,$j
 646         brnz    $j,.Lzap
 647         nop
 648
 649         ldx     [%sp+$bias+$frame+48],%o7
 650         wr      %g0,%o7,%asi            ! restore %asi
 651
 652         mov     1,%i0                   ! return 1
 653         ret
 654         restore
 655 .type   $fname,#function
 656 .size   $fname,(.-$fname)
 657 ___
 658
 659 $code =~ s/\`([^\`]*)\`/eval($1)/gem;   # replace `...` with the value of the enclosed Perl expression
 660 print $code;
 661 close STDOUT or die "error closing STDOUT: $!";  # buffered write errors surface here; do not leave truncated output unnoticed