crypto/sha/asm/sha1-sparcv9a.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 # ====================================================================
   9
  10 # January 2009
  11 #
  12 # Provided that UltraSPARC VIS instructions are pipe-lined(*) and
  13 # pairable(*) with IALU ones, offloading of Xupdate to the UltraSPARC
  14 # Graphic Unit would make it possible to achieve higher instruction-
  15 # level parallelism, ILP, and thus higher performance. It should be
  16 # explicitly noted that ILP is the keyword, and it means that this
  17 # code would be unsuitable for cores like UltraSPARC-Tx. The idea is
  18 # not really novel, Sun had VIS-powered implementation for a while.
  19 # Unlike Sun's implementation this one can process multiple unaligned
  20 # input blocks, and as such works as drop-in replacement for OpenSSL
  21 # sha1_block_data_order. Performance improvement was measured to be
  22 # 40% over pure IALU sha1-sparcv9.pl on UltraSPARC-IIi, but 12% on
  23 # UltraSPARC-III. See below for discussion...
  24 #
  25 # The module is not of direct interest to OpenSSL, because it
  26 # doesn't provide better performance on contemporary SPARCv9 CPUs,
  27 # UltraSPARC-Tx and SPARC64-V[II] to be specific. Those who feel they
  28 # absolutely must score on UltraSPARC-I-IV can simply replace
  29 # crypto/sha/asm/sha1-sparcv9.pl with this module.
  30 #
  31 # (*)   "Pipe-lined" means that even if it takes several cycles to
  32 #       complete, next instruction using same functional unit [but not
  33 #       depending on the result of the current instruction] can start
  34 #       execution without having to wait for the unit. "Pairable"
  35 #       means that two [or more] independent instructions can be
  36 #       issued at the very same time.
  37
  38 $bits=32;
     # Any command-line argument containing -m64 or -xarch=v9 selects the
     # 64-bit variant of the generated code.
  39 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
     # $bias and $frame are interpolated into the save/%sp arithmetic of the
     # function prologue emitted further down.
  40 if ($bits==64)  { $bias=2047; $frame=192; }
  41 else            { $bias=0;    $frame=112; }
  42
     # The first command-line argument names the output file. Everything the
     # script prints goes to STDOUT, which is redirected to that file here.
     # A failed open used to go unnoticed and produce no output at all, so
     # the result is now checked.
  43 $output=shift;
  44 open STDOUT,">$output" or die "can't open $output: $!";
  45
  46 $ctx="%i0";     # base register for the five state words at [$ctx+0..16]
  47 $inp="%i1";     # input pointer; rounded down to 8 bytes in the prologue
  48 $len="%i2";     # decremented once per block; zero selects the .Ltail path
  49 $tmp0="%i3";    # integer scratch registers used by the round templates
  50 $tmp1="%i4";
  51 $tmp2="%i5";
  52 $tmp3="%g5";
  53
  54 $base="%g1";    # 256-byte aligned scratch area derived from %fp
  55 $align="%g4";   # $inp&7, handed to alignaddr before faligndata is used
  56 $Xfer="%o5";    # address of the 64-byte slot holding X[]+K for this block
  57 $nXfer=$tmp3;   # slot for the next block; NOTE: same register as $tmp3
  58 $Xi="%o7";      # word loaded back from [$Xfer+...] in every round
  59
  60 $A="%l0";       # working variables of the round function
  61 $B="%l1";
  62 $C="%l2";
  63 $D="%l3";
  64 $E="%l4";
  65 @V=($A,$B,$C,$D,$E);    # rotated by the round loops after every round
  66
  67 $Actx="%o0";    # state words as loaded from $ctx; the working
  68 $Bctx="%o1";    # variables are added back into these after round 79
  69 $Cctx="%o2";
  70 $Dctx="%o3";
  71 $Ectx="%o4";
  72
  73 $fmul="%f32";   # loaded from the 0x00000100,0x00000100 entry of vis_const
  74 $VK_00_19="%f34";       # round constants, each loaded as a pair of
  75 $VK_20_39="%f36";       # identical 32-bit words from vis_const
  76 $VK_40_59="%f38";
  77 $VK_60_79="%f40";
  78 @VK=($VK_00_19,$VK_20_39,$VK_40_59,$VK_60_79);
     # %f0..%f15 hold the sixteen message words; @X[16] (%f16) receives the
     # extra doubleword needed when the input is not 8-byte aligned.
  79 @X=("%f0", "%f1", "%f2", "%f3", "%f4", "%f5", "%f6", "%f7",
  80     "%f8", "%f9","%f10","%f11","%f12","%f13","%f14","%f15","%f16");
  81
  82 # This is reference 2x-parallelized VIS-powered Xupdate procedure. It
  83 # covers even K_NN_MM addition...
     #
     # Appends to $code the instructions that update the X[] pair starting
     # at index ($i+16)%16, add the round constant and store the sum at
     # [$Xfer+4*$j].
     # NOTE(review): no call to Xupdate is visible in this file; the BODY_*
     # generators carry their own interleaved copies of these instructions.
     # The "for" mnemonic used below has no entry in unvis's opcode table,
     # so unvis would leave that line for the assembler to encode.
  84 sub Xupdate {
  85 my ($i)=@_;             # $i: round number
  86 my $K=@VK[($i+16)/20];  # one-element slice; fractional index is truncated
  87 my $j=($i+16)%16;       # index of the pair being updated within @X
  88
  89 #       [ provided that GSR.alignaddr_offset is 5, $fmul contains
  90 #         0x100ULL<<32|0x100 value and K_NN_MM are pre-loaded to
  91 #         chosen registers... ]
  92 $code.=<<___;
  93         fxors           @X[($j+13)%16],@X[$j],@X[$j]    !-1/-1/-1:X[0]^=X[13]
  94         fxors           @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
  95         fxor            @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
  96         fxor            %f18,@X[$j],@X[$j]              ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
  97         faligndata      @X[$j],@X[$j],%f18              ! 3/ 7/ 5:Tmp=X[0,1]>>>24
  98         fpadd32         @X[$j],@X[$j],@X[$j]            ! 4/ 8/ 6:X[0,1]<<=1
  99         fmul8ulx16      %f18,$fmul,%f18                 ! 5/10/ 7:Tmp>>=7, Tmp&=1
 100         ![fxors         %f15,%f2,%f2]
 101         for             %f18,@X[$j],@X[$j]              ! 8/14/10:X[0,1]|=Tmp
 102         ![fxors         %f0,%f3,%f3]                    !10/17/12:X[0] dependency
 103         fpadd32         $K,@X[$j],%f20
 104         std             %f20,[$Xfer+`4*$j`]
 105 ___
 106 # The numbers delimited with slash are the earliest possible dispatch
 107 # cycles for given instruction assuming 1 cycle latency for simple VIS
 108 # instructions, such as on UltraSPARC-I&II, 3 cycles latency, such as
 109 # on UltraSPARC-III&IV, and 2 cycles latency(*), respectively. Being
 110 # 2x-parallelized the procedure is "worth" 5, 8.5 or 6 ticks per SHA1
 111 # round. As [long as] FPU/VIS instructions are perfectly pairable with
 112 # IALU ones, the round timing is defined by the maximum between VIS
 113 # and IALU timings. The latter varies from round to round and averages
 114 # out at 6.25 ticks. This means that USI&II should operate at IALU
 115 # rate, while USIII&IV - at VIS rate. This explains why performance
 116 # improvement varies among processors. Well, given that pure IALU
 117 # sha1-sparcv9.pl module exhibits virtually uniform performance of
 118 # ~9.3 cycles per SHA1 round. Timings mentioned above are theoretical
 119 # lower limits. Real-life performance was measured to be 6.6 cycles
 120 # per SHA1 round on USIIi and 8.3 on USIII. The latter is lower than
 121 # half-round VIS timing, because there are 16 Xupdate-free rounds,
 122 # which "push down" average theoretical timing to 8 cycles...
 123
 124 # (*)   SPARC64-V[II] was originally believed to have 2 cycles VIS
 125 #       latency. Well, it might have, but it doesn't have dedicated
 126 #       VIS-unit. Instead, VIS instructions are executed by other
 127 #       functional units, ones used here - by IALU. This doesn't
 128 #       improve effective ILP...
 129 }
 130
 131 # The reference Xupdate procedure is then "strained" over *pairs* of
 132 # BODY_NN_MM and kind of modulo-scheduled with respect to X[n]^=X[n+13]
 133 # and K_NN_MM addition. It's "running" 15 rounds ahead, which leaves
 134 # plenty of room to amortize the read-after-write hazard, as well as
 135 # to fetch and align input for the next spin. The VIS instructions are
 136 # scheduled for a latency of 2 cycles, because there are not enough IALU
 137 # instructions to schedule for a latency of 3, while scheduling for 1
 138 # would give no gain on USI&II anyway.
 139
 140 sub BODY_00_19 {
     # Appends one round to $code.
     #   $i            round number
     #   $a,$b,$c,$d,$e  register names of the working variables as seen by
     #                 this round (the caller rotates them between rounds)
     # The integer instructions compute
     #   e += (a<<5) + (a>>27) + Xi + ((c&b)|(d&~b));  b = (b<<30)|(b>>2)
     # with Xi loaded from [$Xfer+4*($i%16)]. The lines indented by one
     # extra space are VIS instructions of the X[] update, split over the
     # two rounds of a pair: even $i emits the first part, odd $i the rest.
 141 my ($i,$a,$b,$c,$d,$e)=@_;
 142 my $j=$i&~1;            # even round number of the pair $i belongs to
 143 my $k=($j+16+2)%16;     # ahead reference
 144 my $l=($j+16-2)%16;     # behind reference
 145 my $K=@VK[($j+16-2)/20];        # constant for the pair behind; index is truncated
 146
 147 $j=($j+16)%16;          # from here on $j is an index into @X
 148
     # Even round of the pair.
 149 $code.=<<___ if (!($i&1));
 150         sll             $a,5,$tmp0                      !! $i
 151         and             $c,$b,$tmp3
 152         ld              [$Xfer+`4*($i%16)`],$Xi
 153          fxors          @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
 154         srl             $a,27,$tmp1
 155         add             $tmp0,$e,$e
 156          fxor           @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
 157         sll             $b,30,$tmp2
 158         add             $tmp1,$e,$e
 159         andn            $d,$b,$tmp1
 160         add             $Xi,$e,$e
 161          fxor           %f18,@X[$j],@X[$j]              ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
 162         srl             $b,2,$b
 163         or              $tmp1,$tmp3,$tmp1
 164         or              $tmp2,$b,$b
 165         add             $tmp1,$e,$e
 166          faligndata     @X[$j],@X[$j],%f18              ! 3/ 7/ 5:Tmp=X[0,1]>>>24
 167 ___
     # Odd round of the pair: finishes the update of pair $j, adds $K to the
     # pair behind ($l) and starts the X[n]^=X[n+13] step of the pair ahead ($k).
 168 $code.=<<___ if ($i&1);
 169         sll             $a,5,$tmp0                      !! $i
 170         and             $c,$b,$tmp3
 171         ld              [$Xfer+`4*($i%16)`],$Xi
 172          fpadd32        @X[$j],@X[$j],@X[$j]            ! 4/ 8/ 6:X[0,1]<<=1
 173         srl             $a,27,$tmp1
 174         add             $tmp0,$e,$e
 175          fmul8ulx16     %f18,$fmul,%f18                 ! 5/10/ 7:Tmp>>=7, Tmp&=1
 176         sll             $b,30,$tmp2
 177         add             $tmp1,$e,$e
 178          fpadd32        $K,@X[$l],%f20                  !
 179         andn            $d,$b,$tmp1
 180         add             $Xi,$e,$e
 181          fxors          @X[($k+13)%16],@X[$k],@X[$k]    !-1/-1/-1:X[0]^=X[13]
 182         srl             $b,2,$b
 183         or              $tmp1,$tmp3,$tmp1
 184          fxor           %f18,@X[$j],@X[$j]              ! 8/14/10:X[0,1]|=Tmp
 185         or              $tmp2,$b,$b
 186         add             $tmp1,$e,$e
 187 ___
     # The sum computed above is stored for every odd round except $i==1,
     # where the fpadd32 into %f20 is still emitted but its result is not stored.
 188 $code.=<<___ if ($i&1 && $i>=2);
 189          std            %f20,[$Xfer+`4*$l`]             !
 190 ___
 191 }
 192
 193 sub BODY_20_39 {
     # Appends one round to $code; same arguments as BODY_00_19.
     # The integer instructions compute
     #   e += (a<<5) + (a>>27) + (b^c^d) + Xi;  b = (b<<30)|(b>>2)
     # Which VIS instructions are interleaved depends on $i:
     #   $i<64   full X[] update, split over the even/odd rounds of a pair
     #   $i==64  only the K addition and store for the pair behind
     #   $i>64   no VIS instructions at all
     # The main program calls this for rounds 20..39 and 60..69, and for
     # rounds 70..79 of the last block (.Ltail).
 194 my ($i,$a,$b,$c,$d,$e)=@_;
 195 my $j=$i&~1;            # even round number of the pair $i belongs to
 196 my $k=($j+16+2)%16;     # ahead reference
 197 my $l=($j+16-2)%16;     # behind reference
 198 my $K=@VK[($j+16-2)/20];        # constant for the pair behind; index is truncated
 199
 200 $j=($j+16)%16;          # from here on $j is an index into @X
 201
     # Even round, X[] update still running.
 202 $code.=<<___ if (!($i&1) && $i<64);
 203         sll             $a,5,$tmp0                      !! $i
 204         ld              [$Xfer+`4*($i%16)`],$Xi
 205          fxors          @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
 206         srl             $a,27,$tmp1
 207         add             $tmp0,$e,$e
 208          fxor           @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
 209         xor             $c,$b,$tmp0
 210         add             $tmp1,$e,$e
 211         sll             $b,30,$tmp2
 212         xor             $d,$tmp0,$tmp1
 213          fxor           %f18,@X[$j],@X[$j]              ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
 214         srl             $b,2,$b
 215         add             $tmp1,$e,$e
 216         or              $tmp2,$b,$b
 217         add             $Xi,$e,$e
 218          faligndata     @X[$j],@X[$j],%f18              ! 3/ 7/ 5:Tmp=X[0,1]>>>24
 219 ___
     # Odd round, X[] update still running; also stores X[$l]+K.
 220 $code.=<<___ if ($i&1 && $i<64);
 221         sll             $a,5,$tmp0                      !! $i
 222         ld              [$Xfer+`4*($i%16)`],$Xi
 223          fpadd32        @X[$j],@X[$j],@X[$j]            ! 4/ 8/ 6:X[0,1]<<=1
 224         srl             $a,27,$tmp1
 225         add             $tmp0,$e,$e
 226          fmul8ulx16     %f18,$fmul,%f18                 ! 5/10/ 7:Tmp>>=7, Tmp&=1
 227         xor             $c,$b,$tmp0
 228         add             $tmp1,$e,$e
 229          fpadd32        $K,@X[$l],%f20                  !
 230         sll             $b,30,$tmp2
 231         xor             $d,$tmp0,$tmp1
 232          fxors          @X[($k+13)%16],@X[$k],@X[$k]    !-1/-1/-1:X[0]^=X[13]
 233         srl             $b,2,$b
 234         add             $tmp1,$e,$e
 235          fxor           %f18,@X[$j],@X[$j]              ! 8/14/10:X[0,1]|=Tmp
 236         or              $tmp2,$b,$b
 237         add             $Xi,$e,$e
 238          std            %f20,[$Xfer+`4*$l`]             !
 239 ___
     # Round 64: only the outstanding K addition and store for pair $l.
 240 $code.=<<___ if ($i==64);
 241         sll             $a,5,$tmp0                      !! $i
 242         ld              [$Xfer+`4*($i%16)`],$Xi
 243          fpadd32        $K,@X[$l],%f20
 244         srl             $a,27,$tmp1
 245         add             $tmp0,$e,$e
 246         xor             $c,$b,$tmp0
 247         add             $tmp1,$e,$e
 248         sll             $b,30,$tmp2
 249         xor             $d,$tmp0,$tmp1
 250          std            %f20,[$Xfer+`4*$l`]
 251         srl             $b,2,$b
 252         add             $tmp1,$e,$e
 253         or              $tmp2,$b,$b
 254         add             $Xi,$e,$e
 255 ___
     # Rounds after 64: integer instructions only.
 256 $code.=<<___ if ($i>64);
 257         sll             $a,5,$tmp0                      !! $i
 258         ld              [$Xfer+`4*($i%16)`],$Xi
 259         srl             $a,27,$tmp1
 260         add             $tmp0,$e,$e
 261         xor             $c,$b,$tmp0
 262         add             $tmp1,$e,$e
 263         sll             $b,30,$tmp2
 264         xor             $d,$tmp0,$tmp1
 265         srl             $b,2,$b
 266         add             $tmp1,$e,$e
 267         or              $tmp2,$b,$b
 268         add             $Xi,$e,$e
 269 ___
 270 }
 271
 272 sub BODY_40_59 {
     # Appends one round to $code; same arguments as BODY_00_19.
     # The integer instructions compute
     #   e += (a<<5) + (a>>27) + Xi + ((c&b)|(d&(c|b)));  b = (b<<30)|(b>>2)
     # The interleaved VIS instructions are the same X[] update as in the
     # other BODY_* generators, except that the "X[0,1]<<=1" fpadd32 is
     # emitted at the end of the even round rather than in the odd one.
 273 my ($i,$a,$b,$c,$d,$e)=@_;
 274 my $j=$i&~1;            # even round number of the pair $i belongs to
 275 my $k=($j+16+2)%16;     # ahead reference
 276 my $l=($j+16-2)%16;     # behind reference
 277 my $K=@VK[($j+16-2)/20];        # constant for the pair behind; index is truncated
 278
 279 $j=($j+16)%16;          # from here on $j is an index into @X
 280
     # Even round of the pair.
 281 $code.=<<___ if (!($i&1));
 282         sll             $a,5,$tmp0                      !! $i
 283         ld              [$Xfer+`4*($i%16)`],$Xi
 284          fxors          @X[($j+14)%16],@X[$j+1],@X[$j+1]! 0/ 0/ 0:X[1]^=X[14]
 285         srl             $a,27,$tmp1
 286         add             $tmp0,$e,$e
 287          fxor           @X[($j+2)%16],@X[($j+8)%16],%f18! 1/ 1/ 1:Tmp=X[2,3]^X[8,9]
 288         and             $c,$b,$tmp0
 289         add             $tmp1,$e,$e
 290         sll             $b,30,$tmp2
 291         or              $c,$b,$tmp1
 292          fxor           %f18,@X[$j],@X[$j]              ! 2/ 4/ 3:X[0,1]^=X[2,3]^X[8,9]
 293         srl             $b,2,$b
 294         and             $d,$tmp1,$tmp1
 295         add             $Xi,$e,$e
 296         or              $tmp1,$tmp0,$tmp1
 297          faligndata     @X[$j],@X[$j],%f18              ! 3/ 7/ 5:Tmp=X[0,1]>>>24
 298         or              $tmp2,$b,$b
 299         add             $tmp1,$e,$e
 300          fpadd32        @X[$j],@X[$j],@X[$j]            ! 4/ 8/ 6:X[0,1]<<=1
 301 ___
     # Odd round of the pair; also stores X[$l]+K.
 302 $code.=<<___ if ($i&1);
 303         sll             $a,5,$tmp0                      !! $i
 304         ld              [$Xfer+`4*($i%16)`],$Xi
 305         srl             $a,27,$tmp1
 306         add             $tmp0,$e,$e
 307          fmul8ulx16     %f18,$fmul,%f18                 ! 5/10/ 7:Tmp>>=7, Tmp&=1
 308         and             $c,$b,$tmp0
 309         add             $tmp1,$e,$e
 310          fpadd32        $K,@X[$l],%f20                  !
 311         sll             $b,30,$tmp2
 312         or              $c,$b,$tmp1
 313          fxors          @X[($k+13)%16],@X[$k],@X[$k]    !-1/-1/-1:X[0]^=X[13]
 314         srl             $b,2,$b
 315         and             $d,$tmp1,$tmp1
 316          fxor           %f18,@X[$j],@X[$j]              ! 8/14/10:X[0,1]|=Tmp
 317         add             $Xi,$e,$e
 318         or              $tmp1,$tmp0,$tmp1
 319         or              $tmp2,$b,$b
 320         add             $tmp1,$e,$e
 321          std            %f20,[$Xfer+`4*$l`]             !
 322 ___
 323 }
 324
 325 # If there is more data to process, then we pre-fetch the data for
 326 # next iteration in last ten rounds...
     #
     # Appends one of rounds 70..79 to $code; same arguments as BODY_00_19.
     # The integer instructions are the b^c^d round also emitted by
     # BODY_20_39. Interleaved with them:
     #   $i==70   load [$inp+64] into @X[0], advance $inp by 64, compute
     #            $nXfer from the new block's address and set alignaddr
     #            from $align again
     #   $i<77    load the next doubleword of the new block
     #   $i==77   load the extra doubleword into @X[16]
     #   $i>=72   faligndata pair $m, add K_00_19, store at [$nXfer+4*$m]
 327 sub BODY_70_79 {
 328 my ($i,$a,$b,$c,$d,$e)=@_;
 329 my $j=$i&~1;
 330 my $m=($i%8)*2;         # 0,2,..,14 for rounds 72..79: pair handled this round
 331
 332 $j=($j+16)%16;          # NOTE(review): $j is not referenced by any template below
 333
 334 $code.=<<___ if ($i==70);
 335         sll             $a,5,$tmp0                      !! $i
 336         ld              [$Xfer+`4*($i%16)`],$Xi
 337         srl             $a,27,$tmp1
 338         add             $tmp0,$e,$e
 339          ldd            [$inp+64],@X[0]
 340         xor             $c,$b,$tmp0
 341         add             $tmp1,$e,$e
 342         sll             $b,30,$tmp2
 343         xor             $d,$tmp0,$tmp1
 344         srl             $b,2,$b
 345         add             $tmp1,$e,$e
 346         or              $tmp2,$b,$b
 347         add             $Xi,$e,$e
 348
 349         and             $inp,-64,$nXfer
 350         inc             64,$inp
 351         and             $nXfer,255,$nXfer
 352         alignaddr       %g0,$align,%g0
 353         add             $base,$nXfer,$nXfer
 354 ___
 355 $code.=<<___ if ($i==71);
 356         sll             $a,5,$tmp0                      !! $i
 357         ld              [$Xfer+`4*($i%16)`],$Xi
 358         srl             $a,27,$tmp1
 359         add             $tmp0,$e,$e
 360         xor             $c,$b,$tmp0
 361         add             $tmp1,$e,$e
 362         sll             $b,30,$tmp2
 363         xor             $d,$tmp0,$tmp1
 364         srl             $b,2,$b
 365         add             $tmp1,$e,$e
 366         or              $tmp2,$b,$b
 367         add             $Xi,$e,$e
 368 ___
 369 $code.=<<___ if ($i>=72);
 370          faligndata     @X[$m],@X[$m+2],@X[$m]
 371         sll             $a,5,$tmp0                      !! $i
 372         ld              [$Xfer+`4*($i%16)`],$Xi
 373         srl             $a,27,$tmp1
 374         add             $tmp0,$e,$e
 375         xor             $c,$b,$tmp0
 376         add             $tmp1,$e,$e
 377          fpadd32        $VK_00_19,@X[$m],%f20
 378         sll             $b,30,$tmp2
 379         xor             $d,$tmp0,$tmp1
 380         srl             $b,2,$b
 381         add             $tmp1,$e,$e
 382         or              $tmp2,$b,$b
 383         add             $Xi,$e,$e
 384 ___
     # Rounds 70..76 fetch doublewords 1..7 of the next block; $inp has
     # already been advanced by the round-70 template above.
 385 $code.=<<___ if ($i<77);
 386          ldd            [$inp+`8*($i+1-70)`],@X[2*($i+1-70)]
 387 ___
 388 $code.=<<___ if ($i==77);       # redundant if $inp was aligned
 389          add            $align,63,$tmp0
 390          and            $tmp0,-8,$tmp0
 391          ldd            [$inp+$tmp0],@X[16]
 392 ___
     # Store of the K-added pair computed by the $i>=72 template above.
 393 $code.=<<___ if ($i>=72);
 394          std            %f20,[$nXfer+`4*$m`]
 395 ___
 396 }
 397
     # Main template: the constant table, the function prologue, the setup
     # of the first block and the .Loop/.Ltail round sequences are appended
     # to $code here. $code is post-processed and printed at the end of the
     # file.
 398 $code.=<<___;
 399 .section        ".text",#alloc,#execinstr
 400
 401 .align  64
 402 vis_const:
 403 .long   0x5a827999,0x5a827999   ! K_00_19
 404 .long   0x6ed9eba1,0x6ed9eba1   ! K_20_39
 405 .long   0x8f1bbcdc,0x8f1bbcdc   ! K_40_59
 406 .long   0xca62c1d6,0xca62c1d6   ! K_60_79
 407 .long   0x00000100,0x00000100
 408 .align  64
 409 .type   vis_const,#object
 410 .size   vis_const,(.-vis_const)
 411
 412 .globl  sha1_block_data_order
 413 sha1_block_data_order:
 414         save    %sp,-$frame,%sp
 415         add     %fp,$bias-256,$base
 416
 417 1:      call    .+8
 418         add     %o7,vis_const-1b,$tmp0
 419
 420         ldd     [$tmp0+0],$VK_00_19
 421         ldd     [$tmp0+8],$VK_20_39
 422         ldd     [$tmp0+16],$VK_40_59
 423         ldd     [$tmp0+24],$VK_60_79
 424         ldd     [$tmp0+32],$fmul
 425
 426         ld      [$ctx+0],$Actx
 427         and     $base,-256,$base
 428         ld      [$ctx+4],$Bctx
 429         sub     $base,$bias+$frame,%sp
 430         ld      [$ctx+8],$Cctx
 431         and     $inp,7,$align
 432         ld      [$ctx+12],$Dctx
 433         and     $inp,-8,$inp
 434         ld      [$ctx+16],$Ectx
 435
 436         ! X[16] is maintained in FP register bank
 437         alignaddr       %g0,$align,%g0
 438         ldd             [$inp+0],@X[0]
 439         sub             $inp,-64,$Xfer
 440         ldd             [$inp+8],@X[2]
 441         and             $Xfer,-64,$Xfer
 442         ldd             [$inp+16],@X[4]
 443         and             $Xfer,255,$Xfer
 444         ldd             [$inp+24],@X[6]
 445         add             $base,$Xfer,$Xfer
 446         ldd             [$inp+32],@X[8]
 447         ldd             [$inp+40],@X[10]
 448         ldd             [$inp+48],@X[12]
 449         brz,pt          $align,.Laligned
 450         ldd             [$inp+56],@X[14]
 451
 452         ldd             [$inp+64],@X[16]
 453         faligndata      @X[0],@X[2],@X[0]
 454         faligndata      @X[2],@X[4],@X[2]
 455         faligndata      @X[4],@X[6],@X[4]
 456         faligndata      @X[6],@X[8],@X[6]
 457         faligndata      @X[8],@X[10],@X[8]
 458         faligndata      @X[10],@X[12],@X[10]
 459         faligndata      @X[12],@X[14],@X[12]
 460         faligndata      @X[14],@X[16],@X[14]
 461
 462 .Laligned:
 463         mov             5,$tmp0
 464         dec             1,$len
 465         alignaddr       %g0,$tmp0,%g0
 466         fpadd32         $VK_00_19,@X[0],%f16
 467         fpadd32         $VK_00_19,@X[2],%f18
 468         fpadd32         $VK_00_19,@X[4],%f20
 469         fpadd32         $VK_00_19,@X[6],%f22
 470         fpadd32         $VK_00_19,@X[8],%f24
 471         fpadd32         $VK_00_19,@X[10],%f26
 472         fpadd32         $VK_00_19,@X[12],%f28
 473         fpadd32         $VK_00_19,@X[14],%f30
 474         std             %f16,[$Xfer+0]
 475         mov             $Actx,$A
 476         std             %f18,[$Xfer+8]
 477         mov             $Bctx,$B
 478         std             %f20,[$Xfer+16]
 479         mov             $Cctx,$C
 480         std             %f22,[$Xfer+24]
 481         mov             $Dctx,$D
 482         std             %f24,[$Xfer+32]
 483         mov             $Ectx,$E
 484         std             %f26,[$Xfer+40]
 485         fxors           @X[13],@X[0],@X[0]
 486         std             %f28,[$Xfer+48]
 487         ba              .Loop
 488         std             %f30,[$Xfer+56]
 489 .align  32
 490 .Loop:
 491 ___
     # Rounds 0..69. After each round @V is rotated by one position, so the
     # next round receives the previous ($e,$a,$b,$c,$d) as its
     # ($a,$b,$c,$d,$e). Rounds 60..69 reuse the BODY_20_39 generator.
 492 for ($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
 493 for (;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 494 for (;$i<60;$i++)       { &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
 495 for (;$i<70;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
     # $len==0 here means no further block follows: finish in .Ltail.
 496 $code.=<<___;
 497         tst             $len
 498         bz,pn           `$bits==32?"%icc":"%xcc"`,.Ltail
 499         nop
 500 ___
     # Rounds 70..79 when another block follows; they also fetch, align and
     # K-add that block's first sixteen words.
 501 for (;$i<80;$i++)       { &BODY_70_79($i,@V); unshift(@V,pop(@V)); }
 502 $code.=<<___;
 503         add             $A,$Actx,$Actx
 504         add             $B,$Bctx,$Bctx
 505         add             $C,$Cctx,$Cctx
 506         add             $D,$Dctx,$Dctx
 507         add             $E,$Ectx,$Ectx
 508         mov             5,$tmp0
 509         fxors           @X[13],@X[0],@X[0]
 510         mov             $Actx,$A
 511         mov             $Bctx,$B
 512         mov             $Cctx,$C
 513         mov             $Dctx,$D
 514         mov             $Ectx,$E
 515         alignaddr       %g0,$tmp0,%g0
 516         dec             1,$len
 517         ba              .Loop
 518         mov             $nXfer,$Xfer
 519
 520 .align  32
 521 .Ltail:
 522 ___
     # Rounds 70..79 of the last block, without any prefetching. @V is in
     # its initial order both here (80 rotations done) and at the branch
     # after round 69 (70 rotations), since both counts are multiples of 5.
 523 for($i=70;$i<80;$i++)   { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
 524 $code.=<<___;
 525         add     $A,$Actx,$Actx
 526         add     $B,$Bctx,$Bctx
 527         add     $C,$Cctx,$Cctx
 528         add     $D,$Dctx,$Dctx
 529         add     $E,$Ectx,$Ectx
 530
 531         st      $Actx,[$ctx+0]
 532         st      $Bctx,[$ctx+4]
 533         st      $Cctx,[$ctx+8]
 534         st      $Dctx,[$ctx+12]
 535         st      $Ectx,[$ctx+16]
 536
 537         ret
 538         restore
 539 .type   sha1_block_data_order,#function
 540 .size   sha1_block_data_order,(.-sha1_block_data_order)
 541 .asciz  "SHA1 block transform for SPARCv9a, CRYPTOGAMS by <appro\@openssl.org>"
 542 .align  4
 543 ___
 544
 545 # Purpose of these subroutines is to explicitly encode VIS instructions,
 546 # so that one can compile the module without having to specify VIS
 547 # extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 548 # Idea is to reserve for option to produce "universal" binary and let
 549 # programmer detect if current CPU is VIS capable at run-time.
     #
     # unvis($mnemonic,$rs1,$rs2,$rd) returns the replacement text for one
     # three-operand instruction:
     #   - ".word\t0x%08x !<original text>" when $mnemonic is listed in
     #     %visopf and every operand is an encodable %fN register;
     #   - the original "mnemonic\trs1,rs2,rd" text in every other case, so
     #     that the assembler handles (or rejects) the instruction itself.
 550 sub unvis {
 551 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 552 my ($ref,$opf);
 553 my %visopf = (  "fmul8ulx16"    => 0x037,
 554                 "faligndata"    => 0x048,
 555                 "fpadd32"       => 0x052,
 556                 "fxor"          => 0x06c,
 557                 "fxors"         => 0x06d        );
 558
 559     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 560
 561     if ($opf=$visopf{$mnemonic}) {
         # $_ aliases each operand in turn, so the register name is replaced
         # in place by the value of its 5-bit instruction field.
 562         foreach ($rs1,$rs2,$rd) {
             # Register numbers above 63 do not exist. They used to be folded
             # onto a low register by the "&31" below; leave such operands
             # unencoded instead so that the assembler reports them.
 563             return $ref if (!/%f([0-9]{1,2})/ || $1>63);
 564             $_=$1;
 565             if ($1>=32) {
                 # odd-numbered upper registers cannot be encoded
 566                 return $ref if ($1&1);
 567                 # re-encode for upper double register addressing
 568                 $_=($1|$1>>5)&31;
 569             }
 570         }
 571
 572         return  sprintf ".word\t0x%08x !%s",
 573                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 574                         $ref;
 575     } else {
 576         return $ref;
 577     }
 578 }
 579 sub unalignaddr {
     # Returns ".word\t0x%08x !<original text>" encoding an alignaddr whose
     # three operands are all integer registers %g/%o/%l/%i 0..7; returns
     # the original "mnemonic\trs1,rs2,rd" text if any operand is not.
 580 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 581 my $asm="$mnemonic\t$rs1,$rs2,$rd";
 582 my %first=( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 583 my @num;                # register numbers in rs1,rs2,rd order
 584
 585     for my $reg ($rs1,$rs2,$rd) {
 586         return $asm unless ($reg =~ /%([goli])([0-7])/);
 587         push @num,$first{$1}+$2;
 588     }
 589     my $insn=0x81b00300|$num[2]<<25|$num[0]<<14|$num[1];
 590     return sprintf ".word\t0x%08x !%s",$insn,$asm;
 591 }
 592
     # Post-processing of the generated text:
     #  1. every `...` span is replaced by the result of eval'ing it;
     #  2. three-operand instructions whose mnemonic starts with "f" and
     #     whose operands are %f registers are handed to unvis;
     #  3. alignaddr with three integer-register operands is handed to
     #     unalignaddr.
 593 $code =~ s/\`([^\`]*)\`/eval $1/gem;
 594 $code =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),(%f[0-9]{1,2}),(%f[0-9]{1,2})/
 595                 &unvis($1,$2,$3,$4)
 596           /gem;
 597 $code =~ s/\b(alignaddr)\s+(%[goli][0-7]),(%[goli][0-7]),(%[goli][0-7])/
 598                 &unalignaddr($1,$2,$3,$4)
 599           /gem;
 600 print $code;
     # Buffered write errors (e.g. a full disk) only surface when the handle
     # is closed, so a failed close must not be ignored.
 601 close STDOUT or die "error closing STDOUT: $!";