crypto/sha/asm/sha512-sparcv9.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 #
   9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
  10 # ====================================================================
  11
  12 # SHA256 performance improvement over compiler generated code varies
  13 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
  14 # build]. Just like in SHA1 module I aim to ensure scalability on
  15 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
  16
  17 # SHA512 on pre-T1 UltraSPARC.
  18 #
  19 # Performance is >75% better than 64-bit code generated by Sun C and
  20 # over 2x than 32-bit code. X[16] resides on stack, but access to it
  21 # is scheduled for L2 latency and staged through 32 least significant
  22 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
   23 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  24 # good [optimal coefficient is 50%].
  25 #
  26 # SHA512 on UltraSPARC T1.
  27 #
  28 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
  29 # because 64-bit code generator has the advantage of using 64-bit
  30 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
  31 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
  32 # code by 60%, not to mention that it doesn't suffer from severe decay
  33 # when running 4 times physical cores threads and that it leaves gcc
  34 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
  35 # performance is only 10% better, but overall throughput for maximum
  36 # amount of threads for given CPU exceeds corresponding one of SHA256
  37 # by 30% [again, optimal coefficient is 50%].
  38 #
  39 # (*)   Unlike pre-T1 UltraSPARC loads on T1 are executed strictly
   40 #       in-order, i.e. a load instruction has to complete before the next
   41 #       instruction in the given thread is executed, even if the latter is
  42 #       not dependent on load result! This means that on T1 two 32-bit
  43 #       loads are always slower than one 64-bit load. Once again this
  44 #       is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
  45 #       2x32-bit loads can be as fast as 1x64-bit ones.
  46 #
  47 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
  48 # which is 9.3x/11.1x faster than software. Multi-process benchmark
  49 # saturates at 11.5x single-process result on 8-core processor, or
  50 # ~11/16GBps per 2.85GHz socket.
  51
  52
  53 $bits=32;
  54 for (@ARGV)     { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
  55 if ($bits==64)  { $bias=2047; $frame=192; }
  56 else            { $bias=0;    $frame=112; }
  57
  58 $output=shift;
  59 open STDOUT,">$output";
   60
      # Flavour selection: an output name containing "512" generates SHA-512,
      # anything else SHA-256.  Each branch fixes the word size in bytes ($SZ),
      # the load/store/shift mnemonics for that width, the shift amounts of
      # the four sigma functions, the number of rounds, the low 12 bits of the
      # last round constant ($lastK, compared against in the round loop to
      # detect the final round), the input alignment granule ($align), the
      # extra stack space for X[16] ($locals) and the registers that hold the
      # working variables a..h (@V).
   61 if ($output =~ /512/) {
   62         $label="512";
   63         $SZ=8;
   64         $LD="ldx";              # load from memory
   65         $ST="stx";              # store to memory
   66         $SLL="sllx";            # shift left logical
   67         $SRL="srlx";            # shift right logical
   68         @Sigma0=(28,34,39);
   69         @Sigma1=(14,18,41);
   70         @sigma0=( 7, 1, 8);     # right shift first
   71         @sigma1=( 6,19,61);     # right shift first
   72         $lastK=0x817;
   73         $rounds=80;
   74         $align=4;
   75
   76         $locals=16*$SZ;         # X[16]
   77
   78         $A="%o0";
   79         $B="%o1";
   80         $C="%o2";
   81         $D="%o3";
   82         $E="%o4";
   83         $F="%o5";
   84         $G="%g1";
   85         $H="%o7";
   86         @V=($A,$B,$C,$D,$E,$F,$G,$H);
   87 } else {
   88         $label="256";
   89         $SZ=4;
   90         $LD="ld";               # load from memory
   91         $ST="st";               # store to memory
   92         $SLL="sll";             # shift left logical
   93         $SRL="srl";             # shift right logical
   94         @Sigma0=( 2,13,22);
   95         @Sigma1=( 6,11,25);
   96         @sigma0=( 3, 7,18);     # right shift first
   97         @sigma1=(10,17,19);     # right shift first
   98         $lastK=0x8f2;
   99         $rounds=64;
  100         $align=8;
  101
  102         $locals=0;              # X[16] is register resident
  103         @X=("%o0","%o1","%o2","%o3","%o4","%o5","%g1","%o7");
  104
  105         $A="%l0";
  106         $B="%l1";
  107         $C="%l2";
  108         $D="%l3";
  109         $E="%l4";
  110         $F="%l5";
  111         $G="%l6";
  112         $H="%l7";
  113         @V=($A,$B,$C,$D,$E,$F,$G,$H);
  114 }
      # Scratch registers common to both flavours.
  115 $T1="%g2";
  116 $tmp0="%g3";
  117 $tmp1="%g4";
  118 $tmp2="%g5";
  119
      # Registers used by the software path after its "save": context pointer,
      # input pointer, length (turned into an end pointer), the K table
      # address, and the two shift counts used to realign misaligned input.
  120 $ctx="%i0";
  121 $inp="%i1";
  122 $len="%i2";
  123 $Ktbl="%i3";
  124 $tmp31="%i4";
  125 $tmp32="%i5";
 126
  127 ########### SHA256
      # $Xload->($i,$a..$h) -- called by BODY_00_15 for rounds 0..15.  Appends
      # to $code the instructions that put the message word for round $i,
      # added to $h, into $T1.  For $i==0 it first emits the eight 64-bit
      # loads that bring the whole input block into @X and, unless the shift
      # count in $tmp31 is zero (branch to .Laligned), the sllx/srlx/or chain
      # that splices in a ninth load to realign the data.  Odd $i adds
      # @X[$i/2] to $h directly; even $i first shifts that register right by
      # 32.  Assigned only when $SZ==4 (SHA-256).
  128 $Xload = sub {
  129 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  130
  131     if ($i==0) {
  132 $code.=<<___;
  133         ldx     [$inp+0],@X[0]
  134         ldx     [$inp+16],@X[2]
  135         ldx     [$inp+32],@X[4]
  136         ldx     [$inp+48],@X[6]
  137         ldx     [$inp+8],@X[1]
  138         ldx     [$inp+24],@X[3]
  139         subcc   %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
  140         ldx     [$inp+40],@X[5]
  141         bz,pt   %icc,.Laligned
  142         ldx     [$inp+56],@X[7]
  143
  144         sllx    @X[0],$tmp31,@X[0]
  145         ldx     [$inp+64],$T1
  146 ___
      # One srlx/sllx/or triple per register boundary: the top of the next
      # register is shifted down and merged into the current one.
  147 for($j=0;$j<7;$j++)
  148 {   $code.=<<___;
  149         srlx    @X[$j+1],$tmp32,$tmp1
  150         sllx    @X[$j+1],$tmp31,@X[$j+1]
  151         or      $tmp1,@X[$j],@X[$j]
  152 ___
  153 }
  154 $code.=<<___;
  155         srlx    $T1,$tmp32,$T1
  156         or      $T1,@X[7],@X[7]
  157 .Laligned:
  158 ___
  159     }
  160
  161     if ($i&1) {
  162         $code.="\tadd   @X[$i/2],$h,$T1\n";
  163     } else {
  164         $code.="\tsrlx  @X[$i/2],32,$T1\n\tadd  $h,$T1,$T1\n";
  165     }
  166 } if ($SZ==4);
 167
 168 ########### SHA512
 169 $Xload = sub {
 170 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
 171 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1),"%l".eval((($i+1)*2)%8));
 172
 173 $code.=<<___ if ($i==0);
 174         ld      [$inp+0],%l0
 175         ld      [$inp+4],%l1
 176         ld      [$inp+8],%l2
 177         ld      [$inp+12],%l3
 178         ld      [$inp+16],%l4
 179         ld      [$inp+20],%l5
 180         ld      [$inp+24],%l6
 181         cmp     $tmp31,0
 182         ld      [$inp+28],%l7
 183 ___
 184 $code.=<<___ if ($i<15);
 185         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 186         add     $tmp31,32,$tmp0
 187         sllx    @pair[0],$tmp0,$tmp1
 188         `"ld    [$inp+".eval(32+0+$i*8)."],@pair[0]"    if ($i<12)`
 189         srlx    @pair[2],$tmp32,@pair[1]
 190         or      $tmp1,$tmp2,$tmp2
 191         or      @pair[1],$tmp2,$tmp2
 192         `"ld    [$inp+".eval(32+4+$i*8)."],@pair[1]"    if ($i<12)`
 193         add     $h,$tmp2,$T1
 194         $ST     $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
 195 ___
 196 $code.=<<___ if ($i==12);
 197         bnz,a,pn        %icc,.+8
 198         ld      [$inp+128],%l0
 199 ___
 200 $code.=<<___ if ($i==15);
 201         ld      [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
 202         sllx    @pair[1],$tmp31,$tmp2   ! Xload($i)
 203         add     $tmp31,32,$tmp0
 204         ld      [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
 205         sllx    @pair[0],$tmp0,$tmp1
 206         ld      [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
 207         srlx    @pair[2],$tmp32,@pair[1]
 208         or      $tmp1,$tmp2,$tmp2
 209         ld      [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
 210         or      @pair[1],$tmp2,$tmp2
 211         ld      [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
 212         add     $h,$tmp2,$T1
 213         $ST     $tmp2,[%sp+`$bias+$frame+$i*$SZ`]
 214         ld      [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
 215         ld      [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
 216         ld      [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
 217 ___
 218 } if ($SZ==8);
 219
  220 ########### common
      # BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h) appends one round to $code.
      # For $i<16 the flavour-specific $Xload emits the message-word fetch;
      # for later rounds the caller has left the schedule word in $T1 and only
      # $h is added to it.  The emitted code then computes
      #   T1 += Sigma1(e) + Ch(e,f,g) + K[$i]
      #   h   = Sigma0(a) + Maj(a,b,c)
      #   d  += T1;  h += T1
      # with every Sigma term assembled from shift-right/shift-left pairs
      # XORed together.  $tmp2 is left holding K[$i]; the round loop inspects
      # it to recognise the last round.
  221 sub BODY_00_15 {
  222 my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;
  223
  224     if ($i<16) {
  225         &$Xload(@_);
  226     } else {
  227         $code.="\tadd   $h,$T1,$T1\n";
  228     }
  229
  230 $code.=<<___;
  231         $SRL    $e,@Sigma1[0],$h        !! $i
  232         xor     $f,$g,$tmp2
  233         $SLL    $e,`$SZ*8-@Sigma1[2]`,$tmp1
  234         and     $e,$tmp2,$tmp2
  235         $SRL    $e,@Sigma1[1],$tmp0
  236         xor     $tmp1,$h,$h
  237         $SLL    $e,`$SZ*8-@Sigma1[1]`,$tmp1
  238         xor     $tmp0,$h,$h
  239         $SRL    $e,@Sigma1[2],$tmp0
  240         xor     $tmp1,$h,$h
  241         $SLL    $e,`$SZ*8-@Sigma1[0]`,$tmp1
  242         xor     $tmp0,$h,$h
  243         xor     $g,$tmp2,$tmp2          ! Ch(e,f,g)
  244         xor     $tmp1,$h,$tmp0          ! Sigma1(e)
  245
  246         $SRL    $a,@Sigma0[0],$h
  247         add     $tmp2,$T1,$T1
  248         $LD     [$Ktbl+`$i*$SZ`],$tmp2  ! K[$i]
  249         $SLL    $a,`$SZ*8-@Sigma0[2]`,$tmp1
  250         add     $tmp0,$T1,$T1
  251         $SRL    $a,@Sigma0[1],$tmp0
  252         xor     $tmp1,$h,$h
  253         $SLL    $a,`$SZ*8-@Sigma0[1]`,$tmp1
  254         xor     $tmp0,$h,$h
  255         $SRL    $a,@Sigma0[2],$tmp0
  256         xor     $tmp1,$h,$h
  257         $SLL    $a,`$SZ*8-@Sigma0[0]`,$tmp1
  258         xor     $tmp0,$h,$h
  259         xor     $tmp1,$h,$h             ! Sigma0(a)
  260
  261         or      $a,$b,$tmp0
  262         and     $a,$b,$tmp1
  263         and     $c,$tmp0,$tmp0
  264         or      $tmp0,$tmp1,$tmp1       ! Maj(a,b,c)
  265         add     $tmp2,$T1,$T1           ! +=K[$i]
  266         add     $tmp1,$h,$h
  267
  268         add     $T1,$d,$d
  269         add     $T1,$h,$h
  270 ___
  271 }
 272
 273 ########### SHA256
 274 $BODY_16_XX = sub {
 275 my $i=@_[0];
 276 my $xi;
 277
 278     if ($i&1) {
 279         $xi=$tmp32;
 280         $code.="\tsrlx  @X[(($i+1)/2)%8],32,$xi\n";
 281     } else {
 282         $xi=@X[(($i+1)/2)%8];
 283     }
 284 $code.=<<___;
 285         srl     $xi,@sigma0[0],$T1              !! Xupdate($i)
 286         sll     $xi,`32-@sigma0[2]`,$tmp1
 287         srl     $xi,@sigma0[1],$tmp0
 288         xor     $tmp1,$T1,$T1
 289         sll     $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 290         xor     $tmp0,$T1,$T1
 291         srl     $xi,@sigma0[2],$tmp0
 292         xor     $tmp1,$T1,$T1
 293 ___
 294     if ($i&1) {
 295         $xi=@X[(($i+14)/2)%8];
 296     } else {
 297         $xi=$tmp32;
 298         $code.="\tsrlx  @X[(($i+14)/2)%8],32,$xi\n";
 299     }
 300 $code.=<<___;
 301         srl     $xi,@sigma1[0],$tmp2
 302         xor     $tmp0,$T1,$T1                   ! T1=sigma0(X[i+1])
 303         sll     $xi,`32-@sigma1[2]`,$tmp1
 304         srl     $xi,@sigma1[1],$tmp0
 305         xor     $tmp1,$tmp2,$tmp2
 306         sll     $tmp1,`@sigma1[2]-@sigma1[1]`,$tmp1
 307         xor     $tmp0,$tmp2,$tmp2
 308         srl     $xi,@sigma1[2],$tmp0
 309         xor     $tmp1,$tmp2,$tmp2
 310 ___
 311     if ($i&1) {
 312         $xi=@X[($i/2)%8];
 313 $code.=<<___;
 314         srlx    @X[(($i+9)/2)%8],32,$tmp1       ! X[i+9]
 315         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 316         srl     @X[($i/2)%8],0,$tmp0
 317         add     $tmp2,$tmp1,$tmp1
 318         add     $xi,$T1,$T1                     ! +=X[i]
 319         xor     $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 320         add     $tmp1,$T1,$T1
 321
 322         srl     $T1,0,$T1
 323         or      $T1,@X[($i/2)%8],@X[($i/2)%8]
 324 ___
 325     } else {
 326         $xi=@X[(($i+9)/2)%8];
 327 $code.=<<___;
 328         srlx    @X[($i/2)%8],32,$tmp1           ! X[i]
 329         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
 330         add     $xi,$T1,$T1                     ! +=X[i+9]
 331         add     $tmp2,$tmp1,$tmp1
 332         srl     @X[($i/2)%8],0,@X[($i/2)%8]
 333         add     $tmp1,$T1,$T1
 334
 335         sllx    $T1,32,$tmp0
 336         or      $tmp0,@X[($i/2)%8],@X[($i/2)%8]
 337 ___
 338     }
 339     &BODY_00_15(@_);
 340 } if ($SZ==4);
 341
 342 ########### SHA512
 343 $BODY_16_XX = sub {
 344 my $i=@_[0];
 345 my @pair=("%l".eval(($i*2)%8),"%l".eval(($i*2)%8+1));
 346
 347 $code.=<<___;
 348         sllx    %l2,32,$tmp0            !! Xupdate($i)
 349         or      %l3,$tmp0,$tmp0
 350
 351         srlx    $tmp0,@sigma0[0],$T1
 352         ld      [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+0`],%l2
 353         sllx    $tmp0,`64-@sigma0[2]`,$tmp1
 354         ld      [%sp+`$bias+$frame+(($i+1+1)%16)*$SZ+4`],%l3
 355         srlx    $tmp0,@sigma0[1],$tmp0
 356         xor     $tmp1,$T1,$T1
 357         sllx    $tmp1,`@sigma0[2]-@sigma0[1]`,$tmp1
 358         xor     $tmp0,$T1,$T1
 359         srlx    $tmp0,`@sigma0[2]-@sigma0[1]`,$tmp0
 360         xor     $tmp1,$T1,$T1
 361         sllx    %l6,32,$tmp2
 362         xor     $tmp0,$T1,$T1           ! sigma0(X[$i+1])
 363         or      %l7,$tmp2,$tmp2
 364
 365         srlx    $tmp2,@sigma1[0],$tmp1
 366         ld      [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+0`],%l6
 367         sllx    $tmp2,`64-@sigma1[2]`,$tmp0
 368         ld      [%sp+`$bias+$frame+(($i+1+14)%16)*$SZ+4`],%l7
 369         srlx    $tmp2,@sigma1[1],$tmp2
 370         xor     $tmp0,$tmp1,$tmp1
 371         sllx    $tmp0,`@sigma1[2]-@sigma1[1]`,$tmp0
 372         xor     $tmp2,$tmp1,$tmp1
 373         srlx    $tmp2,`@sigma1[2]-@sigma1[1]`,$tmp2
 374         xor     $tmp0,$tmp1,$tmp1
 375         sllx    %l4,32,$tmp0
 376         xor     $tmp2,$tmp1,$tmp1       ! sigma1(X[$i+14])
 377         ld      [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+0`],%l4
 378         or      %l5,$tmp0,$tmp0
 379         ld      [%sp+`$bias+$frame+(($i+1+9)%16)*$SZ+4`],%l5
 380
 381         sllx    %l0,32,$tmp2
 382         add     $tmp1,$T1,$T1
 383         ld      [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+0`],%l0
 384         or      %l1,$tmp2,$tmp2
 385         add     $tmp0,$T1,$T1           ! +=X[$i+9]
 386         ld      [%sp+`$bias+$frame+(($i+1+0)%16)*$SZ+4`],%l1
 387         add     $tmp2,$T1,$T1           ! +=X[$i]
 388         $ST     $T1,[%sp+`$bias+$frame+($i%16)*$SZ`]
 389 ___
 390     &BODY_00_15(@_);
 391 } if ($SZ==8);
  392
      # 64-bit builds only: emit .register directives for %g2 and %g3, which
      # this module uses as scratch ($T1 and $tmp0).
  393 $code.=<<___ if ($bits==64);
  394 .register       %g2,#scratch
  395 .register       %g3,#scratch
  396 ___
      # Preamble of the generated file: the cpp include, the text section and
      # the 64-byte aligned label of the round-constant table (K256 or K512,
      # depending on $label).
  397 $code.=<<___;
  398 #include "sparc_arch.h"
  399
  400 .section        ".text",#alloc,#execinstr
  401
  402 .align  64
  403 K${label}:
  404 .type   K${label},#object
  405 ___
  406 if ($SZ==4) {           # SHA-256: 64 round constants, one .long each
  407 $code.=<<___;
  408         .long   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  409         .long   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  410         .long   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  411         .long   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  412         .long   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  413         .long   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  414         .long   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  415         .long   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  416         .long   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  417         .long   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  418         .long   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  419         .long   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  420         .long   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  421         .long   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  422         .long   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  423         .long   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
  424 ___
  425 } else {                # SHA-512: 80 round constants, each as two .long halves
  426 $code.=<<___;
  427         .long   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
  428         .long   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
  429         .long   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
  430         .long   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
  431         .long   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
  432         .long   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
  433         .long   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
  434         .long   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
  435         .long   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
  436         .long   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
  437         .long   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
  438         .long   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
  439         .long   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
  440         .long   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
  441         .long   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
  442         .long   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
  443         .long   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
  444         .long   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
  445         .long   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
  446         .long   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
  447         .long   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
  448         .long   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
  449         .long   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
  450         .long   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
  451         .long   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
  452         .long   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
  453         .long   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
  454         .long   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
  455         .long   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
  456         .long   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
  457         .long   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
  458         .long   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
  459         .long   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
  460         .long   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
  461         .long   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
  462         .long   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
  463         .long   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
  464         .long   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
  465         .long   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
  466         .long   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
  467 ___
  468 }
  469 $code.=<<___;           # close the K table, then emit the public entry point
  470 .size   K${label},.-K${label}
  471
  472 #ifdef __PIC__
  473 SPARC_PIC_THUNK(%g1)
  474 #endif
  475
  476 .globl  sha${label}_block_data_order
  477 .align  32
  478 sha${label}_block_data_order:
  479         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
  480         ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
  481
  482         andcc   %g1, CFR_SHA${label}, %g0
  483         be      .Lsoftware
  484         nop
  485 ___
      # The entry code emitted above tests word 1 of OPENSSL_sparcv9cap_P
      # against the CFR_SHA256/CFR_SHA512 mask: with the bit clear control
      # goes to .Lsoftware, otherwise it falls through into one of the two
      # hardware paths below, which run as leaf code (arguments still in
      # %o0-%o2, return via retl).
      #
      # SHA-512 hardware path: the state is loaded into %f0-%f14 and each
      # 128-byte block into %f16-%f46 ahead of the hand-encoded opcode
      # (.word 0x81b02860).  Input that is not 8-byte aligned is handled at
      # .Lhwunaligned with alignaddr/faligndata; the last doubleword read is
      # carried over to the next iteration in %f18.
  486 $code.=<<___ if ($SZ==8);               # SHA512
  487         ldd     [%o0 + 0x00], %f0       ! load context
  488         ldd     [%o0 + 0x08], %f2
  489         ldd     [%o0 + 0x10], %f4
  490         ldd     [%o0 + 0x18], %f6
  491         ldd     [%o0 + 0x20], %f8
  492         ldd     [%o0 + 0x28], %f10
  493         andcc   %o1, 0x7, %g0
  494         ldd     [%o0 + 0x30], %f12
  495         bne,pn  %icc, .Lhwunaligned
  496          ldd    [%o0 + 0x38], %f14
  497
  498 .Lhwaligned_loop:
  499         ldd     [%o1 + 0x00], %f16
  500         ldd     [%o1 + 0x08], %f18
  501         ldd     [%o1 + 0x10], %f20
  502         ldd     [%o1 + 0x18], %f22
  503         ldd     [%o1 + 0x20], %f24
  504         ldd     [%o1 + 0x28], %f26
  505         ldd     [%o1 + 0x30], %f28
  506         ldd     [%o1 + 0x38], %f30
  507         ldd     [%o1 + 0x40], %f32
  508         ldd     [%o1 + 0x48], %f34
  509         ldd     [%o1 + 0x50], %f36
  510         ldd     [%o1 + 0x58], %f38
  511         ldd     [%o1 + 0x60], %f40
  512         ldd     [%o1 + 0x68], %f42
  513         ldd     [%o1 + 0x70], %f44
  514         subcc   %o2, 1, %o2             ! done yet?
  515         ldd     [%o1 + 0x78], %f46
  516         add     %o1, 0x80, %o1
  517         prefetch [%o1 + 63], 20
  518         prefetch [%o1 + 64+63], 20
  519
  520         .word   0x81b02860              ! SHA512
  521
  522         bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwaligned_loop
  523         nop
  524
  525 .Lhwfinish:
  526         std     %f0, [%o0 + 0x00]       ! store context
  527         std     %f2, [%o0 + 0x08]
  528         std     %f4, [%o0 + 0x10]
  529         std     %f6, [%o0 + 0x18]
  530         std     %f8, [%o0 + 0x20]
  531         std     %f10, [%o0 + 0x28]
  532         std     %f12, [%o0 + 0x30]
  533         retl
  534          std    %f14, [%o0 + 0x38]
  535
  536 .align  16
  537 .Lhwunaligned:
  538         alignaddr %o1, %g0, %o1
  539
  540         ldd     [%o1 + 0x00], %f18
  541 .Lhwunaligned_loop:
  542         ldd     [%o1 + 0x08], %f20
  543         ldd     [%o1 + 0x10], %f22
  544         ldd     [%o1 + 0x18], %f24
  545         ldd     [%o1 + 0x20], %f26
  546         ldd     [%o1 + 0x28], %f28
  547         ldd     [%o1 + 0x30], %f30
  548         ldd     [%o1 + 0x38], %f32
  549         ldd     [%o1 + 0x40], %f34
  550         ldd     [%o1 + 0x48], %f36
  551         ldd     [%o1 + 0x50], %f38
  552         ldd     [%o1 + 0x58], %f40
  553         ldd     [%o1 + 0x60], %f42
  554         ldd     [%o1 + 0x68], %f44
  555         ldd     [%o1 + 0x70], %f46
  556         ldd     [%o1 + 0x78], %f48
  557         subcc   %o2, 1, %o2             ! done yet?
  558         ldd     [%o1 + 0x80], %f50
  559         add     %o1, 0x80, %o1
  560         prefetch [%o1 + 63], 20
  561         prefetch [%o1 + 64+63], 20
  562
  563         faligndata %f18, %f20, %f16
  564         faligndata %f20, %f22, %f18
  565         faligndata %f22, %f24, %f20
  566         faligndata %f24, %f26, %f22
  567         faligndata %f26, %f28, %f24
  568         faligndata %f28, %f30, %f26
  569         faligndata %f30, %f32, %f28
  570         faligndata %f32, %f34, %f30
  571         faligndata %f34, %f36, %f32
  572         faligndata %f36, %f38, %f34
  573         faligndata %f38, %f40, %f36
  574         faligndata %f40, %f42, %f38
  575         faligndata %f42, %f44, %f40
  576         faligndata %f44, %f46, %f42
  577         faligndata %f46, %f48, %f44
  578         faligndata %f48, %f50, %f46
  579
  580         .word   0x81b02860              ! SHA512
  581
  582         bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
  583         for     %f50, %f50, %f18        ! %f18=%f50
  584
  585         ba      .Lhwfinish
  586         nop
  587 ___
      # SHA-256 hardware path, same structure: state in %f0-%f7, each 64-byte
      # block in %f8-%f22, opcode .word 0x81b02840, and %f26 carried over
      # into %f10 in the unaligned loop.
  588 $code.=<<___ if ($SZ==4);               # SHA256
  589         ld      [%o0 + 0x00], %f0
  590         ld      [%o0 + 0x04], %f1
  591         ld      [%o0 + 0x08], %f2
  592         ld      [%o0 + 0x0c], %f3
  593         ld      [%o0 + 0x10], %f4
  594         ld      [%o0 + 0x14], %f5
  595         andcc   %o1, 0x7, %g0
  596         ld      [%o0 + 0x18], %f6
  597         bne,pn  %icc, .Lhwunaligned
  598          ld     [%o0 + 0x1c], %f7
  599
  600 .Lhwloop:
  601         ldd     [%o1 + 0x00], %f8
  602         ldd     [%o1 + 0x08], %f10
  603         ldd     [%o1 + 0x10], %f12
  604         ldd     [%o1 + 0x18], %f14
  605         ldd     [%o1 + 0x20], %f16
  606         ldd     [%o1 + 0x28], %f18
  607         ldd     [%o1 + 0x30], %f20
  608         subcc   %o2, 1, %o2             ! done yet?
  609         ldd     [%o1 + 0x38], %f22
  610         add     %o1, 0x40, %o1
  611         prefetch [%o1 + 63], 20
  612
  613         .word   0x81b02840              ! SHA256
  614
  615         bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwloop
  616         nop
  617
  618 .Lhwfinish:
  619         st      %f0, [%o0 + 0x00]       ! store context
  620         st      %f1, [%o0 + 0x04]
  621         st      %f2, [%o0 + 0x08]
  622         st      %f3, [%o0 + 0x0c]
  623         st      %f4, [%o0 + 0x10]
  624         st      %f5, [%o0 + 0x14]
  625         st      %f6, [%o0 + 0x18]
  626         retl
  627          st     %f7, [%o0 + 0x1c]
  628
  629 .align  8
  630 .Lhwunaligned:
  631         alignaddr %o1, %g0, %o1
  632
  633         ldd     [%o1 + 0x00], %f10
  634 .Lhwunaligned_loop:
  635         ldd     [%o1 + 0x08], %f12
  636         ldd     [%o1 + 0x10], %f14
  637         ldd     [%o1 + 0x18], %f16
  638         ldd     [%o1 + 0x20], %f18
  639         ldd     [%o1 + 0x28], %f20
  640         ldd     [%o1 + 0x30], %f22
  641         ldd     [%o1 + 0x38], %f24
  642         subcc   %o2, 1, %o2             ! done yet?
  643         ldd     [%o1 + 0x40], %f26
  644         add     %o1, 0x40, %o1
  645         prefetch [%o1 + 63], 20
  646
  647         faligndata %f10, %f12, %f8
  648         faligndata %f12, %f14, %f10
  649         faligndata %f14, %f16, %f12
  650         faligndata %f16, %f18, %f14
  651         faligndata %f18, %f20, %f16
  652         faligndata %f20, %f22, %f18
  653         faligndata %f22, %f24, %f20
  654         faligndata %f24, %f26, %f22
  655
  656         .word   0x81b02840              ! SHA256
  657
  658         bne,pt  `$bits==64?"%xcc":"%icc"`, .Lhwunaligned_loop
  659         for     %f26, %f26, %f10        ! %f10=%f26
  660
  661         ba      .Lhwfinish
  662         nop
  663 ___
  664 $code.=<<___;           # software path prologue
  665 .align  16
  666 .Lsoftware:
  667         save    %sp,`-$frame-$locals`,%sp
  668         and     $inp,`$align-1`,$tmp31
  669         sllx    $len,`log(16*$SZ)/log(2)`,$len
  670         andn    $inp,`$align-1`,$inp
  671         sll     $tmp31,3,$tmp31
  672         add     $inp,$len,$len
  673 ___
      # The prologue above opens a new register window, splits $inp into an
      # aligned pointer plus a misalignment expressed in bits ($tmp31), and
      # turns the block count in $len into an end-of-input pointer.
      # SHA-512 only: $tmp32 = 32 - $tmp31, the complementary shift count
      # used by $Xload.
  674 $code.=<<___ if ($SZ==8); # SHA512
  675         mov     32,$tmp32
  676         sub     $tmp32,$tmp31,$tmp32
  677 ___
      # Compute the address of the K table relative to the return address of
      # the call at .Lpic, then load the eight state words into the working
      # registers.
  678 $code.=<<___;
  679 .Lpic:  call    .+8
  680         add     %o7,K${label}-.Lpic,$Ktbl
  681
  682         $LD     [$ctx+`0*$SZ`],$A
  683         $LD     [$ctx+`1*$SZ`],$B
  684         $LD     [$ctx+`2*$SZ`],$C
  685         $LD     [$ctx+`3*$SZ`],$D
  686         $LD     [$ctx+`4*$SZ`],$E
  687         $LD     [$ctx+`5*$SZ`],$F
  688         $LD     [$ctx+`6*$SZ`],$G
  689         $LD     [$ctx+`7*$SZ`],$H
  690
  691 .Lloop:
  692 ___
      # Rounds 0..15 are emitted once, followed by a single copy of rounds
      # 16..31 that the generated code executes repeatedly.  @V is rotated
      # after every round, so each round sees the working variables renamed
      # rather than moved.  $i deliberately carries over from the first loop
      # into the second.
  693 for ($i=0;$i<16;$i++)   { &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
  694 $code.=".L16_xx:\n";
  695 for (;$i<32;$i++)       { &$BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
      # Branch back to .L16_xx until the low 12 bits of the round constant
      # just used (still in $tmp2) equal $lastK; $Ktbl is advanced by 16
      # entries in the delay slot.
  696 $code.=<<___;
  697         and     $tmp2,0xfff,$tmp2
  698         cmp     $tmp2,$lastK
  699         bne     .L16_xx
  700         add     $Ktbl,`16*$SZ`,$Ktbl    ! Ktbl+=16
  701
  702 ___
      # SHA-256: add the previous state to the working variables and store
      # the sums back to the context.
  703 $code.=<<___ if ($SZ==4); # SHA256
  704         $LD     [$ctx+`0*$SZ`],@X[0]
  705         $LD     [$ctx+`1*$SZ`],@X[1]
  706         $LD     [$ctx+`2*$SZ`],@X[2]
  707         $LD     [$ctx+`3*$SZ`],@X[3]
  708         $LD     [$ctx+`4*$SZ`],@X[4]
  709         $LD     [$ctx+`5*$SZ`],@X[5]
  710         $LD     [$ctx+`6*$SZ`],@X[6]
  711         $LD     [$ctx+`7*$SZ`],@X[7]
  712
  713         add     $A,@X[0],$A
  714         $ST     $A,[$ctx+`0*$SZ`]
  715         add     $B,@X[1],$B
  716         $ST     $B,[$ctx+`1*$SZ`]
  717         add     $C,@X[2],$C
  718         $ST     $C,[$ctx+`2*$SZ`]
  719         add     $D,@X[3],$D
  720         $ST     $D,[$ctx+`3*$SZ`]
  721         add     $E,@X[4],$E
  722         $ST     $E,[$ctx+`4*$SZ`]
  723         add     $F,@X[5],$F
  724         $ST     $F,[$ctx+`5*$SZ`]
  725         add     $G,@X[6],$G
  726         $ST     $G,[$ctx+`6*$SZ`]
  727         add     $H,@X[7],$H
  728         $ST     $H,[$ctx+`7*$SZ`]
  729 ___
      # SHA-512: same accumulation, but the previous state is read as 32-bit
      # halves into %l0-%l7 and merged with sllx/or, four words at a time.
  730 $code.=<<___ if ($SZ==8); # SHA512
  731         ld      [$ctx+`0*$SZ+0`],%l0
  732         ld      [$ctx+`0*$SZ+4`],%l1
  733         ld      [$ctx+`1*$SZ+0`],%l2
  734         ld      [$ctx+`1*$SZ+4`],%l3
  735         ld      [$ctx+`2*$SZ+0`],%l4
  736         ld      [$ctx+`2*$SZ+4`],%l5
  737         ld      [$ctx+`3*$SZ+0`],%l6
  738
  739         sllx    %l0,32,$tmp0
  740         ld      [$ctx+`3*$SZ+4`],%l7
  741         sllx    %l2,32,$tmp1
  742         or      %l1,$tmp0,$tmp0
  743         or      %l3,$tmp1,$tmp1
  744         add     $tmp0,$A,$A
  745         add     $tmp1,$B,$B
  746         $ST     $A,[$ctx+`0*$SZ`]
  747         sllx    %l4,32,$tmp2
  748         $ST     $B,[$ctx+`1*$SZ`]
  749         sllx    %l6,32,$T1
  750         or      %l5,$tmp2,$tmp2
  751         or      %l7,$T1,$T1
  752         add     $tmp2,$C,$C
  753         $ST     $C,[$ctx+`2*$SZ`]
  754         add     $T1,$D,$D
  755         $ST     $D,[$ctx+`3*$SZ`]
  756
  757         ld      [$ctx+`4*$SZ+0`],%l0
  758         ld      [$ctx+`4*$SZ+4`],%l1
  759         ld      [$ctx+`5*$SZ+0`],%l2
  760         ld      [$ctx+`5*$SZ+4`],%l3
  761         ld      [$ctx+`6*$SZ+0`],%l4
  762         ld      [$ctx+`6*$SZ+4`],%l5
  763         ld      [$ctx+`7*$SZ+0`],%l6
  764
  765         sllx    %l0,32,$tmp0
  766         ld      [$ctx+`7*$SZ+4`],%l7
  767         sllx    %l2,32,$tmp1
  768         or      %l1,$tmp0,$tmp0
  769         or      %l3,$tmp1,$tmp1
  770         add     $tmp0,$E,$E
  771         add     $tmp1,$F,$F
  772         $ST     $E,[$ctx+`4*$SZ`]
  773         sllx    %l4,32,$tmp2
  774         $ST     $F,[$ctx+`5*$SZ`]
  775         sllx    %l6,32,$T1
  776         or      %l5,$tmp2,$tmp2
  777         or      %l7,$T1,$T1
  778         add     $tmp2,$G,$G
  779         $ST     $G,[$ctx+`6*$SZ`]
  780         add     $T1,$H,$H
  781         $ST     $H,[$ctx+`7*$SZ`]
  782 ___
      # Advance to the next block, rewind $Ktbl in the delay slot and loop
      # until $inp reaches the end pointer kept in $len; then return and emit
      # the symbol type/size and the identification string.
  783 $code.=<<___;
  784         add     $inp,`16*$SZ`,$inp              ! advance inp
  785         cmp     $inp,$len
  786         bne     `$bits==64?"%xcc":"%icc"`,.Lloop
  787         sub     $Ktbl,`($rounds-16)*$SZ`,$Ktbl  ! rewind Ktbl
  788
  789         ret
  790         restore
  791 .type   sha${label}_block_data_order,#function
  792 .size   sha${label}_block_data_order,(.-sha${label}_block_data_order)
  793 .asciz  "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
  794 .align  4
  795 ___
 796
 797 # Purpose of these subroutines is to explicitly encode VIS instructions,
 798 # so that one can compile the module without having to specify VIS
 799 # extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 800 # Idea is to reserve for option to produce "universal" binary and let
 801 # programmer detect if current CPU is VIS capable at run-time.
 802 sub unvis {
 803 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 804 my $ref,$opf;
 805 my %visopf = (  "faligndata"    => 0x048,
 806                 "for"           => 0x07c        );
 807
 808     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 809
 810     if ($opf=$visopf{$mnemonic}) {
 811         foreach ($rs1,$rs2,$rd) {
 812             return $ref if (!/%f([0-9]{1,2})/);
 813             $_=$1;
 814             if ($1>=32) {
 815                 return $ref if ($1&1);
 816                 # re-encode for upper double register addressing
 817                 $_=($1|$1>>5)&31;
 818             }
 819         }
 820
 821         return  sprintf ".word\t0x%08x !%s",
 822                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 823                         $ref;
 824     } else {
 825         return $ref;
 826     }
 827 }
  828 sub unalignaddr {
      # unalignaddr($mnemonic,$rs1,$rs2,$rd) returns the line to emit for an
      # alignaddr instruction, encoded as
      #   .word 0x<opcode> !<original instruction>
      # Operands must be integer registers written as %g, %o, %l or %i
      # followed by a digit 0-7; %bias maps the letter to the base of that
      # register group so each operand becomes a number in 0..31.  If any
      # operand does not match, the instruction text is returned unchanged.
  829 my ($mnemonic,$rs1,$rs2,$rd)=@_;
  830 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
  831 my $ref="$mnemonic\t$rs1,$rs2,$rd";
  832
      # $_ aliases each operand, so the assignment replaces the register name
      # by its number in place.
  833     foreach ($rs1,$rs2,$rd) {
  834         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
  835         else                    { return $ref; }
  836     }
  837     return  sprintf ".word\t0x%08x !%s",
  838                     0x81b00300|$rd<<25|$rs1<<14|$rs2,
  839                     $ref;
  840 }
 841
 842 foreach (split("\n",$code)) {
 843         s/\`([^\`]*)\`/eval $1/ge;
 844
 845         s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
 846                 &unvis($1,$2,$3,$4)
 847          /ge;
 848         s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 849                 &unalignaddr($1,$2,$3,$4)
 850          /ge;
 851
 852         print $_,"\n";
 853 }
 854
 855 close STDOUT;