crypto/sha/asm/sha512-sparcv9.pl

   1 #! /usr/bin/env perl
   2 # Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
   3 #
   4 # Licensed under the Apache License 2.0 (the "License").  You may not use
   5 # this file except in compliance with the License.  You can obtain a copy
   6 # in the file LICENSE in the source distribution or at
   7 # https://www.openssl.org/source/license.html
   8
   9
  10 # ====================================================================
  11 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  12 # project. The module is, however, dual licensed under OpenSSL and
  13 # CRYPTOGAMS licenses depending on where you obtain it. For further
  14 # details see http://www.openssl.org/~appro/cryptogams/.
  15 #
  16 # Hardware SPARC T4 support by David S. Miller
  17 # ====================================================================
  18
  19 # SHA256 performance improvement over compiler generated code varies
  20 # from 40% for Sun C [32-bit build] to 70% for gcc [3.3, 64-bit
  21 # build]. Just like in SHA1 module I aim to ensure scalability on
  22 # UltraSPARC T1 by packing X[16] to 8 64-bit registers.
  23
  24 # SHA512 on pre-T1 UltraSPARC.
  25 #
  26 # Performance is >75% better than 64-bit code generated by Sun C and
  27 # over 2x better than 32-bit code. X[16] resides on stack, but access to it
  28 # is scheduled for L2 latency and staged through 32 least significant
  29 # bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
  30 # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  31 # good [optimal coefficient is 50%].
  32 #
  33 # SHA512 on UltraSPARC T1.
  34 #
  35 # It's not any faster than 64-bit code generated by Sun C 5.8. This is
  36 # because 64-bit code generator has the advantage of using 64-bit
  37 # loads(*) to access X[16], which I consciously traded for 32-/64-bit
  38 # ABI duality [as per above]. But it surpasses 32-bit Sun C generated
  39 # code by 60%, not to mention that it doesn't suffer from severe decay
  40 # when running 4 times physical cores threads and that it leaves gcc
  41 # [3.4] behind by over 4x factor! If compared to SHA256, single thread
  42 # performance is only 10% better, but overall throughput for maximum
  43 # amount of threads for given CPU exceeds corresponding one of SHA256
  44 # by 30% [again, optimal coefficient is 50%].
  45 #
  46 # (*)   Unlike pre-T1 UltraSPARC, loads on T1 are executed strictly
  47 #       in-order, i.e. a load instruction has to complete before the next
  48 #       instruction in the given thread is executed, even if the latter
  49 #       does not depend on the load result! This means that on T1 two 32-bit
  50 #       loads are always slower than one 64-bit load. Once again this
  51 #       is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
  52 #       2x32-bit loads can be as fast as 1x64-bit ones.
  53 #
  54 # SPARC T4 SHA256/512 hardware achieves 3.17/2.01 cycles per byte,
  55 # which is 9.3x/11.1x faster than software. Multi-process benchmark
  56 # saturates at 11.5x single-process result on 8-core processor, or
  57 # ~11/16GBps per 2.85GHz socket.
  58
  # $output is the last argument if it looks like a file (it has an extension);
# without one the generated assembly goes to the inherited STDOUT.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;

# Three-argument open keeps the file name from being parsed as a mode
# string, and the explicit check reports a failure right away, naming the
# file, instead of leaving STDOUT unusable for everything printed later.
if ($output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}
  63
  # Pick the algorithm from the output file name: a name containing "512"
# selects SHA-512, anything else (including no file name at all) SHA-256.
# Everything below is parameterised on the variables set here.
if ($output =~ /512/) {
    $label  = "512";
    $SZ     = 8;                                # word size in bytes
    $rounds = 80;
    $lastK  = 0x817;                            # low 12 bits of the last K512 entry
    $align  = 4;
    ($LD,  $ST)  = ("ldx",  "stx");             # load from / store to memory
    ($SLL, $SRL) = ("sllx", "srlx");            # shift left / right logical
    @Sigma0 = (28, 34, 39);
    @Sigma1 = (14, 18, 41);
    @sigma0 = ( 7,  1,  8);                     # right shift first
    @sigma1 = ( 6, 19, 61);                     # right shift first

    $locals = 16 * $SZ;                         # X[16]

    # working variables a..h
    @V = ("%o0", "%o1", "%o2", "%o3", "%o4", "%o5", "%g1", "%o7");
    ($A, $B, $C, $D, $E, $F, $G, $H) = @V;
}
else {
    $label  = "256";
    $SZ     = 4;                                # word size in bytes
    $rounds = 64;
    $lastK  = 0x8f2;                            # low 12 bits of the last K256 entry
    $align  = 8;
    ($LD,  $ST)  = ("ld",  "st");               # load from / store to memory
    ($SLL, $SRL) = ("sll", "srl");              # shift left / right logical
    @Sigma0 = ( 2, 13, 22);
    @Sigma1 = ( 6, 11, 25);
    @sigma0 = ( 3,  7, 18);                     # right shift first
    @sigma1 = (10, 17, 19);                     # right shift first

    $locals = 0;                                # X[16] is register resident
    @X = ("%o0", "%o1", "%o2", "%o3", "%o4", "%o5", "%g1", "%o7");

    # working variables a..h
    @V = map { "%l$_" } 0 .. 7;
    ($A, $B, $C, $D, $E, $F, $G, $H) = @V;
}

# scratch registers shared by both variants
($T1, $tmp0, $tmp1, $tmp2) = ("%g2", "%g3", "%g4", "%g5");

# incoming arguments and further temporaries, %i0..%i5
($ctx, $inp, $len, $Ktbl, $tmp31, $tmp32) = map { "%i$_" } 0 .. 5;
 129
 ########### SHA256
# $Xload->($i,$a,...,$h) emits the code that produces X[$i]+h in $T1 for
# rounds 0..15.  Two 32-bit message words share each 64-bit register of @X:
# even rounds take the upper half (srlx by 32), odd rounds add the register
# as it is.  Round 0 additionally loads the whole 64-byte block and, when the
# subcc result is non-zero (input not aligned), merges in a ninth doubleword.
if ($SZ == 4) {
    $Xload = sub {
        my $i = $_[0];                  # round number
        my $h = $_[8];                  # register holding 'h' in this round

        if ($i == 0) {
            # Load the block; the last ldx sits right after the branch.
            $code .= <<___;
        ldx     [$inp+0],$X[0]
        ldx     [$inp+16],$X[2]
        ldx     [$inp+32],$X[4]
        ldx     [$inp+48],$X[6]
        ldx     [$inp+8],$X[1]
        ldx     [$inp+24],$X[3]
        subcc   %g0,$tmp31,$tmp32 ! should be 64-$tmp31, but -$tmp31 works too
        ldx     [$inp+40],$X[5]
        bz,pt   %icc,.Laligned
        ldx     [$inp+56],$X[7]

        sllx    $X[0],$tmp31,$X[0]
        ldx     [$inp+64],$T1
___
            # Shift every doubleword left and pull in the top bits of its
            # successor.
            foreach my $k (0 .. 6) {
                $code .= <<___;
        srlx    $X[$k+1],$tmp32,$tmp1
        sllx    $X[$k+1],$tmp31,$X[$k+1]
        or      $tmp1,$X[$k],$X[$k]
___
            }
            # The last doubleword is completed from the extra load in $T1.
            $code .= <<___;
        srlx    $T1,$tmp32,$T1
        or      $T1,$X[7],$X[7]
.Laligned:
___
        }

        my $xreg = $X[int($i / 2)];     # register that carries X[$i]
        if ($i & 1) {
            $code .= "\tadd   $xreg,$h,$T1\n";
        } else {
            $code .= "\tsrlx  $xreg,32,$T1\n\tadd  $h,$T1,$T1\n";
        }
    };
}
 170
 ########### SHA512
# $Xload->($i,$a,...,$h) emits the code that assembles the 64-bit message
# word X[$i] for rounds 0..15, stores it in the X[16] area on the stack and
# leaves X[$i]+h in $T1.  Input is staged through %l0-%l7 as 32-bit halves;
# each word is built from its two halves plus the upper half of the word
# that follows, shifted by $tmp31/$tmp32 to compensate for misalignment.
if ($SZ == 8) {
    $Xload = sub {
        my $i = $_[0];                  # round number
        my $h = $_[8];                  # register holding 'h' in this round

        my $hi  = "%l" . (($i*2) % 8);          # upper half of X[$i]
        my $lo  = "%l" . (($i*2) % 8 + 1);      # lower half of X[$i]
        my $nxt = "%l" . ((($i+1)*2) % 8);      # upper half of the next word

        if ($i == 0) {
            # First eight halves of the block; cmp records whether the input
            # is misaligned for the conditional load emitted in round 12.
            $code .= <<___;
        ld      [$inp+0],%l0
        ld      [$inp+4],%l1
        ld      [$inp+8],%l2
        ld      [$inp+12],%l3
        ld      [$inp+16],%l4
        ld      [$inp+20],%l5
        ld      [$inp+24],%l6
        cmp     $tmp31,0
        ld      [$inp+28],%l7
___
        }

        if ($i < 15) {
            # The two backticked lines are Perl expressions evaluated during
            # post-processing: they refill the consumed pair from the input
            # for rounds below 12 and expand to nothing otherwise.
            $code .= <<___;
        sllx    $lo,$tmp31,$tmp2   ! Xload($i)
        add     $tmp31,32,$tmp0
        sllx    $hi,$tmp0,$tmp1
        `"ld    [$inp+".eval(32+0+$i*8)."],$hi"    if ($i<12)`
        srlx    $nxt,$tmp32,$lo
        or      $tmp1,$tmp2,$tmp2
        or      $lo,$tmp2,$tmp2
        `"ld    [$inp+".eval(32+4+$i*8)."],$lo"    if ($i<12)`
        add     $h,$tmp2,$T1
        $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
___
        }

        if ($i == 12) {
            # Annulled branch on the round-0 cmp: the extra word past the
            # block is loaded only when the input is misaligned.
            $code .= <<___;
        bnz,a,pn        %icc,.+8
        ld      [$inp+128],%l0
___
        }

        if ($i == 15) {
            # Last word of the block; interleaved with it, %l0-%l7 are
            # reloaded from the stack with the X[] entries that the first
            # schedule update (round 16) consumes.
            $code .= <<___;
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
        sllx    $lo,$tmp31,$tmp2   ! Xload($i)
        add     $tmp31,32,$tmp0
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
        sllx    $hi,$tmp0,$tmp1
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
        srlx    $nxt,$tmp32,$lo
        or      $tmp1,$tmp2,$tmp2
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5
        or      $lo,$tmp2,$tmp2
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
        add     $h,$tmp2,$T1
        $ST     $tmp2,[%sp+STACK_BIAS+STACK_FRAME+`$i*$SZ`]
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
___
        }
    };
}
 222
 ########### common
# BODY_00_15($i,$a,$b,$c,$d,$e,$f,$g,$h) appends one compression round to
# $code.  $i is the round number, $a..$h name the registers that currently
# hold the working variables.  For $i < 16 the message word is obtained via
# $Xload; for later rounds the caller has already placed it in $T1.
# The emitted code overwrites $h with T1+Sigma0(a)+Maj(a,b,c) and adds T1 to
# $d.  Rotations are built from shift pairs; backticked shift counts are
# evaluated during post-processing.
sub BODY_00_15 {
    my ($i,$a,$b,$c,$d,$e,$f,$g,$h)=@_;

    # T1 = h + X[i]
    if ($i >= 16) {
        $code .= "\tadd   $h,$T1,$T1\n";
    } else {
        $Xload->(@_);
    }

    # Sigma1(e) into $tmp0 and Ch(e,f,g) into $tmp2; $h serves as scratch.
    $code .= <<___;
        $SRL    $e,$Sigma1[0],$h        !! $i
        xor     $f,$g,$tmp2
        $SLL    $e,`$SZ*8-$Sigma1[2]`,$tmp1
        and     $e,$tmp2,$tmp2
        $SRL    $e,$Sigma1[1],$tmp0
        xor     $tmp1,$h,$h
        $SLL    $e,`$SZ*8-$Sigma1[1]`,$tmp1
        xor     $tmp0,$h,$h
        $SRL    $e,$Sigma1[2],$tmp0
        xor     $tmp1,$h,$h
        $SLL    $e,`$SZ*8-$Sigma1[0]`,$tmp1
        xor     $tmp0,$h,$h
        xor     $g,$tmp2,$tmp2          ! Ch(e,f,g)
        xor     $tmp1,$h,$tmp0          ! Sigma1(e)

___

    # Sigma0(a) into $h while T1 accumulates Ch and Sigma1; K[i] is fetched
    # into $tmp2.
    $code .= <<___;
        $SRL    $a,$Sigma0[0],$h
        add     $tmp2,$T1,$T1
        $LD     [$Ktbl+`$i*$SZ`],$tmp2  ! K[$i]
        $SLL    $a,`$SZ*8-$Sigma0[2]`,$tmp1
        add     $tmp0,$T1,$T1
        $SRL    $a,$Sigma0[1],$tmp0
        xor     $tmp1,$h,$h
        $SLL    $a,`$SZ*8-$Sigma0[1]`,$tmp1
        xor     $tmp0,$h,$h
        $SRL    $a,$Sigma0[2],$tmp0
        xor     $tmp1,$h,$h
        $SLL    $a,`$SZ*8-$Sigma0[0]`,$tmp1
        xor     $tmp0,$h,$h
        xor     $tmp1,$h,$h             ! Sigma0(a)

___

    # Maj(a,b,c), then fold T1 into d and h.
    $code .= <<___;
        or      $a,$b,$tmp0
        and     $a,$b,$tmp1
        and     $c,$tmp0,$tmp0
        or      $tmp0,$tmp1,$tmp1       ! Maj(a,b,c)
        add     $tmp2,$T1,$T1           ! +=K[$i]
        add     $tmp1,$h,$h

        add     $T1,$d,$d
        add     $T1,$h,$h
___
}
 275
 ########### SHA256
# $BODY_16_XX->($i,$a,...,$h) emits the message schedule update for round
# $i >= 16, sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i], writes the new
# word back into its half of the 64-bit @X register and leaves it in $T1,
# then emits the round itself through BODY_00_15.  Even-numbered words live
# in the upper half of a register and are first extracted with srlx.
if ($SZ == 4) {
    $BODY_16_XX = sub {
        my $i   = $_[0];
        my $odd = $i & 1;
        # Registers carrying the four schedule inputs; '%' truncates the
        # halved index to an integer.
        my $x0  = $X[($i/2)%8];
        my $x1  = $X[(($i+1)/2)%8];
        my $x9  = $X[(($i+9)/2)%8];
        my $x14 = $X[(($i+14)/2)%8];
        my $xi;

        # sigma0(X[i+1]) -> $T1
        if ($odd) {
            $xi = $tmp32;
            $code .= "\tsrlx  $x1,32,$xi\n";
        } else {
            $xi = $x1;
        }
        $code .= <<___;
        srl     $xi,$sigma0[0],$T1              !! Xupdate($i)
        sll     $xi,`32-$sigma0[2]`,$tmp1
        srl     $xi,$sigma0[1],$tmp0
        xor     $tmp1,$T1,$T1
        sll     $tmp1,`$sigma0[2]-$sigma0[1]`,$tmp1
        xor     $tmp0,$T1,$T1
        srl     $xi,$sigma0[2],$tmp0
        xor     $tmp1,$T1,$T1
___

        # sigma1(X[i+14]) -> $tmp2
        if ($odd) {
            $xi = $x14;
        } else {
            $xi = $tmp32;
            $code .= "\tsrlx  $x14,32,$xi\n";
        }
        $code .= <<___;
        srl     $xi,$sigma1[0],$tmp2
        xor     $tmp0,$T1,$T1                   ! T1=sigma0(X[i+1])
        sll     $xi,`32-$sigma1[2]`,$tmp1
        srl     $xi,$sigma1[1],$tmp0
        xor     $tmp1,$tmp2,$tmp2
        sll     $tmp1,`$sigma1[2]-$sigma1[1]`,$tmp1
        xor     $tmp0,$tmp2,$tmp2
        srl     $xi,$sigma1[2],$tmp0
        xor     $tmp1,$tmp2,$tmp2
___

        # Add X[i+9] and X[i], then replace X[i] in its register half.
        if ($odd) {
            # X[i] is the lower half: clear it and or the new value in.
            $code .= <<___;
        srlx    $x9,32,$tmp1       ! X[i+9]
        xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
        srl     $x0,0,$tmp0
        add     $tmp2,$tmp1,$tmp1
        add     $x0,$T1,$T1                     ! +=X[i]
        xor     $tmp0,$x0,$x0
        add     $tmp1,$T1,$T1

        srl     $T1,0,$T1
        or      $T1,$x0,$x0
___
        } else {
            # X[i] is the upper half: keep the lower half, shift the new
            # value up by 32 and or it in.
            $code .= <<___;
        srlx    $x0,32,$tmp1           ! X[i]
        xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
        add     $x9,$T1,$T1                     ! +=X[i+9]
        add     $tmp2,$tmp1,$tmp1
        srl     $x0,0,$x0
        add     $tmp1,$T1,$T1

        sllx    $T1,32,$tmp0
        or      $tmp0,$x0,$x0
___
        }

        BODY_00_15(@_);
    };
}
 344
 ########### SHA512
# $BODY_16_XX->($i,$a,...,$h) emits the message schedule update for round
# $i >= 16, sigma0(X[i+1]) + sigma1(X[i+14]) + X[i+9] + X[i], stores the new
# word over X[i%16] on the stack and leaves it in $T1, then emits the round
# itself through BODY_00_15.
#
# The four inputs are expected as 32-bit halves in fixed registers:
# %l0/%l1 = X[i], %l2/%l3 = X[i+1], %l4/%l5 = X[i+9], %l6/%l7 = X[i+14].
# Each pair is reloaded from the stack for the following round as soon as it
# has been merged into a 64-bit temporary.
#
# The original also computed a local @pair of register names that nothing
# used; it has been dropped.
if ($SZ == 8) {
    $BODY_16_XX = sub {
        my $i = $_[0];

        $code .= <<___;
        sllx    %l2,32,$tmp0            !! Xupdate($i)
        or      %l3,$tmp0,$tmp0

        srlx    $tmp0,$sigma0[0],$T1
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+0`],%l2
        sllx    $tmp0,`64-$sigma0[2]`,$tmp1
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+1)%16)*$SZ+4`],%l3
        srlx    $tmp0,$sigma0[1],$tmp0
        xor     $tmp1,$T1,$T1
        sllx    $tmp1,`$sigma0[2]-$sigma0[1]`,$tmp1
        xor     $tmp0,$T1,$T1
        srlx    $tmp0,`$sigma0[2]-$sigma0[1]`,$tmp0
        xor     $tmp1,$T1,$T1
        sllx    %l6,32,$tmp2
        xor     $tmp0,$T1,$T1           ! sigma0(X[$i+1])
        or      %l7,$tmp2,$tmp2

        srlx    $tmp2,$sigma1[0],$tmp1
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+0`],%l6
        sllx    $tmp2,`64-$sigma1[2]`,$tmp0
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+14)%16)*$SZ+4`],%l7
        srlx    $tmp2,$sigma1[1],$tmp2
        xor     $tmp0,$tmp1,$tmp1
        sllx    $tmp0,`$sigma1[2]-$sigma1[1]`,$tmp0
        xor     $tmp2,$tmp1,$tmp1
        srlx    $tmp2,`$sigma1[2]-$sigma1[1]`,$tmp2
        xor     $tmp0,$tmp1,$tmp1
        sllx    %l4,32,$tmp0
        xor     $tmp2,$tmp1,$tmp1       ! sigma1(X[$i+14])
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+0`],%l4
        or      %l5,$tmp0,$tmp0
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+9)%16)*$SZ+4`],%l5

        sllx    %l0,32,$tmp2
        add     $tmp1,$T1,$T1
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+0`],%l0
        or      %l1,$tmp2,$tmp2
        add     $tmp0,$T1,$T1           ! +=X[$i+9]
        ld      [%sp+STACK_BIAS+STACK_FRAME+`(($i+1+0)%16)*$SZ+4`],%l1
        add     $tmp2,$T1,$T1           ! +=X[$i]
        $ST     $T1,[%sp+STACK_BIAS+STACK_FRAME+`($i%16)*$SZ`]
___
        BODY_00_15(@_);
    };
}
 395
 # Assembler preamble: pull in sparc_arch.h, declare %g2/%g3 as scratch for
# the 64-bit ABI and open the 64-byte aligned constant table K256/K512.
$code .= <<___;
#ifndef __ASSEMBLER__
# define __ASSEMBLER__ 1
#endif
#include "crypto/sparc_arch.h"

#ifdef __arch64__
.register       %g2,#scratch
.register       %g3,#scratch
#endif

.section        ".text",#alloc,#execinstr

.align  64
K${label}:
.type   K${label},#object
___

# SHA-256 round constants: 64 32-bit words.
$code .= <<___ if ($SZ == 4);
        .long   0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
        .long   0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
        .long   0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
        .long   0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
        .long   0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
        .long   0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
        .long   0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
        .long   0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
        .long   0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
        .long   0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
        .long   0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
        .long   0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
        .long   0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
        .long   0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
        .long   0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
        .long   0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
___

# SHA-512 round constants: 80 64-bit words, each written as two .long values.
$code .= <<___ unless ($SZ == 4);
        .long   0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
        .long   0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
        .long   0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
        .long   0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
        .long   0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
        .long   0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
        .long   0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
        .long   0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
        .long   0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
        .long   0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
        .long   0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
        .long   0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
        .long   0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
        .long   0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
        .long   0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
        .long   0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
        .long   0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
        .long   0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
        .long   0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
        .long   0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
        .long   0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
        .long   0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
        .long   0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
        .long   0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
        .long   0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
        .long   0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
        .long   0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
        .long   0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
        .long   0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
        .long   0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
        .long   0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
        .long   0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
        .long   0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
        .long   0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
        .long   0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
        .long   0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
        .long   0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
        .long   0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
        .long   0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
        .long   0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
___
 # Close the K table and emit the public entry point.  It reads
# OPENSSL_sparcv9cap_P[1] and branches to .Lsoftware unless the CFR_SHA
# bit for this variant is set; otherwise it falls through into the
# hardware path emitted next.
$code .= <<___;
.size   K${label},.-K${label}

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl  sha${label}_block_data_order
.align  32
sha${label}_block_data_order:
        SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
        ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]

        andcc   %g1, CFR_SHA${label}, %g0
        be      .Lsoftware
        nop
___

# Hardware paths.  The function is still a leaf here, so the arguments are
# in %o0 (context), %o1 (input) and %o2 (number of blocks).  The state is
# kept in floating-point registers, one block is loaded per iteration and
# the SHA opcode is issued as a raw .word.  Input that is not 8-byte aligned
# is realigned with alignaddr/faligndata, carrying the last doubleword over
# to the next iteration in the delay slot of the loop branch.
if ($SZ == 8) {                         # SHA512
    $code .= <<___;
        ldd     [%o0 + 0x00], %f0       ! load context
        ldd     [%o0 + 0x08], %f2
        ldd     [%o0 + 0x10], %f4
        ldd     [%o0 + 0x18], %f6
        ldd     [%o0 + 0x20], %f8
        ldd     [%o0 + 0x28], %f10
        andcc   %o1, 0x7, %g0
        ldd     [%o0 + 0x30], %f12
        bne,pn  %icc, .Lhwunaligned
         ldd    [%o0 + 0x38], %f14

.Lhwaligned_loop:
        ldd     [%o1 + 0x00], %f16
        ldd     [%o1 + 0x08], %f18
        ldd     [%o1 + 0x10], %f20
        ldd     [%o1 + 0x18], %f22
        ldd     [%o1 + 0x20], %f24
        ldd     [%o1 + 0x28], %f26
        ldd     [%o1 + 0x30], %f28
        ldd     [%o1 + 0x38], %f30
        ldd     [%o1 + 0x40], %f32
        ldd     [%o1 + 0x48], %f34
        ldd     [%o1 + 0x50], %f36
        ldd     [%o1 + 0x58], %f38
        ldd     [%o1 + 0x60], %f40
        ldd     [%o1 + 0x68], %f42
        ldd     [%o1 + 0x70], %f44
        subcc   %o2, 1, %o2             ! done yet?
        ldd     [%o1 + 0x78], %f46
        add     %o1, 0x80, %o1
        prefetch [%o1 + 63], 20
        prefetch [%o1 + 64+63], 20

        .word   0x81b02860              ! SHA512

        bne,pt  SIZE_T_CC, .Lhwaligned_loop
        nop

.Lhwfinish:
        std     %f0, [%o0 + 0x00]       ! store context
        std     %f2, [%o0 + 0x08]
        std     %f4, [%o0 + 0x10]
        std     %f6, [%o0 + 0x18]
        std     %f8, [%o0 + 0x20]
        std     %f10, [%o0 + 0x28]
        std     %f12, [%o0 + 0x30]
        retl
         std    %f14, [%o0 + 0x38]

.align  16
.Lhwunaligned:
        alignaddr %o1, %g0, %o1

        ldd     [%o1 + 0x00], %f18
.Lhwunaligned_loop:
        ldd     [%o1 + 0x08], %f20
        ldd     [%o1 + 0x10], %f22
        ldd     [%o1 + 0x18], %f24
        ldd     [%o1 + 0x20], %f26
        ldd     [%o1 + 0x28], %f28
        ldd     [%o1 + 0x30], %f30
        ldd     [%o1 + 0x38], %f32
        ldd     [%o1 + 0x40], %f34
        ldd     [%o1 + 0x48], %f36
        ldd     [%o1 + 0x50], %f38
        ldd     [%o1 + 0x58], %f40
        ldd     [%o1 + 0x60], %f42
        ldd     [%o1 + 0x68], %f44
        ldd     [%o1 + 0x70], %f46
        ldd     [%o1 + 0x78], %f48
        subcc   %o2, 1, %o2             ! done yet?
        ldd     [%o1 + 0x80], %f50
        add     %o1, 0x80, %o1
        prefetch [%o1 + 63], 20
        prefetch [%o1 + 64+63], 20

        faligndata %f18, %f20, %f16
        faligndata %f20, %f22, %f18
        faligndata %f22, %f24, %f20
        faligndata %f24, %f26, %f22
        faligndata %f26, %f28, %f24
        faligndata %f28, %f30, %f26
        faligndata %f30, %f32, %f28
        faligndata %f32, %f34, %f30
        faligndata %f34, %f36, %f32
        faligndata %f36, %f38, %f34
        faligndata %f38, %f40, %f36
        faligndata %f40, %f42, %f38
        faligndata %f42, %f44, %f40
        faligndata %f44, %f46, %f42
        faligndata %f46, %f48, %f44
        faligndata %f48, %f50, %f46

        .word   0x81b02860              ! SHA512

        bne,pt  SIZE_T_CC, .Lhwunaligned_loop
        for     %f50, %f50, %f18        ! %f18=%f50

        ba      .Lhwfinish
        nop
___
}
elsif ($SZ == 4) {                      # SHA256
    $code .= <<___;
        ld      [%o0 + 0x00], %f0
        ld      [%o0 + 0x04], %f1
        ld      [%o0 + 0x08], %f2
        ld      [%o0 + 0x0c], %f3
        ld      [%o0 + 0x10], %f4
        ld      [%o0 + 0x14], %f5
        andcc   %o1, 0x7, %g0
        ld      [%o0 + 0x18], %f6
        bne,pn  %icc, .Lhwunaligned
         ld     [%o0 + 0x1c], %f7

.Lhwloop:
        ldd     [%o1 + 0x00], %f8
        ldd     [%o1 + 0x08], %f10
        ldd     [%o1 + 0x10], %f12
        ldd     [%o1 + 0x18], %f14
        ldd     [%o1 + 0x20], %f16
        ldd     [%o1 + 0x28], %f18
        ldd     [%o1 + 0x30], %f20
        subcc   %o2, 1, %o2             ! done yet?
        ldd     [%o1 + 0x38], %f22
        add     %o1, 0x40, %o1
        prefetch [%o1 + 63], 20

        .word   0x81b02840              ! SHA256

        bne,pt  SIZE_T_CC, .Lhwloop
        nop

.Lhwfinish:
        st      %f0, [%o0 + 0x00]       ! store context
        st      %f1, [%o0 + 0x04]
        st      %f2, [%o0 + 0x08]
        st      %f3, [%o0 + 0x0c]
        st      %f4, [%o0 + 0x10]
        st      %f5, [%o0 + 0x14]
        st      %f6, [%o0 + 0x18]
        retl
         st     %f7, [%o0 + 0x1c]

.align  8
.Lhwunaligned:
        alignaddr %o1, %g0, %o1

        ldd     [%o1 + 0x00], %f10
.Lhwunaligned_loop:
        ldd     [%o1 + 0x08], %f12
        ldd     [%o1 + 0x10], %f14
        ldd     [%o1 + 0x18], %f16
        ldd     [%o1 + 0x20], %f18
        ldd     [%o1 + 0x28], %f20
        ldd     [%o1 + 0x30], %f22
        ldd     [%o1 + 0x38], %f24
        subcc   %o2, 1, %o2             ! done yet?
        ldd     [%o1 + 0x40], %f26
        add     %o1, 0x40, %o1
        prefetch [%o1 + 63], 20

        faligndata %f10, %f12, %f8
        faligndata %f12, %f14, %f10
        faligndata %f14, %f16, %f12
        faligndata %f16, %f18, %f14
        faligndata %f18, %f20, %f16
        faligndata %f20, %f22, %f18
        faligndata %f22, %f24, %f20
        faligndata %f24, %f26, %f22

        .word   0x81b02840              ! SHA256

        bne,pt  SIZE_T_CC, .Lhwunaligned_loop
        for     %f26, %f26, %f10        ! %f10=%f26

        ba      .Lhwfinish
        nop
___
}
 # Software path prologue.  Opens a register window with room for $locals
# bytes, splits the input pointer into an aligned base ($inp) and a
# misalignment expressed in bits ($tmp31), and turns $len from a block count
# into the end-of-input pointer.
$code .= <<___;
.align  16
.Lsoftware:
        save    %sp,-STACK_FRAME-$locals,%sp
        and     $inp,`$align-1`,$tmp31
        sllx    $len,`log(16*$SZ)/log(2)`,$len
        andn    $inp,`$align-1`,$inp
        sll     $tmp31,3,$tmp31
        add     $inp,$len,$len
___

# SHA512 only: $tmp32 = 32 - $tmp31, the complementary shift count.
if ($SZ == 8) {
    $code .= <<___;
        mov     32,$tmp32
        sub     $tmp32,$tmp31,$tmp32
___
}

# Locate the K table relative to the program counter obtained with call.
$code .= <<___;
.Lpic:  call    .+8
        add     %o7,K${label}-.Lpic,$Ktbl

___

# Load the eight state words into the working registers; @V still holds
# them in a..h order at this point.
foreach my $k (0 .. 7) {
    $code .= "        $LD     [$ctx+`$k*$SZ`],$V[$k]\n";
}

# Top of the per-block loop.
$code .= "\n.Lloop:\n";
 # Unroll rounds 0..15, then one group of sixteen schedule-update rounds
# under .L16_xx.  After every round the register list is rotated by one so
# that the register just written as 'h' is passed as 'a' to the next round.
foreach my $round (0 .. 15) {
    BODY_00_15($round, @V);
    unshift @V, pop @V;
}
$code .= ".L16_xx:\n";
foreach my $round (16 .. 31) {
    $BODY_16_XX->($round, @V);
    unshift @V, pop @V;
}

# $tmp2 still holds the last K loaded; the .L16_xx group is repeated, with
# $Ktbl advanced by sixteen entries each time, until the low 12 bits of that
# constant equal $lastK.
$code .= <<___;
        and     $tmp2,0xfff,$tmp2
        cmp     $tmp2,$lastK
        bne     .L16_xx
        add     $Ktbl,`16*$SZ`,$Ktbl    ! Ktbl+=16

___
 # Fold the working variables back into the context.
my @state = ($A, $B, $C, $D, $E, $F, $G, $H);

if ($SZ == 4) {                         # SHA256
    # Reload the previous state into @X (free again after the last round),
    # then add and store word by word.
    foreach my $k (0 .. 7) {
        $code .= "        $LD     [$ctx+`$k*$SZ`],$X[$k]\n";
    }
    $code .= "\n";
    foreach my $k (0 .. 7) {
        $code .= "        add     $state[$k],$X[$k],$state[$k]\n"
               . "        $ST     $state[$k],[$ctx+`$k*$SZ`]\n";
    }
}

if ($SZ == 8) {                         # SHA512
    # The previous state is fetched as 32-bit halves into %l0-%l7 and
    # recombined with sllx/or, four words at a time; both halves of the
    # state use the same instruction schedule.
    my @halves;
    foreach my $base (0, 4) {
        my ($n0, $n1, $n2, $n3) = ($base .. $base + 3);
        my ($s0, $s1, $s2, $s3) = @state[$base .. $base + 3];
        push @halves, <<___;
        ld      [$ctx+`$n0*$SZ+0`],%l0
        ld      [$ctx+`$n0*$SZ+4`],%l1
        ld      [$ctx+`$n1*$SZ+0`],%l2
        ld      [$ctx+`$n1*$SZ+4`],%l3
        ld      [$ctx+`$n2*$SZ+0`],%l4
        ld      [$ctx+`$n2*$SZ+4`],%l5
        ld      [$ctx+`$n3*$SZ+0`],%l6

        sllx    %l0,32,$tmp0
        ld      [$ctx+`$n3*$SZ+4`],%l7
        sllx    %l2,32,$tmp1
        or      %l1,$tmp0,$tmp0
        or      %l3,$tmp1,$tmp1
        add     $tmp0,$s0,$s0
        add     $tmp1,$s1,$s1
        $ST     $s0,[$ctx+`$n0*$SZ`]
        sllx    %l4,32,$tmp2
        $ST     $s1,[$ctx+`$n1*$SZ`]
        sllx    %l6,32,$T1
        or      %l5,$tmp2,$tmp2
        or      %l7,$T1,$T1
        add     $tmp2,$s2,$s2
        $ST     $s2,[$ctx+`$n2*$SZ`]
        add     $T1,$s3,$s3
        $ST     $s3,[$ctx+`$n3*$SZ`]
___
    }
    # The two halves are separated by one empty line.
    $code .= join("\n", @halves);
}

# Advance to the next block and loop until the end pointer is reached; the
# instruction after the branch rewinds $Ktbl to the start of the table.
# Then return and emit the symbol type/size and the identification string.
$code .= <<___;
        add     $inp,`16*$SZ`,$inp              ! advance inp
        cmp     $inp,$len
        bne     SIZE_T_CC,.Lloop
        sub     $Ktbl,`($rounds-16)*$SZ`,$Ktbl  ! rewind Ktbl

        ret
        restore
.type   sha${label}_block_data_order,#function
.size   sha${label}_block_data_order,(.-sha${label}_block_data_order)
.asciz  "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
.align  4
___
 803
 # Purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep the option of producing a "universal" binary and to
# let the programmer detect at run-time whether the CPU is VIS capable.
#
# unvis($mnemonic,$rs1,$rs2,$rd)
#   Returns the instruction as a ".word" directive carrying the encoded
#   instruction word, followed by the original text as a comment.  The
#   plain "mnemonic<TAB>rs1,rs2,rd" text is returned unchanged when the
#   mnemonic is not in the opcode table, when an operand is not of the form
#   %fN, or when an operand is an odd-numbered register >= 32.
sub unvis {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my %visopf = (  "faligndata"    => 0x048,
                    "for"           => 0x07c        );

    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # Declared lexically; the original "my $ref,$opf;" only declared $ref
    # and assigned the package variable $opf.
    my $opf = $visopf{$mnemonic};
    return $ref unless $opf;

    # $reg aliases $rs1, $rs2 and $rd in turn, so assigning to it replaces
    # the register name with its encoded number.
    foreach my $reg ($rs1, $rs2, $rd) {
        return $ref unless $reg =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $ref if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num >> 5) & 31;
        }
        $reg = $num;
    }

    return sprintf ".word\t0x%08x !%s",
                   0x81b00000 | $rd << 25 | $rs1 << 14 | $opf << 5 | $rs2,
                   $ref;
}
 # unalignaddr($mnemonic,$rs1,$rs2,$rd)
#   Returns the instruction as a ".word" directive carrying the encoded
#   instruction word, followed by the original text as a comment.  Each
#   operand must name an integer register %g/%o/%l/%i 0..7; otherwise the
#   plain "mnemonic<TAB>rs1,rs2,rd" text is returned unchanged.
sub unalignaddr {
    my ($mnemonic, $rs1, $rs2, $rd) = @_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    # register number = bank offset + index within the bank
    my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );

    my @regno;
    for my $operand ($rs1, $rs2, $rd) {
        return $ref unless $operand =~ /%([goli])([0-7])/;
        push @regno, $bias{$1} + $2;
    }
    my ($n1, $n2, $nd) = @regno;

    return sprintf ".word\t0x%08x !%s",
                   0x81b00300 | $nd << 25 | $n1 << 14 | $n2,
                   $ref;
}
 848
 # Post-process the accumulated assembly one line at a time and print it.
for my $line (split("\n", $code)) {
    # Replace every `...` span with the value of the Perl expression in it.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    # Hand-encode three-operand VIS instructions on %f registers and
    # alignaddr on integer registers; see unvis() and unalignaddr().
    $line =~ s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/unvis($1,$2,$3,$4)/ge;
    $line =~ s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/unalignaddr($1,$2,$3,$4)/ge;

    print $line, "\n";
}

# Buffered write errors only surface here, so the result is checked.
close STDOUT or die "error closing STDOUT: $!";