evp: prevent underflow in base64 decoding

[openssl.git] / crypto / sha / asm / sha512-sparcv9.pl
diff --git a/crypto/sha/asm/sha512-sparcv9.pl b/crypto/sha/asm/sha512-sparcv9.pl

index bd9afcb1155e01a192b87821b8dc97004a85bbc5..585740789e63e780cc7bb2549d120bf0d13090af 100644 (file)
--- a/crypto/sha/asm/sha512-sparcv9.pl
+++ b/crypto/sha/asm/sha512-sparcv9.pl
@@ -17,13 +17,30 @@
  # Performance is >75% better than 64-bit code generated by Sun C and
  # over 2x than 32-bit code. X[16] resides on stack, but access to it
  # is scheduled for L2 latency and staged through 32 least significant
-# bits of %l0-%l7. The latter is done to achieve 32-/64-bit bit ABI
+# bits of %l0-%l7. The latter is done to achieve 32-/64-bit ABI
  # duality. Nevertheless it's ~40% faster than SHA256, which is pretty
  # good [optimal coefficient is 50%].
  #
  # SHA512 on UltraSPARC T1.
  #
-# ...
+# It's not any faster than 64-bit code generated by Sun C 5.8. This is
+# because 64-bit code generator has the advantage of using 64-bit
+# loads(*) to access X[16], which I consciously traded for 32-/64-bit
+# ABI duality [as per above]. But it surpasses 32-bit Sun C generated
+# code by 60%, not to mention that it doesn't suffer from severe decay
+# when running 4x as many threads as physical cores and that it leaves gcc
+# [3.4] behind by over 4x factor! If compared to SHA256, single thread
+# performance is only 10% better, but overall throughput for maximum
+# number of threads for given CPU exceeds corresponding one of SHA256
+# by 30% [again, optimal coefficient is 50%].
+#
+# (*)  Unlike pre-T1 UltraSPARC, loads on T1 are executed strictly
+#      in-order, i.e. a load instruction has to complete before the next
+#      instruction in given thread is executed, even if the latter is
+#      not dependent on load result! This means that on T1 two 32-bit
+#      loads are always slower than one 64-bit load. Once again this
+#      is unlike pre-T1 UltraSPARC, where, if scheduled appropriately,
+#      2x32-bit loads can be as fast as 1x64-bit ones.
  
  $bits=32;
  for (@ARGV)    { $bits=64 if (/\-m64/ || /\-xarch\=v9/); }
@@ -288,9 +305,9 @@ $code.=<<___;
         srlx    @X[(($i+9)/2)%8],32,$tmp1       ! X[i+9]
         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
         srl     @X[($i/2)%8],0,$tmp0
+       add     $tmp2,$tmp1,$tmp1
         add     $xi,$T1,$T1                     ! +=X[i]
         xor     $tmp0,@X[($i/2)%8],@X[($i/2)%8]
-       add     $tmp2,$T1,$T1
         add     $tmp1,$T1,$T1
  
         srl     $T1,0,$T1
@@ -301,9 +318,9 @@ ___
  $code.=<<___;
         srlx    @X[($i/2)%8],32,$tmp1           ! X[i]
         xor     $tmp0,$tmp2,$tmp2               ! sigma1(X[i+14])
-       srl     @X[($i/2)%8],0,@X[($i/2)%8]
         add     $xi,$T1,$T1                     ! +=X[i+9]
-       add     $tmp2,$T1,$T1
+       add     $tmp2,$tmp1,$tmp1
+       srl     @X[($i/2)%8],0,@X[($i/2)%8]
         add     $tmp1,$T1,$T1
  
         sllx    $T1,32,$tmp0
@@ -455,7 +472,7 @@ $code.=<<___ if ($SZ==8); # SHA512
  ___
  $code.=<<___;
  .Lpic: call    .+8
-       sub     %o7,.Lpic-K${label},$Ktbl
+       add     %o7,K${label}-.Lpic,$Ktbl
  
         $LD     [$ctx+`0*$SZ`],$A
         $LD     [$ctx+`1*$SZ`],$B
@@ -569,6 +586,7 @@ $code.=<<___;
  .type  sha${label}_block_data_order,#function
  .size  sha${label}_block_data_order,(.-sha${label}_block_data_order)
  .asciz "SHA${label} block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
+.align 4
  ___
  
  $code =~ s/\`([^\`]*)\`/eval $1/gem;