x86_64 assembly pack: allow clang to compile AVX code.

[openssl.git] / crypto / sha / asm / sha512-armv8.pl
diff --git a/crypto/sha/asm/sha512-armv8.pl b/crypto/sha/asm/sha512-armv8.pl

index ca0ee4e745fb610cdb255f0e3bfeec9e203dd4ea..bd7a0a5662e0a236f8f35dcdedd83954c9a7d1f8 100644 (file)
--- a/crypto/sha/asm/sha512-armv8.pl
+++ b/crypto/sha/asm/sha512-armv8.pl
@@ -14,12 +14,18 @@
  #
  #              SHA256-hw       SHA256(*)       SHA512
  # Apple A7     1.97            10.5 (+33%)     6.73 (-1%(**))
-# Cortex-A5x   n/a             n/a             n/a
+# Cortex-A53   2.38            15.6 (+110%)    10.1 (+190%(***))
+# Cortex-A57   2.31            11.6 (+86%)     7.51 (+260%(***))
  # 
  # (*)  Software SHA256 results are of lesser relevance, presented
  #      mostly for informational purposes.
  # (**) The result is a trade-off: it's possible to improve it by
-#      10%, but at the cost of 20% loss on Cortex-A5x.
+#      10% (or by 1 cycle per round), but at the cost of a 20% loss
+#      on Cortex-A53 (or 4 cycles per round).
+# (***)        Super-impressive coefficients over gcc-generated code are
+#      an indication of some compiler "pathology"; most notably, code
+#      generated with -mgeneral-regs-only is significantly faster
+#      and lags behind assembly by only 50-90%.
  
  $flavour=shift;
  $output=shift;
@@ -393,7 +399,7 @@ ___
  
         $arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+))?/o
         &&
-       sprintf ".long\t0x%08x\t//%s %s",
+       sprintf ".inst\t0x%08x\t//%s %s",
                         $opcode{$mnemonic}|$1|($2<<5)|($3<<16),
                         $mnemonic,$arg;
      }