x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.

author Andy Polyakov <appro@openssl.org>

Thu, 20 Jul 2017 07:48:35 +0000 (09:48 +0200)

committer Andy Polyakov <appro@openssl.org>

Fri, 21 Jul 2017 12:07:32 +0000 (14:07 +0200)
author Andy Polyakov <appro@openssl.org>
Thu, 20 Jul 2017 07:48:35 +0000 (09:48 +0200)
committer Andy Polyakov <appro@openssl.org>
Fri, 21 Jul 2017 12:07:32 +0000 (14:07 +0200)
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl

index ea5dc14ae5dd5de28d22ee65e88a8f7afc4df64f..4c7119b2246c4cb9844baecfad3de9daf70da4d7 100644 (file)
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -179,6 +179,7 @@
  # Haswell      4.44/0.63       0.63    0.73    0.63    0.70
  # Skylake      2.62/0.63       0.63    0.63    0.63
  # Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
  # Haswell      4.44/0.63       0.63    0.73    0.63    0.70
  # Skylake      2.62/0.63       0.63    0.63    0.63
  # Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
+# Knights L    2.54/0.77       0.78    0.85    -       1.50
  # Goldmont     3.82/1.26       1.26    1.29    1.29    1.50
  # Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
  # Ryzen                2.71/0.35       0.35    0.44    0.38    0.49
  # Goldmont     3.82/1.26       1.26    1.29    1.29    1.50
  # Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
  # Ryzen                2.71/0.35       0.35    0.44    0.38    0.49
diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl

index e2c6a32440c1c51be55dc403d2897952784e7929..0cfe8990faaa0b33f9ed030e0dbf341010362812 100755 (executable)
--- a/crypto/chacha/asm/chacha-x86_64.pl
+++ b/crypto/chacha/asm/chacha-x86_64.pl
@@ -24,7 +24,7 @@
  #
  # Performance in cycles per byte out of large buffer.
  #
  #
  # Performance in cycles per byte out of large buffer.
  #
-#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     8xAVX2
+#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     NxAVX(v)
  #
  # P4           9.48/+99%       -/22.7(ii)      -
  # Core2                7.83/+55%       7.90/8.08       4.35
  #
  # P4           9.48/+99%       -/22.7(ii)      -
  # Core2                7.83/+55%       7.90/8.08       4.35
@@ -32,8 +32,9 @@
  # Sandy Bridge 8.31/+42%       5.45/6.76       2.72
  # Ivy Bridge   6.71/+46%       5.40/6.49       2.41
  # Haswell      5.92/+43%       5.20/6.45       2.42        1.23
  # Sandy Bridge 8.31/+42%       5.45/6.76       2.72
  # Ivy Bridge   6.71/+46%       5.40/6.49       2.41
  # Haswell      5.92/+43%       5.20/6.45       2.42        1.23
-# Skylake      5.87/+39%       4.70/-          2.31        1.19
+# Skylake[-X]  5.87/+39%       4.70/-          2.31        1.19[0.57]
  # Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
  # Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
+# Knights L    11.7/-          -               9.60(iii)   0.80
  # Goldmont     10.6/+17%       5.10/-          3.28
  # Sledgehammer 7.28/+52%       -/14.2(ii)      -
  # Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
  # Goldmont     10.6/+17%       5.10/-          3.28
  # Sledgehammer 7.28/+52%       -/14.2(ii)      -
  # Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
@@ -50,6 +51,7 @@
  #      limitations, SSE2 can do better, but gain is considered too
  #      low to justify the [maintenance] effort;
  # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
  #      limitations, SSE2 can do better, but gain is considered too
  #      low to justify the [maintenance] effort;
  # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (v)  8xAVX2 or 16xAVX-512, whichever best applicable;
  
  $flavour = shift;
  $output  = shift;
  
  $flavour = shift;
  $output  = shift;
diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl

index 3cd231e22f0da49472aabced56bd71566a20c218..b42016101ebc6603919dc8668832593c3fa9429b 100644 (file)
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -35,6 +35,8 @@
  # Applications using the EVP interface will observe a few percent
  # worse performance.]
  #
  # Applications using the EVP interface will observe a few percent
  # worse performance.]
  #
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
  
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
  
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl

index 817f6e59a03877c737126802cbe835957764daa8..fa4c9063a2991019d90ad42ecf7511e4eea8c242 100644 (file)
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -74,6 +74,7 @@
  # Skylake      0.44(+110%)(if system doesn't support AVX)
  # Bulldozer    1.49(+27%)
  # Silvermont   2.88(+13%)
  # Skylake      0.44(+110%)(if system doesn't support AVX)
  # Bulldozer    1.49(+27%)
  # Silvermont   2.88(+13%)
+# Knights L    2.12(-)    (if system doesn't support AVX)
  # Goldmont     1.08(+24%)
  
  # March 2013
  # Goldmont     1.08(+24%)
  
  # March 2013
@@ -86,6 +87,8 @@
  # it performs in 0.41 cycles per byte on Haswell processor, in
  # 0.29 on Broadwell, and in 0.36 on Skylake.
  #
  # it performs in 0.41 cycles per byte on Haswell processor, in
  # 0.29 on Broadwell, and in 0.36 on Skylake.
  #
+# Knights Landing achieves 1.09 cpb.
+#
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  
  $flavour = shift;
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  
  $flavour = shift;
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl

index 1dce5d61e3a081587245ff5cdd6fc0bf0ad5e2ff..1faa6ebf4639971160aabb92e7ba5ba7727b62a9 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -27,14 +27,15 @@
  # Numbers are cycles per processed byte with poly1305_blocks alone,
  # measured with rdtsc at fixed clock frequency.
  #
  # Numbers are cycles per processed byte with poly1305_blocks alone,
  # measured with rdtsc at fixed clock frequency.
  #
-#              IALU/gcc-4.8(*) AVX(**)         AVX2
+#              IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
  # P4           4.46/+120%      -
  # Core 2       2.41/+90%       -
  # Westmere     1.88/+120%      -
  # Sandy Bridge 1.39/+140%      1.10
  # Haswell      1.14/+175%      1.11            0.65
  # P4           4.46/+120%      -
  # Core 2       2.41/+90%       -
  # Westmere     1.88/+120%      -
  # Sandy Bridge 1.39/+140%      1.10
  # Haswell      1.14/+175%      1.11            0.65
-# Skylake      1.13/+120%      0.96            0.51
+# Skylake[-X]  1.13/+120%      0.96            0.51    [0.35]
  # Silvermont   2.83/+95%       -
  # Silvermont   2.83/+95%       -
+# Knights L    3.60/-          1.65            1.10    (***)
  # Goldmont     1.70/+180%      -
  # VIA Nano     1.82/+150%      -
  # Sledgehammer 1.38/+160%      -
  # Goldmont     1.70/+180%      -
  # VIA Nano     1.82/+150%      -
  # Sledgehammer 1.38/+160%      -
@@ -49,6 +50,8 @@
  #      Core processors, 50-30%, less newer processor is, but slower on
  #      contemporary ones, for example almost 2x slower on Atom, and as
  #      former are naturally disappearing, SSE2 is deemed unnecessary;
  #      Core processors, 50-30%, less newer processor is, but slower on
  #      contemporary ones, for example almost 2x slower on Atom, and as
  #      former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***)        Current AVX-512 code requires BW and VL extensions and can not
+#      execute on Knights Landing;
  
  $flavour = shift;
  $output  = shift;
  
  $flavour = shift;
  $output  = shift;
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl

index 73ac8ee5cecec6dd98495ff9ce12ff7e4a55c0a0..8b7bbfc2614fc4cfd8c6fc036084449beb49e68e 100755 (executable)
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -86,6 +86,7 @@
  # VIA Nano     9.32            7.15/+30%
  # Atom         10.3            9.17/+12%
  # Silvermont   13.1(*)         9.37/+40%
  # VIA Nano     9.32            7.15/+30%
  # Atom         10.3            9.17/+12%
  # Silvermont   13.1(*)         9.37/+40%
+# Knights L    13.2(*)         9.68/+36%       8.30/+59%
  # Goldmont     8.13            6.42/+27%       1.70/+380%(**)
  #
  # (*)  obviously suboptimal result, nothing was done about it,
  # Goldmont     8.13            6.42/+27%       1.70/+380%(**)
  #
  # (*)  obviously suboptimal result, nothing was done about it,
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl

index bb7561e2397ba80f52d0b773b8bcb8f99f9a575d..f2ebdfdb68b64e4341da3e192186cd2f7bdd8f27 100755 (executable)
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -99,6 +99,7 @@
  # VIA Nano     23.0    16.5(+39%)  -               14.7    -
  # Atom         23.0    18.9(+22%)  -               14.7    -
  # Silvermont   27.4    20.6(+33%)  -               17.5    -
  # VIA Nano     23.0    16.5(+39%)  -               14.7    -
  # Atom         23.0    18.9(+22%)  -               14.7    -
  # Silvermont   27.4    20.6(+33%)  -               17.5    -
+# Knights L    27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
  # Goldmont     18.9    14.3(+32%)  4.16(+350%)     12.0    -
  #
  # (*)  whichever best applicable, including SHAEXT;
  # Goldmont     18.9    14.3(+32%)  4.16(+350%)     12.0    -
  #
  # (*)  whichever best applicable, including SHAEXT;
diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl

index 2467af7e9e70b695b74611d78fd79df36aeae590..a9f93bb2cf442d81a52dd630ce8d128db066c0f9 100644 (file)
--- a/crypto/x86_64cpuid.pl
+++ b/crypto/x86_64cpuid.pl
@@ -145,8 +145,19 @@ OPENSSL_ia32_cpuid:
         or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
         and     \$15,%ah
         cmp     \$15,%ah                # examine Family ID
         or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
         and     \$15,%ah
         cmp     \$15,%ah                # examine Family ID
-       jne     .Lnotintel
+       jne     .LnotP4
         or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
         or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+       cmp     \$6,%ah
+       jne     .Lnotintel
+       and     \$0x0fff0ff0,%eax
+       cmp     \$0x00050670,%eax       # Knights Landing
+       je      .Lknights
+       cmp     \$0x00080650,%eax       # Knights Mill (according to sde)
+       jne     .Lnotintel
+.Lknights:
+       and     \$0xfbffffff,%ecx       # clear XSAVE flag to mimic Silvermont
+
  .Lnotintel:
         bt      \$28,%edx               # test hyper-threading bit
         jnc     .Lgeneric
  .Lnotintel:
         bt      \$28,%edx               # test hyper-threading bit
         jnc     .Lgeneric
@@ -171,6 +182,10 @@ OPENSSL_ia32_cpuid:
         mov     \$7,%eax
         xor     %ecx,%ecx
         cpuid
         mov     \$7,%eax
         xor     %ecx,%ecx
         cpuid
+       bt      \$26,%r9d               # check XSAVE bit, cleared on Knights
+       jc      .Lnotknights
+       and     \$0xfff7ffff,%ebx       # clear ADCX/ADOX flag
+.Lnotknights:
         mov     %ebx,8(%rdi)            # save extended feature flags
  .Lno_extended_info:
  
         mov     %ebx,8(%rdi)            # save extended feature flags
  .Lno_extended_info:
author	Andy Polyakov <appro@openssl.org>
	Thu, 20 Jul 2017 07:48:35 +0000 (09:48 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Fri, 21 Jul 2017 12:07:32 +0000 (14:07 +0200)
crypto/aes/asm/aesni-x86_64.pl		patch | blob | history
crypto/chacha/asm/chacha-x86_64.pl		patch | blob | history
crypto/modes/asm/aesni-gcm-x86_64.pl		patch | blob | history
crypto/modes/asm/ghash-x86_64.pl		patch | blob | history
crypto/poly1305/asm/poly1305-x86_64.pl		patch | blob | history
crypto/sha/asm/sha1-x86_64.pl		patch | blob | history
crypto/sha/asm/sha512-x86_64.pl		patch | blob | history
crypto/x86_64cpuid.pl		patch | blob | history