From 64d92d74985ebb3d0be58a9718f9e080a14a8e7f Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Thu, 20 Jul 2017 09:48:35 +0200 Subject: [PATCH] x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results. "Optimize" is in quotes because it's rather a "salvage operation" for now. Idea is to identify processor capability flags that drive Knights Landing to suboptimial code paths and mask them. Two flags were identified, XSAVE and ADCX/ADOX. Former affects choice of AES-NI code path specific for Silvermont (Knights Landing is of Silvermont "ancestry"). And 64-bit ADCX/ADOX instructions are effectively mishandled at decode time. In both cases we are looking at ~2x improvement. AVX-512 results cover even Skylake-X :-) Hardware used for benchmarking courtesy of Atos, experiments run by Romain Dolbeau . Kudos! Reviewed-by: Rich Salz --- crypto/aes/asm/aesni-x86_64.pl | 1 + crypto/chacha/asm/chacha-x86_64.pl | 6 ++++-- crypto/modes/asm/aesni-gcm-x86_64.pl | 2 ++ crypto/modes/asm/ghash-x86_64.pl | 3 +++ crypto/poly1305/asm/poly1305-x86_64.pl | 7 +++++-- crypto/sha/asm/sha1-x86_64.pl | 1 + crypto/sha/asm/sha512-x86_64.pl | 1 + crypto/x86_64cpuid.pl | 17 ++++++++++++++++- 8 files changed, 33 insertions(+), 5 deletions(-) diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl index ea5dc14ae5..4c7119b224 100644 --- a/crypto/aes/asm/aesni-x86_64.pl +++ b/crypto/aes/asm/aesni-x86_64.pl @@ -179,6 +179,7 @@ # Haswell 4.44/0.63 0.63 0.73 0.63 0.70 # Skylake 2.62/0.63 0.63 0.63 0.63 # Silvermont 5.75/3.54 3.56 4.12 3.87(*) 4.11 +# Knights L 2.54/0.77 0.78 0.85 - 1.50 # Goldmont 3.82/1.26 1.26 1.29 1.29 1.50 # Bulldozer 5.77/0.70 0.72 0.90 0.70 0.95 # Ryzen 2.71/0.35 0.35 0.44 0.38 0.49 diff --git a/crypto/chacha/asm/chacha-x86_64.pl b/crypto/chacha/asm/chacha-x86_64.pl index e2c6a32440..0cfe8990fa 100755 --- a/crypto/chacha/asm/chacha-x86_64.pl +++ b/crypto/chacha/asm/chacha-x86_64.pl @@ -24,7 +24,7 @@ # # Performance in cycles per byte out of large buffer. # -# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 8xAVX2 +# IALU/gcc 4.8(i) 1xSSSE3/SSE2 4xSSSE3 NxAVX(v) # # P4 9.48/+99% -/22.7(ii) - # Core2 7.83/+55% 7.90/8.08 4.35 @@ -32,8 +32,9 @@ # Sandy Bridge 8.31/+42% 5.45/6.76 2.72 # Ivy Bridge 6.71/+46% 5.40/6.49 2.41 # Haswell 5.92/+43% 5.20/6.45 2.42 1.23 -# Skylake 5.87/+39% 4.70/- 2.31 1.19 +# Skylake[-X] 5.87/+39% 4.70/- 2.31 1.19[0.57] # Silvermont 12.0/+33% 7.75/7.40 7.03(iii) +# Knights L 11.7/- - 9.60(iii) 0.80 # Goldmont 10.6/+17% 5.10/- 3.28 # Sledgehammer 7.28/+52% -/14.2(ii) - # Bulldozer 9.66/+28% 9.85/11.1 3.06(iv) @@ -50,6 +51,7 @@ # limitations, SSE2 can do better, but gain is considered too # low to justify the [maintenance] effort; # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20; +# (v) 8xAVX2 or 16xAVX-512, whichever best applicable; $flavour = shift; $output = shift; diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl index 3cd231e22f..b42016101e 100644 --- a/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -35,6 +35,8 @@ # Applications using the EVP interface will observe a few percent # worse performance.] # +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl index 817f6e59a0..fa4c9063a2 100644 --- a/crypto/modes/asm/ghash-x86_64.pl +++ b/crypto/modes/asm/ghash-x86_64.pl @@ -74,6 +74,7 @@ # Skylake 0.44(+110%)(if system doesn't support AVX) # Bulldozer 1.49(+27%) # Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) # Goldmont 1.08(+24%) # March 2013 @@ -86,6 +87,8 @@ # it performs in 0.41 cycles per byte on Haswell processor, in # 0.29 on Broadwell, and in 0.36 on Skylake. # +# Knights Landing achieves 1.09 cpb. +# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest $flavour = shift; diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl index 1dce5d61e3..1faa6ebf46 100755 --- a/crypto/poly1305/asm/poly1305-x86_64.pl +++ b/crypto/poly1305/asm/poly1305-x86_64.pl @@ -27,14 +27,15 @@ # Numbers are cycles per processed byte with poly1305_blocks alone, # measured with rdtsc at fixed clock frequency. # -# IALU/gcc-4.8(*) AVX(**) AVX2 +# IALU/gcc-4.8(*) AVX(**) AVX2 AVX-512 # P4 4.46/+120% - # Core 2 2.41/+90% - # Westmere 1.88/+120% - # Sandy Bridge 1.39/+140% 1.10 # Haswell 1.14/+175% 1.11 0.65 -# Skylake 1.13/+120% 0.96 0.51 +# Skylake[-X] 1.13/+120% 0.96 0.51 [0.35] # Silvermont 2.83/+95% - +# Knights L 3.60/- 1.65 1.10 (***) # Goldmont 1.70/+180% - # VIA Nano 1.82/+150% - # Sledgehammer 1.38/+160% - @@ -49,6 +50,8 @@ # Core processors, 50-30%, less newer processor is, but slower on # contemporary ones, for example almost 2x slower on Atom, and as # former are naturally disappearing, SSE2 is deemed unnecessary; +# (***) Current AVX-512 code requires BW and VL extensions and can not +# execute on Knights Landing; $flavour = shift; $output = shift; diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl index 73ac8ee5ce..8b7bbfc261 100755 --- a/crypto/sha/asm/sha1-x86_64.pl +++ b/crypto/sha/asm/sha1-x86_64.pl @@ -86,6 +86,7 @@ # VIA Nano 9.32 7.15/+30% # Atom 10.3 9.17/+12% # Silvermont 13.1(*) 9.37/+40% +# Knights L 13.2(*) 9.68/+36% 8.30/+59% # Goldmont 8.13 6.42/+27% 1.70/+380%(**) # # (*) obviously suboptimal result, nothing was done about it, diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl index bb7561e239..f2ebdfdb68 100755 --- a/crypto/sha/asm/sha512-x86_64.pl +++ b/crypto/sha/asm/sha512-x86_64.pl @@ -99,6 +99,7 @@ # VIA Nano 23.0 16.5(+39%) - 14.7 - # Atom 23.0 18.9(+22%) - 14.7 - # Silvermont 27.4 20.6(+33%) - 17.5 - +# Knights L 27.4 21.0(+30%) 19.6(+40%) 17.5 12.8(+37%) # Goldmont 18.9 14.3(+32%) 4.16(+350%) 12.0 - # # (*) whichever best applicable, including SHAEXT; diff --git a/crypto/x86_64cpuid.pl b/crypto/x86_64cpuid.pl index 2467af7e9e..a9f93bb2cf 100644 --- a/crypto/x86_64cpuid.pl +++ b/crypto/x86_64cpuid.pl @@ -145,8 +145,19 @@ OPENSSL_ia32_cpuid: or \$0x40000000,%edx # set reserved bit#30 on Intel CPUs and \$15,%ah cmp \$15,%ah # examine Family ID - jne .Lnotintel + jne .LnotP4 or \$0x00100000,%edx # set reserved bit#20 to engage RC4_CHAR +.LnotP4: + cmp \$6,%ah + jne .Lnotintel + and \$0x0ffff0f0,%eax + cmp \$0x00050670,%eax # Knights Landing + je .Lknights + cmp \$0x00080650,%eax # Knights Mill (according to sde) + jne .Lnotintel +.Lknights: + and \$0xfbffffff,%ecx # clear XSAVE flag to mimic Silvermont + .Lnotintel: bt \$28,%edx # test hyper-threading bit jnc .Lgeneric @@ -171,6 +182,10 @@ OPENSSL_ia32_cpuid: mov \$7,%eax xor %ecx,%ecx cpuid + bt \$26,%r9d # check XSAVE bit, cleared on Knights + jc .Lnotknights + and \$0xfff7ffff,%ebx # clear ADCX/ADOX flag +.Lnotknights: mov %ebx,8(%rdi) # save extended feature flags .Lno_extended_info: -- 2.34.1