x86_64 assembly pack: "optimize" for Knights Landing, add AVX-512 results.
author Andy Polyakov <appro@openssl.org>
Thu, 20 Jul 2017 07:48:35 +0000 (09:48 +0200)
committer Andy Polyakov <appro@openssl.org>
Fri, 21 Jul 2017 12:07:32 +0000 (14:07 +0200)
"Optimize" is in quotes because it's rather a "salvage operation"
for now. Idea is to identify processor capability flags that
drive Knights Landing to suboptimal code paths and mask them.
Two flags were identified, XSAVE and ADCX/ADOX. Former affects
choice of AES-NI code path specific for Silvermont (Knights Landing
is of Silvermont "ancestry"). And 64-bit ADCX/ADOX instructions are
effectively mishandled at decode time. In both cases we are looking
at ~2x improvement.

AVX-512 results cover even Skylake-X :-)

Hardware used for benchmarking courtesy of Atos, experiments run by
Romain Dolbeau <romain.dolbeau@atos.net>. Kudos!

Reviewed-by: Rich Salz <rsalz@openssl.org>
crypto/aes/asm/aesni-x86_64.pl
crypto/chacha/asm/chacha-x86_64.pl
crypto/modes/asm/aesni-gcm-x86_64.pl
crypto/modes/asm/ghash-x86_64.pl
crypto/poly1305/asm/poly1305-x86_64.pl
crypto/sha/asm/sha1-x86_64.pl
crypto/sha/asm/sha512-x86_64.pl
crypto/x86_64cpuid.pl

index ea5dc14ae5dd5de28d22ee65e88a8f7afc4df64f..4c7119b2246c4cb9844baecfad3de9daf70da4d7 100644 (file)
 # Haswell      4.44/0.63       0.63    0.73    0.63    0.70
 # Skylake      2.62/0.63       0.63    0.63    0.63
 # Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
 # Haswell      4.44/0.63       0.63    0.73    0.63    0.70
 # Skylake      2.62/0.63       0.63    0.63    0.63
 # Silvermont   5.75/3.54       3.56    4.12    3.87(*) 4.11
+# Knights L    2.54/0.77       0.78    0.85    -       1.50
 # Goldmont     3.82/1.26       1.26    1.29    1.29    1.50
 # Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
 # Ryzen                2.71/0.35       0.35    0.44    0.38    0.49
 # Goldmont     3.82/1.26       1.26    1.29    1.29    1.50
 # Bulldozer    5.77/0.70       0.72    0.90    0.70    0.95
 # Ryzen                2.71/0.35       0.35    0.44    0.38    0.49
index e2c6a32440c1c51be55dc403d2897952784e7929..0cfe8990faaa0b33f9ed030e0dbf341010362812 100755 (executable)
@@ -24,7 +24,7 @@
 #
 # Performance in cycles per byte out of large buffer.
 #
 #
 # Performance in cycles per byte out of large buffer.
 #
-#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     8xAVX2
+#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     NxAVX(v)
 #
 # P4           9.48/+99%       -/22.7(ii)      -
 # Core2                7.83/+55%       7.90/8.08       4.35
 #
 # P4           9.48/+99%       -/22.7(ii)      -
 # Core2                7.83/+55%       7.90/8.08       4.35
@@ -32,8 +32,9 @@
 # Sandy Bridge 8.31/+42%       5.45/6.76       2.72
 # Ivy Bridge   6.71/+46%       5.40/6.49       2.41
 # Haswell      5.92/+43%       5.20/6.45       2.42        1.23
 # Sandy Bridge 8.31/+42%       5.45/6.76       2.72
 # Ivy Bridge   6.71/+46%       5.40/6.49       2.41
 # Haswell      5.92/+43%       5.20/6.45       2.42        1.23
-# Skylake      5.87/+39%       4.70/-          2.31        1.19
+# Skylake[-X]  5.87/+39%       4.70/-          2.31        1.19[0.57]
 # Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
 # Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
+# Knights L    11.7/-          -               9.60(iii)   0.80
 # Goldmont     10.6/+17%       5.10/-          3.28
 # Sledgehammer 7.28/+52%       -/14.2(ii)      -
 # Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
 # Goldmont     10.6/+17%       5.10/-          3.28
 # Sledgehammer 7.28/+52%       -/14.2(ii)      -
 # Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
@@ -50,6 +51,7 @@
 #      limitations, SSE2 can do better, but gain is considered too
 #      low to justify the [maintenance] effort;
 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
 #      limitations, SSE2 can do better, but gain is considered too
 #      low to justify the [maintenance] effort;
 # (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (v)  8xAVX2 or 16xAVX-512, whichever best applicable;
 
 $flavour = shift;
 $output  = shift;
 
 $flavour = shift;
 $output  = shift;
index 3cd231e22f0da49472aabced56bd71566a20c218..b42016101ebc6603919dc8668832593c3fa9429b 100644 (file)
@@ -35,6 +35,8 @@
 # Applications using the EVP interface will observe a few percent
 # worse performance.]
 #
 # Applications using the EVP interface will observe a few percent
 # worse performance.]
 #
+# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP).
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
 
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
 
index 817f6e59a03877c737126802cbe835957764daa8..fa4c9063a2991019d90ad42ecf7511e4eea8c242 100644 (file)
@@ -74,6 +74,7 @@
 # Skylake      0.44(+110%)(if system doesn't support AVX)
 # Bulldozer    1.49(+27%)
 # Silvermont   2.88(+13%)
 # Skylake      0.44(+110%)(if system doesn't support AVX)
 # Bulldozer    1.49(+27%)
 # Silvermont   2.88(+13%)
+# Knights L    2.12(-)    (if system doesn't support AVX)
 # Goldmont     1.08(+24%)
 
 # March 2013
 # Goldmont     1.08(+24%)
 
 # March 2013
@@ -86,6 +87,8 @@
 # it performs in 0.41 cycles per byte on Haswell processor, in
 # 0.29 on Broadwell, and in 0.36 on Skylake.
 #
 # it performs in 0.41 cycles per byte on Haswell processor, in
 # 0.29 on Broadwell, and in 0.36 on Skylake.
 #
+# Knights Landing achieves 1.09 cpb.
+#
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
 $flavour = shift;
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 
 $flavour = shift;
index 1dce5d61e3a081587245ff5cdd6fc0bf0ad5e2ff..1faa6ebf4639971160aabb92e7ba5ba7727b62a9 100755 (executable)
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
 #
 # Numbers are cycles per processed byte with poly1305_blocks alone,
 # measured with rdtsc at fixed clock frequency.
 #
-#              IALU/gcc-4.8(*) AVX(**)         AVX2
+#              IALU/gcc-4.8(*) AVX(**)         AVX2    AVX-512
 # P4           4.46/+120%      -
 # Core 2       2.41/+90%       -
 # Westmere     1.88/+120%      -
 # Sandy Bridge 1.39/+140%      1.10
 # Haswell      1.14/+175%      1.11            0.65
 # P4           4.46/+120%      -
 # Core 2       2.41/+90%       -
 # Westmere     1.88/+120%      -
 # Sandy Bridge 1.39/+140%      1.10
 # Haswell      1.14/+175%      1.11            0.65
-# Skylake      1.13/+120%      0.96            0.51
+# Skylake[-X]  1.13/+120%      0.96            0.51    [0.35]
 # Silvermont   2.83/+95%       -
 # Silvermont   2.83/+95%       -
+# Knights L    3.60/-          1.65            1.10    (***)
 # Goldmont     1.70/+180%      -
 # VIA Nano     1.82/+150%      -
 # Sledgehammer 1.38/+160%      -
 # Goldmont     1.70/+180%      -
 # VIA Nano     1.82/+150%      -
 # Sledgehammer 1.38/+160%      -
@@ -49,6 +50,8 @@
 #      Core processors, 50-30%, less newer processor is, but slower on
 #      contemporary ones, for example almost 2x slower on Atom, and as
 #      former are naturally disappearing, SSE2 is deemed unnecessary;
 #      Core processors, 50-30%, less newer processor is, but slower on
 #      contemporary ones, for example almost 2x slower on Atom, and as
 #      former are naturally disappearing, SSE2 is deemed unnecessary;
+# (***)        Current AVX-512 code requires BW and VL extensions and can not
+#      execute on Knights Landing;
 
 $flavour = shift;
 $output  = shift;
 
 $flavour = shift;
 $output  = shift;
index 73ac8ee5cecec6dd98495ff9ce12ff7e4a55c0a0..8b7bbfc2614fc4cfd8c6fc036084449beb49e68e 100755 (executable)
@@ -86,6 +86,7 @@
 # VIA Nano     9.32            7.15/+30%
 # Atom         10.3            9.17/+12%
 # Silvermont   13.1(*)         9.37/+40%
 # VIA Nano     9.32            7.15/+30%
 # Atom         10.3            9.17/+12%
 # Silvermont   13.1(*)         9.37/+40%
+# Knights L    13.2(*)         9.68/+36%       8.30/+59%
 # Goldmont     8.13            6.42/+27%       1.70/+380%(**)
 #
 # (*)  obviously suboptimal result, nothing was done about it,
 # Goldmont     8.13            6.42/+27%       1.70/+380%(**)
 #
 # (*)  obviously suboptimal result, nothing was done about it,
index bb7561e2397ba80f52d0b773b8bcb8f99f9a575d..f2ebdfdb68b64e4341da3e192186cd2f7bdd8f27 100755 (executable)
@@ -99,6 +99,7 @@
 # VIA Nano     23.0    16.5(+39%)  -               14.7    -
 # Atom         23.0    18.9(+22%)  -               14.7    -
 # Silvermont   27.4    20.6(+33%)  -               17.5    -
 # VIA Nano     23.0    16.5(+39%)  -               14.7    -
 # Atom         23.0    18.9(+22%)  -               14.7    -
 # Silvermont   27.4    20.6(+33%)  -               17.5    -
+# Knights L    27.4    21.0(+30%)  19.6(+40%)      17.5    12.8(+37%)
 # Goldmont     18.9    14.3(+32%)  4.16(+350%)     12.0    -
 #
 # (*)  whichever best applicable, including SHAEXT;
 # Goldmont     18.9    14.3(+32%)  4.16(+350%)     12.0    -
 #
 # (*)  whichever best applicable, including SHAEXT;
index 2467af7e9e70b695b74611d78fd79df36aeae590..a9f93bb2cf442d81a52dd630ce8d128db066c0f9 100644 (file)
@@ -145,8 +145,19 @@ OPENSSL_ia32_cpuid:
        or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
        and     \$15,%ah
        cmp     \$15,%ah                # examine Family ID
        or      \$0x40000000,%edx       # set reserved bit#30 on Intel CPUs
        and     \$15,%ah
        cmp     \$15,%ah                # examine Family ID
-       jne     .Lnotintel
+       jne     .LnotP4
        or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
        or      \$0x00100000,%edx       # set reserved bit#20 to engage RC4_CHAR
+.LnotP4:
+       cmp     \$6,%ah
+       jne     .Lnotintel
+       and     \$0x0ffff0f0,%eax
+       cmp     \$0x00050670,%eax       # Knights Landing
+       je      .Lknights
+       cmp     \$0x00080650,%eax       # Knights Mill (according to sde)
+       jne     .Lnotintel
+.Lknights:
+       and     \$0xfbffffff,%ecx       # clear XSAVE flag to mimic Silvermont
+
 .Lnotintel:
        bt      \$28,%edx               # test hyper-threading bit
        jnc     .Lgeneric
 .Lnotintel:
        bt      \$28,%edx               # test hyper-threading bit
        jnc     .Lgeneric
@@ -171,6 +182,10 @@ OPENSSL_ia32_cpuid:
        mov     \$7,%eax
        xor     %ecx,%ecx
        cpuid
        mov     \$7,%eax
        xor     %ecx,%ecx
        cpuid
+       bt      \$26,%r9d               # check XSAVE bit, cleared on Knights
+       jc      .Lnotknights
+       and     \$0xfff7ffff,%ebx       # clear ADCX/ADOX flag
+.Lnotknights:
        mov     %ebx,8(%rdi)            # save extended feature flags
 .Lno_extended_info:
 
        mov     %ebx,8(%rdi)            # save extended feature flags
 .Lno_extended_info: