From: Andy Polyakov
Date: Tue, 12 Jun 2012 14:18:21 +0000 (+0000)
Subject: x86[_64] assembly pack: update benchmark results.
X-Git-Tag: master-post-reformat~1805
X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=d2e1803197477e0ef19735ff976a75284cb2fe70

x86[_64] assembly pack: update benchmark results.
---

diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl
index c6f6b3334a..5f49e4024e 100644
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -23,14 +23,20 @@
 # AES-128-CBC    +SHA1            stitch        gain
 # Westmere       3.77[+5.6]       9.37          6.65        +41%
 # Sandy Bridge   5.05[+5.2(6.3)]  10.25(11.35)  6.16(7.08)  +67%(+60%)
+# Ivy Bridge     5.05[+4.7]       9.75          5.59        +74%
+# Bulldozer      5.77[+6.1]       11.87         6.47        +83%
 #
 # AES-192-CBC
 # Westmere       4.51             10.11         6.97        +45%
 # Sandy Bridge   6.05             11.25(12.35)  6.34(7.27)  +77%(+70%)
+# Ivy Bridge     6.05             10.75         6.07        +77%
+# Bulldozer      6.89             12.99         7.02        +85%
 #
 # AES-256-CBC
 # Westmere       5.25             10.85         7.25        +50%
 # Sandy Bridge   7.05             12.25(13.35)  7.06(7.70)  +74%(+73%)
+# Ivy Bridge     7.05             11.75         7.12        +65%
+# Bulldozer      8.00             14.10         8.24        +71%
 #
 # (*)  There are two code paths: SSSE3 and AVX. See sha1-568.pl for
 #      background information. Above numbers in parentheses are SSSE3
@@ -47,6 +53,8 @@
 #               AES-128-CBC     AES-192-CBC     AES-256-CBC
 # Westmere      1.31            1.55            1.80
 # Sandy Bridge  0.93            1.06            1.22
+# Ivy Bridge    0.92            1.06            1.21
+# Bulldozer     0.76            0.90            1.04

 $flavour = shift;
 $output = shift;
diff --git a/crypto/aes/asm/aesni-x86_64.pl b/crypto/aes/asm/aesni-x86_64.pl
index 801c1e3257..07252efca4 100644
--- a/crypto/aes/asm/aesni-x86_64.pl
+++ b/crypto/aes/asm/aesni-x86_64.pl
@@ -157,6 +157,13 @@
 # one byte out of 8KB with 128-bit key, Sandy Bridge - 0.97. Just like
 # in CTR mode AES instruction interleave factor was chosen to be 6x.

+######################################################################
+# For reference, AMD Bulldozer spends 5.77 cycles per byte processed
+# with 128-bit key in CBC encrypt and 0.76 cycles in CBC decrypt, 0.70
+# in ECB, 0.94 in CTR, 0.95 in XTS... This means that aes[enc|dec]
+# instruction latency is 9 cycles and that they can be issued every
+# cycle.
+
 $PREFIX="aesni";       # if $PREFIX is set to "AES", the script
                        # generates drop-in replacement for
                        # crypto/aes/asm/aes-x86_64.pl:-)
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl
index 25a16a355b..d31fbae0d8 100644
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -26,6 +26,8 @@
 # P4            125/125         17.8            84(***)
 # Opteron       66 /70          10.1            30
 # Core2         54 /67          8.4             18
+# Atom          105/105         16.8            53
+# VIA Nano      69 /71          13.0            27
 #
 # (*)  gcc 3.4.x was observed to generate few percent slower code,
 #      which is one of reasons why 2.95.3 results were chosen,
@@ -113,6 +115,10 @@
 # similar manner resulted in almost 20% degradation on Sandy Bridge,
 # where original 64-bit code processes one byte in 1.95 cycles.

+#####################################################################
+# For reference, AMD Bulldozer processes one byte in 1.98 cycles in
+# 32-bit mode and 1.89 in 64-bit.
+
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
 require "x86asm.pl";
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl
index a5ae180882..37653db668 100644
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -22,6 +22,8 @@
 # P4            28.6            14.0            +100%
 # Opteron       19.3            7.7             +150%
 # Core2         17.8            8.1(**)         +120%
+# Atom          31.6            16.8            +88%
+# VIA Nano      21.8            10.1            +115%
 #
 # (*)  comparison is not completely fair, because C results are
 #      for vanilla "256B" implementation, while assembler results
diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl
index 84f1a798cb..6a3a686082 100644
--- a/crypto/rc4/asm/rc4-586.pl
+++ b/crypto/rc4/asm/rc4-586.pl
@@ -43,6 +43,9 @@
 # Westmere      5.1/+94%(**)
 # Sandy Bridge  5.0/+8%
 # Atom          12.6/+6%
+# VIA Nano      6.4/+9%
+# Ivy Bridge    4.9/±0%
+# Bulldozer     4.9/+15%
 #
 # (*)  PIII can actually deliver 6.6 cycles per byte with MMX code,
 #      but this specific code performs poorly on Core2. And vice
diff --git a/crypto/rc4/asm/rc4-md5-x86_64.pl b/crypto/rc4/asm/rc4-md5-x86_64.pl
index 7f684092d4..fbb19dabae 100644
--- a/crypto/rc4/asm/rc4-md5-x86_64.pl
+++ b/crypto/rc4/asm/rc4-md5-x86_64.pl
@@ -30,6 +30,9 @@
 # Westmere      4.3     5.2     9.5     7.0     +36%
 # Sandy Bridge  4.2     5.5     9.7     6.8     +43%
 # Atom          9.3     6.5     15.8    11.1    +42%
+# VIA Nano      6.3     5.4     11.7    8.6     +37%
+# Ivy Bridge    4.1     5.2     9.3     6.0     +54%
+# Bulldozer     4.5     5.4     9.9     7.7     +29%
 #
 # (*)  rc4-x86_64.pl delivers 5.3 on Opteron, so real improvement
 #      is +53%...
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index e18e8a0008..de2e13dcf2 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -92,6 +92,9 @@
 # Westmere      4.2/+60%
 # Sandy Bridge  4.2/+120%
 # Atom          9.3/+80%
+# VIA Nano      6.4/+4%
+# Ivy Bridge    4.1/+30%
+# Bulldozer     4.5/+30%(*)
 #
 # (*)  But corresponding loop has less instructions, which should have
 #      positive effect on upcoming Bulldozer, which has one less ALU.
diff --git a/crypto/sha/asm/sha1-586.pl b/crypto/sha/asm/sha1-586.pl
index 1084d227fe..6b706be3d7 100644
--- a/crypto/sha/asm/sha1-586.pl
+++ b/crypto/sha/asm/sha1-586.pl
@@ -92,6 +92,9 @@
 # Atom          12.5    9.5(*)/+32%     -
 # Westmere      7.3     5.6/+30%        -
 # Sandy Bridge  8.8     6.2/+40%        5.1(**)/+70%
+# Ivy Bridge    7.2     4.9/+47%        4.8(**)/+50%
+# Bulldozer     11.6    6.2/+88%
+# VIA Nano      10.6    7.5/+41%
 #
 # (*)  Loop is 1056 instructions long and expected result is ~8.25.
 #      It remains mystery [to me] why ILP is limited to 1.7.
diff --git a/crypto/sha/asm/sha1-x86_64.pl b/crypto/sha/asm/sha1-x86_64.pl
index f27c1e3fb0..70cb40d85e 100755
--- a/crypto/sha/asm/sha1-x86_64.pl
+++ b/crypto/sha/asm/sha1-x86_64.pl
@@ -60,6 +60,9 @@
 # Atom          11.0    9.7/+13%        -
 # Westmere      7.1     5.6/+27%        -
 # Sandy Bridge  7.9     6.3/+25%        5.2/+51%
+# Ivy Bridge    6.4     4.8/+33%        4.7/+36%
+# Bulldozer     10.9    6.1/+79%
+# VIA Nano      10.2    7.4/+38%

 $flavour = shift;
 $output = shift;
diff --git a/crypto/sha/asm/sha512-586.pl b/crypto/sha/asm/sha512-586.pl
index 7eab6a5b88..df9e8eebca 100644
--- a/crypto/sha/asm/sha512-586.pl
+++ b/crypto/sha/asm/sha512-586.pl
@@ -11,15 +11,16 @@
 #
 # Performance in clock cycles per processed byte (less is better):
 #
-#               Pentium PIII    P4      AMD K8  Core2
-# gcc           100     75      116     54      66
-# icc           97      77      95      55      57
-# x86 asm       61      56      82      36      40
-# SSE2 asm      -       -       38      24      20
-# x86_64 asm(*) -       -       30      10.0    10.5
+#               PIII    P4      AMD K8  Core2   SB      Atom    Bldzr
+# gcc           75      116     54      66      58      126     121
+# icc           77      95      55      57      -       -       -
+# x86 asm       56      82      36      40      35      68      50
+# SSE2 asm      -       38      24      20      16      64(**)  18
+# x86_64 asm(*) -       33      9.6     10.3    11.3    14.7    13.5
 #
-# (*)  x86_64 assembler performance is presented for reference
-#      purposes.
+# (*)  x86_64 assembler performance is presented for reference
+#      purposes.
+# (**) paddq is incredibly slow on Atom.
 #
 # IALU code-path is optimized for elder Pentiums. On vanilla Pentium
 # performance improvement over compiler generated code reaches ~60%,
@@ -315,6 +316,7 @@ if ($sse2) {
 	&bswap	("edx");
 	&mov	(&DWP(8*9+4,"esp"),"ecx");
 	&mov	(&DWP(8*9+0,"esp"),"edx");
+	&jmp	(&label("00_14_sse2"));

 &set_label("00_14_sse2",16);
 	&mov	("eax",&DWP(0,"edi"));
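The aesni-x86_64.pl note added above infers aes[enc|dec] latency and issue rate from the measured cycles-per-byte figures. A minimal sketch of that arithmetic, assuming AES-128's 10 AES-round instructions per 16-byte block (the variable names and the script itself are only illustrative, not part of the patch):

#!/usr/bin/env perl
# Sanity-check the latency/throughput inference in the aesni-x86_64.pl
# note above (illustrative only; Bulldozer figures taken from the patch).
my $rounds = 10;        # AES-128: 9 aesenc + 1 aesenclast per block
my $block  = 16;        # bytes per AES block

# CBC encrypt is serial (each block depends on the previous ciphertext),
# so cycles/byte times block size approximates one block's latency chain.
my $cbc_enc_cpb = 5.77;
printf "approx aes[enc|dec] latency: %.1f cycles\n",
       $cbc_enc_cpb*$block/$rounds;     # ~9.2, i.e. "9 cycles"

# ECB (like CBC decrypt, CTR, XTS) is parallelizable, so its cycles/byte
# reflects issue rate rather than instruction latency.
my $ecb_cpb = 0.70;
printf "approx aes[enc|dec] issue interval: %.1f cycles\n",
       $ecb_cpb*$block/$rounds;         # ~1.1, i.e. roughly one per cycle

A latency of ~9 cycles at a ~1-per-cycle issue rate is also why several independent blocks have to be kept in flight in the parallelizable modes, which is consistent with the 6x interleave factor mentioned in the surrounding comment.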
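In the stitch tables above (aesni-sha1-x86_64.pl and rc4-md5-x86_64.pl), the gain column appears to be the summed stand-alone cycles-per-byte figures divided by the stitched figure, minus one. A minimal sketch of that relation, using rows from the tables (illustrative only; the published gains come from unrounded measurements, so a one-point difference is possible on some rows):

#!/usr/bin/env perl
# gain = (cipher alone + hash alone) / stitched - 1, all in cycles per byte
sub stitch_gain {
    my ($cipher_cpb, $hash_cpb, $stitched_cpb) = @_;
    return ($cipher_cpb + $hash_cpb)/$stitched_cpb - 1;
}

# Bulldozer AES-128-CBC+SHA1 row: 5.77[+6.1] -> 11.87 vs 6.47 stitched
printf "Bulldozer AES-128-CBC+SHA1: +%.0f%%\n", 100*stitch_gain(5.77, 6.1, 6.47);  # +83%
# Westmere RC4+MD5 row: 4.3 + 5.2 -> 9.5 vs 7.0 stitched
printf "Westmere RC4+MD5:           +%.0f%%\n", 100*stitch_gain(4.3, 5.2, 7.0);    # +36%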