X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fsha%2Fasm%2Fsha256-mb-x86_64.pl;h=e86f0bc7fa9da335ae66289d02b04c38ede8ecf1;hp=2e4b102f52829a41ff6811ad30bb4eadd3c224ed;hb=3847d15d6bf124b1703fbc27f69bdce7755f768d;hpb=61ba602af5d7c3a642d732c4cc6518ed81dfc1c6 diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl index 2e4b102f52..e86f0bc7fa 100644 --- a/crypto/sha/asm/sha256-mb-x86_64.pl +++ b/crypto/sha/asm/sha256-mb-x86_64.pl @@ -15,7 +15,7 @@ # this +aesni(i) sha256 aesni-sha256 gain(iv) # ------------------------------------------------------------------- # Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126% -# Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% +# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93% # Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103% # Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82% # Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170% @@ -27,8 +27,9 @@ # AES-NI-SHA256 stitch for these processors; # (iii) "this" is for n=8, when we gather twice as much data, result # for n=4 is 20.3+4.44=24.7; -# (iv) improvement coefficients in real-life application are somewhat -# lower and range from 75% to 130% (on Haswell); +# (iv) presented improvement coefficients are asymptotic limits and +# in real-life application are somewhat lower, e.g. 
for 2KB +# fragments they range from 75% to 130% (on Haswell); $flavour = shift; $output = shift; @@ -135,6 +136,7 @@ $code.=<<___; psrld \$25-11,$t2 movdqa $e,$t1 + `"prefetch 63(@ptr[0])" if ($i==15)` pxor $t3,$sigma movdqa $e,$axb # borrow $axb pslld \$26-21,$t3 @@ -142,6 +144,7 @@ $code.=<<___; pand $f,$axb pxor $t2,$sigma + `"prefetch 63(@ptr[1])" if ($i==15)` movdqa $a,$t2 pxor $t3,$sigma # Sigma1(e) movdqa $a,$t3 @@ -153,6 +156,7 @@ $code.=<<___; pslld \$10,$t3 pxor $a,$axb # a^b, b^c in next round + `"prefetch 63(@ptr[2])" if ($i==15)` psrld \$13,$sigma pxor $t3,$t2 paddd $t1,$Xi # Xi+=Ch(e,f,g) @@ -160,6 +164,7 @@ $code.=<<___; pand $axb,$bxc pxor $sigma,$t2 + `"prefetch 63(@ptr[3])" if ($i==15)` psrld \$22-13,$sigma pxor $t3,$t2 movdqa $b,$h @@ -465,30 +470,38 @@ $code.=<<___; vpsrld \$25,$e,$t2 vpxor $t3,$sigma,$sigma + `"prefetch 63(@ptr[0])" if ($i==15)` vpslld \$7,$e,$t3 vpandn $g,$e,$t1 vpand $f,$e,$axb # borrow $axb + `"prefetch 63(@ptr[1])" if ($i==15)` vpxor $t2,$sigma,$sigma vpsrld \$2,$a,$h # borrow $h vpxor $t3,$sigma,$sigma # Sigma1(e) + `"prefetch 63(@ptr[2])" if ($i==15)` vpslld \$30,$a,$t2 vpxor $axb,$t1,$t1 # Ch(e,f,g) vpxor $a,$b,$axb # a^b, b^c in next round + `"prefetch 63(@ptr[3])" if ($i==15)` vpxor $t2,$h,$h vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e) vpsrld \$13,$a,$t2 + `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)` vpslld \$19,$a,$t3 vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g) vpand $axb,$bxc,$bxc + `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$h,$sigma vpsrld \$22,$a,$t2 vpxor $t3,$sigma,$sigma + `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)` vpslld \$10,$a,$t3 vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b) vpaddd $Xi,$d,$d # d+=Xi + `"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)` vpxor $t2,$sigma,$sigma vpxor $t3,$sigma,$sigma # Sigma0(a)