projects
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
[aesni|sha*]-mb-x86_64.pl: add data prefetching.
[openssl.git]
/
crypto
/
sha
/
asm
/
sha256-mb-x86_64.pl
diff --git
a/crypto/sha/asm/sha256-mb-x86_64.pl
b/crypto/sha/asm/sha256-mb-x86_64.pl
index 2e4b102f52829a41ff6811ad30bb4eadd3c224ed..e86f0bc7fa9da335ae66289d02b04c38ede8ecf1 100644
(file)
--- a/
crypto/sha/asm/sha256-mb-x86_64.pl
+++ b/
crypto/sha/asm/sha256-mb-x86_64.pl
@@
-15,7
+15,7
@@
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
# this +aesni(i) sha256 aesni-sha256 gain(iv)
# -------------------------------------------------------------------
# Westmere(ii) 23.3/n +1.28=7.11(n=4) 12.3 +3.75=16.1 +126%
-# Atom(ii) 39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
+# Atom(ii) ?39.1/n +3.93=13.7(n=4) 20.8 +5.69=26.5 +93%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
# Sandy Bridge (20.5 +5.15=25.7)/n 11.6 13.0 +103%
# Ivy Bridge (20.4 +5.14=25.5)/n 10.3 11.6 +82%
# Haswell(iii) (21.0 +5.00=26.0)/n 7.80 8.79 +170%
@@
-27,8
+27,9
@@
# AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
# AES-NI-SHA256 stitch for these processors;
# (iii) "this" is for n=8, when we gather twice as much data, result
# for n=4 is 20.3+4.44=24.7;
-# (iv) improvement coefficients in real-life application are somewhat
-# lower and range from 75% to 130% (on Haswell);
+# (iv) presented improvement coefficients are asymptotic limits and
+# in real-life application are somewhat lower, e.g. for 2KB
+# fragments they range from 75% to 130% (on Haswell);
$flavour = shift;
$output = shift;
$flavour = shift;
$output = shift;
@@
-135,6
+136,7
@@
$code.=<<___;
psrld \$25-11,$t2
movdqa $e,$t1
psrld \$25-11,$t2
movdqa $e,$t1
+ `"prefetch 63(@ptr[0])" if ($i==15)`
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
pxor $t3,$sigma
movdqa $e,$axb # borrow $axb
pslld \$26-21,$t3
@@
-142,6
+144,7
@@
$code.=<<___;
pand $f,$axb
pxor $t2,$sigma
pand $f,$axb
pxor $t2,$sigma
+ `"prefetch 63(@ptr[1])" if ($i==15)`
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
movdqa $a,$t2
pxor $t3,$sigma # Sigma1(e)
movdqa $a,$t3
@@
-153,6
+156,7
@@
$code.=<<___;
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
pslld \$10,$t3
pxor $a,$axb # a^b, b^c in next round
+ `"prefetch 63(@ptr[2])" if ($i==15)`
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
psrld \$13,$sigma
pxor $t3,$t2
paddd $t1,$Xi # Xi+=Ch(e,f,g)
@@
-160,6
+164,7
@@
$code.=<<___;
pand $axb,$bxc
pxor $sigma,$t2
pand $axb,$bxc
pxor $sigma,$t2
+ `"prefetch 63(@ptr[3])" if ($i==15)`
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
psrld \$22-13,$sigma
pxor $t3,$t2
movdqa $b,$h
@@
-465,30
+470,38
@@
$code.=<<___;
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
vpsrld \$25,$e,$t2
vpxor $t3,$sigma,$sigma
+ `"prefetch 63(@ptr[0])" if ($i==15)`
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
vpslld \$7,$e,$t3
vpandn $g,$e,$t1
vpand $f,$e,$axb # borrow $axb
+ `"prefetch 63(@ptr[1])" if ($i==15)`
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
vpxor $t2,$sigma,$sigma
vpsrld \$2,$a,$h # borrow $h
vpxor $t3,$sigma,$sigma # Sigma1(e)
+ `"prefetch 63(@ptr[2])" if ($i==15)`
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
vpslld \$30,$a,$t2
vpxor $axb,$t1,$t1 # Ch(e,f,g)
vpxor $a,$b,$axb # a^b, b^c in next round
+ `"prefetch 63(@ptr[3])" if ($i==15)`
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
vpxor $t2,$h,$h
vpaddd $sigma,$Xi,$Xi # Xi+=Sigma1(e)
vpsrld \$13,$a,$t2
+ `"prefetch 63(@ptr[4])" if ($i==15 && $REG_SZ==32)`
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
vpslld \$19,$a,$t3
vpaddd $t1,$Xi,$Xi # Xi+=Ch(e,f,g)
vpand $axb,$bxc,$bxc
+ `"prefetch 63(@ptr[5])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
vpxor $t2,$h,$sigma
vpsrld \$22,$a,$t2
vpxor $t3,$sigma,$sigma
+ `"prefetch 63(@ptr[6])" if ($i==15 && $REG_SZ==32)`
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
vpslld \$10,$a,$t3
vpxor $bxc,$b,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
vpaddd $Xi,$d,$d # d+=Xi
+ `"prefetch 63(@ptr[7])" if ($i==15 && $REG_SZ==32)`
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)
vpxor $t2,$sigma,$sigma
vpxor $t3,$sigma,$sigma # Sigma0(a)