projects
/
openssl.git
/ blobdiff
commit
grep
author
committer
pickaxe
?
search:
re
summary
|
shortlog
|
log
|
commit
|
commitdiff
|
tree
raw
|
inline
| side by side
Skylake performance results.
[openssl.git]
/
crypto
/
aes
/
asm
/
aesni-sha1-x86_64.pl
diff --git
a/crypto/aes/asm/aesni-sha1-x86_64.pl
b/crypto/aes/asm/aesni-sha1-x86_64.pl
index ff0b068229cb4e6f32d939f754345b5a1b269b38..952b4cfdd668297ead9176cbc5ee68f3b3380fee 100644
(file)
--- a/
crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/
crypto/aes/asm/aesni-sha1-x86_64.pl
@@
-25,6
+25,7
@@
# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
# Sandy Bridge 5.05[+5.0(6.1)] 10.06(11.15) 5.98(7.05) +68%(+58%)
# Ivy Bridge 5.05[+4.6] 9.65 5.54 +74%
# Haswell 4.43[+3.6(4.2)] 8.00(8.58) 4.55(5.21) +75%(+65%)
+# Skylake 2.63[+3.5(4.1)] 6.17(6.69) 4.23(4.44) +46%(+51%)
# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
#
# AES-192-CBC
# Bulldozer 5.77[+6.0] 11.72 6.37 +84%
#
# AES-192-CBC
@@
-39,6
+40,7
@@
# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
# Ivy Bridge 7.05 11.65 7.12 +64%
# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
# Sandy Bridge 7.05 12.06(13.15) 7.12(7.72) +69%(+70%)
# Ivy Bridge 7.05 11.65 7.12 +64%
# Haswell 6.19 9.76(10.34) 6.21(6.25) +57%(+65%)
+# Skylake 3.62 7.16(7.68) 4.56(4.76) +57%(+61$)
# Bulldozer 8.00 13.95 8.25 +69%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
# Bulldozer 8.00 13.95 8.25 +69%
#
# (*) There are two code paths: SSSE3 and AVX. See sha1-568.pl for
@@
-94,6
+96,9
@@
$avx=1 if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
$avx=1 if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./ &&
$1>=10);
+$avx=1 if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/ && $2>=3.0);
+
+$shaext=1; ### set to zero if compiling for 1.0.1
$stitched_decrypt=0;
$stitched_decrypt=0;
@@
-119,6
+124,8
@@
aesni_cbc_sha1_enc:
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11
# caller should check for SSSE3 and AES-NI bits
mov OPENSSL_ia32cap_P+0(%rip),%r10d
mov OPENSSL_ia32cap_P+4(%rip),%r11
+___
+$code.=<<___ if ($shaext);
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
bt \$61,%r11 # check SHA bit
jc aesni_cbc_sha1_enc_shaext
___
@@
-722,7
+729,7
@@
___
if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
if ($stitched_decrypt) {{{
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
-$j=$jj=$r=$
sn=$
rx=0;
+$j=$jj=$r=$rx=0;
$Xi=4;
# reassign for Atom Silvermont (see above)
$Xi=4;
# reassign for Atom Silvermont (see above)
@@
-990,7
+997,7
@@
$code.=<<___;
.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
}}}
.size aesni256_cbc_sha1_dec_ssse3,.-aesni256_cbc_sha1_dec_ssse3
___
}}}
-$j=$jj=$r=$
sn=$
rx=0;
+$j=$jj=$r=$rx=0;
if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
if ($avx) {
my ($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
@@
-1436,7
+1443,7
@@
___
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
# reset
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
-$j=$jj=$r=$
sn=$
rx=0;
+$j=$jj=$r=$rx=0;
$Xi=4;
@aes256_dec = (
$Xi=4;
@aes256_dec = (
@@
-1657,7
+1664,7
@@
K_XX_XX:
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
.asciz "AESNI-CBC+SHA1 stitch for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
- {{{
+
if ($shaext)
{{{
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$rounds="%r11d";
($in0,$out,$len,$key,$ivp,$ctx,$inp)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9","%r10");
$rounds="%r11d";
@@
-1676,7
+1683,7
@@
aesni_cbc_sha1_enc_shaext:
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
$code.=<<___ if ($win64);
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
___
$code.=<<___ if ($win64);
- lea `-8-
4
*16`(%rsp),%rsp
+ lea `-8-
10
*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
@@
-1867,7
+1874,21
@@
ssse3_handler:
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
+___
+$code.=<<___ if ($shaext);
+ lea aesni_cbc_sha1_enc_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lseh_no_shaext
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lcommon_seh_tail
+.Lseh_no_shaext:
+___
+$code.=<<___;
lea 96(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
lea 96(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
@@
-1939,6
+1960,11
@@
$code.=<<___ if ($avx);
.rva .LSEH_end_aesni_cbc_sha1_enc_avx
.rva .LSEH_info_aesni_cbc_sha1_enc_avx
___
.rva .LSEH_end_aesni_cbc_sha1_enc_avx
.rva .LSEH_info_aesni_cbc_sha1_enc_avx
___
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_end_aesni_cbc_sha1_enc_shaext
+ .rva .LSEH_info_aesni_cbc_sha1_enc_shaext
+___
$code.=<<___;
.section .xdata
.align 8
$code.=<<___;
.section .xdata
.align 8
@@
-1953,6
+1979,12
@@
$code.=<<___ if ($avx);
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
.rva ssse3_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
+$code.=<<___ if ($shaext);
+.LSEH_info_aesni_cbc_sha1_enc_shaext:
+ .byte 9,0,0,0
+ .rva ssse3_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
}
####################################################################
}
####################################################################