# Sandy Bridge 5.05/6.05/7.05+11.6 13.0 +28%/36%/43%
# Ivy Bridge 5.05/6.05/7.05+10.3 11.6 +32%/41%/50%
# Haswell 4.43/5.29/6.19+7.80 8.79 +39%/49%/59%
+# Skylake 2.62/3.14/3.62+7.70 8.10 +27%/34%/40%
# Bulldozer 5.77/6.89/8.00+13.7 13.7 +42%/50%/58%
#
# (*) there are XOP, AVX1 and AVX2 code paths, meaning that
if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
`ml64 2>&1` =~ /Version ([0-9]+)\./) {
- $avx = ($1>=10) + ($1>=11);
+ $avx = ($1>=10) + ($1>=12);
}
+# Probe clang / LLVM-based compilers when no earlier check has set $avx:
+# $avx becomes 1 for version 3.0 and 2 for anything newer.  The major
+# version is matched as [0-9]+ rather than a single [3-9] digit so that
+# two-digit releases (clang 10 and later) are recognised as well; 1.x and
+# 2.x now match too but still evaluate to 0.
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+$shaext=$avx; ### set to zero if compiling for 1.0.1
+$avx=1 if (!$shaext && $avx);
+
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
.align 16
$func:
___
-$code.=<<___ if ($avx);
+ if ($avx) {
+$code.=<<___;
lea OPENSSL_ia32cap_P(%rip),%r11
mov \$1,%eax
cmp \$0,`$win64?"%rcx":"%rdi"`
je .Lprobe
mov 0(%r11),%eax
mov 4(%r11),%r10
-
+___
+$code.=<<___ if ($shaext);
bt \$61,%r10 # check for SHA
jc ${func}_shaext
-
+___
+$code.=<<___;
mov %r10,%r11
shr \$32,%r11
cmp \$`1<<8|1<<5|1<<3`,%r11d
je ${func}_avx2
___
-$code.=<<___ if ($avx);
+$code.=<<___;
and \$`1<<30`,%eax # mask "Intel CPU" bit
and \$`1<<28|1<<9`,%r10d # mask AVX+SSSE3 bits
or %eax,%r10d
je ${func}_avx
ud2
___
+ }
$code.=<<___;
xor %eax,%eax
cmp \$0,`$win64?"%rcx":"%rdi"`
$r++; unshift(@rndkey,pop(@rndkey));
};
+if ($shaext) {
+my $Tbl="%rax";
+
$code.=<<___;
.type ${func}_shaext,\@function,6
.align 32
${func}_shaext:
- mov %rsp,%rax
mov `($win64?56:8)`(%rsp),$inp # load 7th argument
- push %rbx
___
$code.=<<___ if ($win64);
- lea `-4*16`(%rsp),%rsp
+ lea `-8-10*16`(%rsp),%rsp
movaps %xmm6,-8-10*16(%rax)
movaps %xmm7,-8-9*16(%rax)
movaps %xmm8,-8-8*16(%rax)
movdqu $CDGH,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -8-10*16(%rax),%xmm6
- movaps -8-9*16(%rax),%xmm7
- movaps -8-8*16(%rax),%xmm8
- movaps -8-7*16(%rax),%xmm9
- movaps -8-6*16(%rax),%xmm10
- movaps -8-5*16(%rax),%xmm11
- movaps -8-4*16(%rax),%xmm12
- movaps -8-3*16(%rax),%xmm13
- movaps -8-2*16(%rax),%xmm14
- movaps -8-1*16(%rax),%xmm15
+ movaps 0*16(%rsp),%xmm6
+ movaps 1*16(%rsp),%xmm7
+ movaps 2*16(%rsp),%xmm8
+ movaps 3*16(%rsp),%xmm9
+ movaps 4*16(%rsp),%xmm10
+ movaps 5*16(%rsp),%xmm11
+ movaps 6*16(%rsp),%xmm12
+ movaps 7*16(%rsp),%xmm13
+ movaps 8*16(%rsp),%xmm14
+ movaps 9*16(%rsp),%xmm15
+ lea 8+10*16(%rsp),%rsp
.Lepilogue_shaext:
___
$code.=<<___;
- mov -8(%rax),%rbx
- mov %rax,%rsp
ret
.size ${func}_shaext,.-${func}_shaext
___
+}
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
-if ($win64) {
+if ($win64 && $avx) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
-$code.=<<___ if ($avx);
+$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
___
+# SHAEXT branch of se_handler, emitted only when $shaext is set.  If the
+# faulting Rip (%rbx) is below the ${func}_shaext entry point the code falls
+# through to .Lnot_in_shaext; otherwise it copies 20 quadwords (the ten
+# saved XMM registers) from the frame at %rax into the CONTEXT record and
+# advances %rax by 168 (= 8+10*16) bytes, the size of the save area reserved
+# by the shaext prologue.  The label is spelled via ${func} so that it
+# always matches the "${func}_shaext:" definition.
+$code.=<<___ if ($shaext);
+ lea ${func}_shaext(%rip),%r10
+ cmp %r10,%rbx
+ jb .Lnot_in_shaext
+
+ lea (%rax),%rsi
+ lea 512($context),%rdi # &context.Xmm6
+ mov \$20,%ecx
+ .long 0xa548f3fc # cld; rep movsq
+ lea 168(%rax),%rax # adjust stack pointer
+ jmp .Lin_prologue
+.Lnot_in_shaext:
+___
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
-$code.=<<___ if ($avx);
+$code.=<<___ if ($shaext);
+ .rva .LSEH_begin_${func}_shaext
+ .rva .LSEH_end_${func}_shaext
+ .rva .LSEH_info_${func}_shaext
+___
+$code.=<<___;
.section .xdata
.align 8
.LSEH_info_${func}_xop:
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
+$code.=<<___ if ($shaext);
+.LSEH_info_${func}_shaext:
+ .byte 9,0,0,0
+ .rva se_handler
+ .rva .Lprologue_shaext,.Lepilogue_shaext # HandlerData[]
+___
}
####################################################################
sub sha256op38 {
my $instr = shift;
- if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
+ if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
my @opcode=(0x0f,0x38);
rex(\@opcode,$2,$1);
push @opcode,$opcodelet{$instr};