# Haswell 5.45 4.15/+31% 3.57/+53%
# Skylake 5.18 4.06/+28% 3.54/+46%
# Bulldozer 9.11 5.95/+53%
+# Ryzen 4.75 ? 1.93/+150%(**)
# VIA Nano 9.32 7.15/+30%
# Atom 10.3 9.17/+12%
# Silvermont 13.1(*) 9.37/+40%
+# Goldmont 8.13 6.42/+27% 1.70/+380%(**)
#
# (*) obviously suboptimal result, nothing was done about it,
# because SSSE3 code is compiled unconditionally;
+# (**) SHAEXT result
$flavour = shift;
$output = shift;
$shaext=1; ### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$ctx="%rdi"; # 1st arg
.type sha1_block_data_order,\@function,3
.align 16
sha1_block_data_order:
+.cfi_startproc
mov OPENSSL_ia32cap_P+0(%rip),%r9d
mov OPENSSL_ia32cap_P+4(%rip),%r8d
mov OPENSSL_ia32cap_P+8(%rip),%r10d
jz .Lialu
___
$code.=<<___ if ($shaext);
- test \$`1<<29`,%r10d # check SHA bit
+ test \$`1<<29`,%r10d # check SHA bit
jnz _shaext_shortcut
___
$code.=<<___ if ($avx>1);
.align 16
.Lialu:
mov %rsp,%rax
+.cfi_def_cfa_register %rax
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
mov %rdi,$ctx # reassigned argument
sub \$`8+16*4`,%rsp
mov %rsi,$inp # reassigned argument
and \$-64,%rsp
mov %rdx,$num # reassigned argument
mov %rax,`16*4`(%rsp)
+.cfi_cfa_expression %rsp+64,deref,+8
.Lprologue:
mov 0($ctx),$A
jnz .Lloop
mov `16*4`(%rsp),%rsi
+.cfi_def_cfa %rsi,8
mov -40(%rsi),%r14
+.cfi_restore %r14
mov -32(%rsi),%r13
+.cfi_restore %r13
mov -24(%rsi),%r12
+.cfi_restore %r12
mov -16(%rsi),%rbp
+.cfi_restore %rbp
mov -8(%rsi),%rbx
+.cfi_restore %rbx
lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue:
ret
+.cfi_endproc
.size sha1_block_data_order,.-sha1_block_data_order
___
if ($shaext) {{{
.align 32
sha1_block_data_order_shaext:
_shaext_shortcut:
+.cfi_startproc
___
$code.=<<___ if ($win64);
lea `-8-4*16`(%rsp),%rsp
.align 16
.Loop_shaext:
dec $num
- lea 0x40($inp),%rax # next input block
+ lea 0x40($inp),%r8 # next input block
paddd @MSG[0],$E
- cmovne %rax,$inp
+ cmovne %r8,$inp
movdqa $ABCD,$ABCD_SAVE # offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
.Lepilogue_shaext:
___
$code.=<<___;
+.cfi_endproc
ret
.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
-my $K_XX_XX="%r11";
+my $K_XX_XX="%r14";
+my $fp="%r11";
my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };
.align 16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp # frame pointer
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
push %r14
+.cfi_push %r14
lea `-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
- movaps %xmm6,-40-6*16(%rax)
- movaps %xmm7,-40-5*16(%rax)
- movaps %xmm8,-40-4*16(%rax)
- movaps %xmm9,-40-3*16(%rax)
- movaps %xmm10,-40-2*16(%rax)
- movaps %xmm11,-40-1*16(%rax)
+ movaps %xmm6,-40-6*16($fp)
+ movaps %xmm7,-40-5*16($fp)
+ movaps %xmm8,-40-4*16($fp)
+ movaps %xmm9,-40-3*16($fp)
+ movaps %xmm10,-40-2*16($fp)
+ movaps %xmm11,-40-1*16($fp)
.Lprologue_ssse3:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_ssse3:
ret
+.cfi_endproc
.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___
.align 16
sha1_block_data_order_avx:
_avx_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13 # redundant, done to share Win64 SE handler
+.cfi_push %r13
push %r14
+.cfi_push %r14
lea `-64-($win64?6*16:0)`(%rsp),%rsp
vzeroupper
___
$code.=<<___ if ($win64);
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
and \$-64,%rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov $E,16($ctx)
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx:
ret
+.cfi_endproc
.size sha1_block_data_order_avx,.-sha1_block_data_order_avx
___
.align 16
sha1_block_data_order_avx2:
_avx2_shortcut:
- mov %rsp,%rax
+.cfi_startproc
+ mov %rsp,$fp
+.cfi_def_cfa_register $fp
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
vzeroupper
___
$code.=<<___ if ($win64);
lea -6*16(%rsp),%rsp
- vmovaps %xmm6,-40-6*16(%rax)
- vmovaps %xmm7,-40-5*16(%rax)
- vmovaps %xmm8,-40-4*16(%rax)
- vmovaps %xmm9,-40-3*16(%rax)
- vmovaps %xmm10,-40-2*16(%rax)
- vmovaps %xmm11,-40-1*16(%rax)
+ vmovaps %xmm6,-40-6*16($fp)
+ vmovaps %xmm7,-40-5*16($fp)
+ vmovaps %xmm8,-40-4*16($fp)
+ vmovaps %xmm9,-40-3*16($fp)
+ vmovaps %xmm10,-40-2*16($fp)
+ vmovaps %xmm11,-40-1*16($fp)
.Lprologue_avx2:
___
$code.=<<___;
- mov %rax,%r14 # original %rsp
mov %rdi,$ctx # reassigned argument
mov %rsi,$inp # reassigned argument
mov %rdx,$num # reassigned argument
vzeroupper
___
$code.=<<___ if ($win64);
- movaps -40-6*16(%r14),%xmm6
- movaps -40-5*16(%r14),%xmm7
- movaps -40-4*16(%r14),%xmm8
- movaps -40-3*16(%r14),%xmm9
- movaps -40-2*16(%r14),%xmm10
- movaps -40-1*16(%r14),%xmm11
+ movaps -40-6*16($fp),%xmm6
+ movaps -40-5*16($fp),%xmm7
+ movaps -40-4*16($fp),%xmm8
+ movaps -40-3*16($fp),%xmm9
+ movaps -40-2*16($fp),%xmm10
+ movaps -40-1*16($fp),%xmm11
___
$code.=<<___;
- lea (%r14),%rsi
- mov -40(%rsi),%r14
- mov -32(%rsi),%r13
- mov -24(%rsi),%r12
- mov -16(%rsi),%rbp
- mov -8(%rsi),%rbx
- lea (%rsi),%rsp
+ mov -40($fp),%r14
+.cfi_restore %r14
+ mov -32($fp),%r13
+.cfi_restore %r13
+ mov -24($fp),%r12
+.cfi_restore %r12
+ mov -16($fp),%rbp
+.cfi_restore %rbp
+ mov -8($fp),%rbx
+.cfi_restore %rbx
+ lea ($fp),%rsp
+.cfi_def_cfa_register %rsp
.Lepilogue_avx2:
ret
+.cfi_endproc
.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
- mov 152($context),%rax # pull context->Rsp
+ mov 208($context),%rax # pull context->R11
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
- mov 232($context),%rax # pull context->R14
-
lea -40-6*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$12,%ecx