#! /usr/bin/env perl
-# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2010-2019 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# See ghash-x86.pl for background information and details about coding
# techniques.
#
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
# December 2012
#
# Skylake 0.44(+110%)(if system doesn't support AVX)
# Bulldozer 1.49(+27%)
# Silvermont 2.88(+13%)
+# Knights L 2.12(-) (if system doesn't support AVX)
+# Goldmont 1.08(+24%)
# March 2013
#
# it performs in 0.41 cycles per byte on Haswell processor, in
# 0.29 on Broadwell, and in 0.36 on Skylake.
#
+# Knights Landing achieves 1.09 cpb.
+#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
$flavour = shift;
.type gcm_gmult_4bit,\@function,2
.align 16
gcm_gmult_4bit:
+.cfi_startproc
push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
+.cfi_push %rbx
+ push %rbp # %rbp and others are pushed exclusively in
+.cfi_push %rbp
push %r12 # order to reuse Win64 exception handler...
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzb 15($Xi),$Zlo
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
ret
+.cfi_endproc
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
\f
.type gcm_ghash_4bit,\@function,4
.align 16
gcm_ghash_4bit:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lghash_epilogue:
ret
+.cfi_endproc
.size gcm_ghash_4bit,.-gcm_ghash_4bit
___
\f
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
.type gcm_init_clmul,\@abi-omnipotent
.align 16
gcm_init_clmul:
+.cfi_startproc
.L_init_clmul:
___
$code.=<<___ if ($win64);
___
$code.=<<___;
ret
+.cfi_endproc
.size gcm_init_clmul,.-gcm_init_clmul
___
}
.type gcm_gmult_clmul,\@abi-omnipotent
.align 16
gcm_gmult_clmul:
+.cfi_startproc
.L_gmult_clmul:
movdqu ($Xip),$Xi
movdqa .Lbswap_mask(%rip),$T3
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about it is that there is
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
pshufb $T3,$Xi
movdqu $Xi,($Xip)
ret
+.cfi_endproc
.size gcm_gmult_clmul,.-gcm_gmult_clmul
___
}
.type gcm_ghash_clmul,\@abi-omnipotent
.align 32
gcm_ghash_clmul:
+.cfi_startproc
.L_ghash_clmul:
___
$code.=<<___ if ($win64);
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
___
$code.=<<___;
ret
+.cfi_endproc
.size gcm_ghash_clmul,.-gcm_ghash_clmul
___
}
.type gcm_init_avx,\@abi-omnipotent
.align 32
gcm_init_avx:
+.cfi_startproc
___
if ($avx) {
my ($Htbl,$Xip)=@_4args;
___
$code.=<<___;
ret
+.cfi_endproc
.size gcm_init_avx,.-gcm_init_avx
___
} else {
.type gcm_gmult_avx,\@abi-omnipotent
.align 32
gcm_gmult_avx:
+.cfi_startproc
jmp .L_gmult_clmul
+.cfi_endproc
.size gcm_gmult_avx,.-gcm_gmult_avx
___
\f
.type gcm_ghash_avx,\@abi-omnipotent
.align 32
gcm_ghash_avx:
+.cfi_startproc
___
if ($avx) {
my ($Xip,$Htbl,$inp,$len)=@_4args;
___
$code.=<<___;
ret
+.cfi_endproc
.size gcm_ghash_avx,.-gcm_ghash_avx
___
} else {
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
- lea 24(%rax),%rax # adjust "rsp"
+ lea 48+280(%rax),%rax # adjust "rsp"
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi