-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# longer. A CPU with higher pclmulqdq issue rate would also benefit
# from higher aggregate factor...
#
-# Westmere 1.76(+14%)
-# Sandy Bridge 1.79(+9%)
-# Ivy Bridge 1.79(+8%)
+# Westmere 1.78(+13%)
+# Sandy Bridge 1.80(+8%)
+# Ivy Bridge 1.80(+7%)
# Haswell 0.55(+93%) (if system doesn't support AVX)
-# Bulldozer 1.52(+25%)
+# Broadwell 0.45(+110%)(if system doesn't support AVX)
+# Skylake 0.44(+110%)(if system doesn't support AVX)
+# Bulldozer 1.49(+27%)
+# Silvermont 2.88(+13%)
+# Goldmont 1.08(+24%)
# March 2013
#
# CPUs such as Sandy and Ivy Bridge can execute it, the code performs
# sub-optimally in comparison to above mentioned version. But thanks
# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that
-# it performs in 0.41 cycles per byte on Haswell processor.
+# it performs in 0.41 cycles per byte on Haswell processor, in
+# 0.29 on Broadwell, and in 0.36 on Skylake.
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
- $avx = ($1>=2.19) + ($1>=2.22);
+ $avx = ($1>=2.20) + ($1>=2.22);
}
if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
$avx = ($1>=10) + ($1>=11);
}
-open OUT,"| \"$^X\" $xlate $flavour $output";
+if (!$avx && `$ENV{CC} -v 2>&1` =~ /((?:^clang|LLVM) version|.*based on LLVM) ([3-9]\.[0-9]+)/) {
+ $avx = ($2>=3.0) + ($2>3.0);
+}
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$do4xaggr=1;
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl gcm_gmult_4bit
.type gcm_gmult_4bit,\@function,2
.align 16
gcm_gmult_4bit:
+.cfi_startproc
push %rbx
- push %rbp # %rbp and %r12 are pushed exclusively in
+.cfi_push %rbx
+ push %rbp # %rbp and others are pushed exclusively in
+.cfi_push %rbp
push %r12 # order to reuse Win64 exception handler...
+.cfi_push %r12
+ push %r13
+.cfi_push %r13
+ push %r14
+.cfi_push %r14
+ push %r15
+.cfi_push %r15
+ sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lgmult_prologue:
movzb 15($Xi),$Zlo
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- mov 16(%rsp),%rbx
- lea 24(%rsp),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea (%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lgmult_epilogue:
ret
+.cfi_endproc
.size gcm_gmult_4bit,.-gcm_gmult_4bit
___
\f
.type gcm_ghash_4bit,\@function,4
.align 16
gcm_ghash_4bit:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %rbp
+.cfi_push %rbp
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
push %r14
+.cfi_push %r14
push %r15
+.cfi_push %r15
sub \$280,%rsp
+.cfi_adjust_cfa_offset 280
.Lghash_prologue:
mov $inp,%r14 # reassign couple of args
mov $len,%r15
mov $Zlo,8($Xi)
mov $Zhi,($Xi)
- lea 280(%rsp),%rsi
- mov 0(%rsi),%r15
- mov 8(%rsi),%r14
- mov 16(%rsi),%r13
- mov 24(%rsi),%r12
- mov 32(%rsi),%rbp
- mov 40(%rsi),%rbx
- lea 48(%rsi),%rsp
+ lea 280+48(%rsp),%rsi
+.cfi_def_cfa %rsi,8
+ mov -48(%rsi),%r15
+.cfi_restore %r15
+ mov -40(%rsi),%r14
+.cfi_restore %r14
+ mov -32(%rsi),%r13
+.cfi_restore %r13
+ mov -24(%rsi),%r12
+.cfi_restore %r12
+ mov -16(%rsi),%rbp
+.cfi_restore %rbp
+ mov -8(%rsi),%rbx
+.cfi_restore %rbx
+ lea 0(%rsi),%rsp
+.cfi_def_cfa_register %rsp
.Lghash_epilogue:
ret
+.cfi_endproc
.size gcm_ghash_4bit,.-gcm_ghash_4bit
___
\f
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
&clmul64x64_T2 ($Xhi,$Xi,$Hkey,$T2);
$code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
# experimental alternative. special thing about is that there
- # no dependency between the two multiplications...
+ # no dependency between the two multiplications...
mov \$`0xE1<<1`,%eax
- mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
+ mov \$0xA040608020C0E000,%r10 # ((7..0)·0xE0)&0xff
mov \$0x07,%r11d
movq %rax,$T1
movq %r10,$T2
movq %r11,$T3 # borrow $T3
pand $Xi,$T3
- pshufb $T3,$T2 # ($Xi&7)·0xE0
+ pshufb $T3,$T2 # ($Xi&7)·0xE0
movq %rax,$T3
- pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
+ pclmulqdq \$0x00,$Xi,$T1 # ·(0xE1<<1)
pxor $Xi,$T2
pslldq \$15,$T2
paddd $T2,$T2 # <<(64+56+1)
}
\f
{ my ($Xip,$Htbl,$inp,$len)=@_4args;
- my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(6..10));
+ my ($Xln,$Xmn,$Xhn,$Hkey2,$HK) = map("%xmm$_",(3..7));
+ my ($T1,$T2,$T3)=map("%xmm$_",(8..10));
$code.=<<___;
.globl gcm_ghash_clmul
___
$code.=<<___;
movdqa .Lbswap_mask(%rip),$T3
- mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu ($Xip),$Xi
movdqu ($Htbl),$Hkey
my ($Xl,$Xm,$Xh,$Hkey3,$Hkey4)=map("%xmm$_",(11..15));
$code.=<<___;
+ mov OPENSSL_ia32cap_P+4(%rip),%eax
cmp \$0x30,$len
jb .Lskip4x
+ and \$`1<<26|1<<22`,%eax # isolate MOVBE+XSAVE
+ cmp \$`1<<22`,%eax # check for MOVBE without XSAVE
+ je .Lskip4x
+
sub \$0x30,$len
+ mov \$0xA040608020C0E000,%rax # ((7..0)·0xE0)&0xff
movdqu 0x30($Htbl),$Hkey3
movdqu 0x40($Htbl),$Hkey4
pxor $Xl,$Xm
pclmulqdq \$0x00,$Hkey2,$Xl
pclmulqdq \$0x11,$Hkey2,$Xh
- xorps $Xl,$Xln
pclmulqdq \$0x10,$HK,$Xm
+ xorps $Xl,$Xln
xorps $Xh,$Xhn
movups 0x50($Htbl),$HK
xorps $Xm,$Xmn
pshufd \$0b01001110,$Xi,$T1
pxor $Xi,$T1
pclmulqdq \$0x11,$Hkey3,$Xh
- xorps $Xl,$Xln
pclmulqdq \$0x00,$HK,$Xm
+ xorps $Xl,$Xln
xorps $Xh,$Xhn
lea 0x40($inp),$inp
xorps $Xln,$Xi
movdqu 0x20($inp),$Xln
movdqa $Xl,$Xh
- pshufd \$0b01001110,$Xl,$Xm
pclmulqdq \$0x10,$HK,$T1
+ pshufd \$0b01001110,$Xl,$Xm
xorps $Xhn,$Xhi
pxor $Xl,$Xm
pshufb $T3,$Xln
movups 0x20($Htbl),$HK
- pclmulqdq \$0x00,$Hkey,$Xl
xorps $Xmn,$T1
- movdqa $Xln,$Xhn
+ pclmulqdq \$0x00,$Hkey,$Xl
pshufd \$0b01001110,$Xln,$Xmn
pxor $Xi,$T1 # aggregated Karatsuba post-processing
- pxor $Xln,$Xmn
+ movdqa $Xln,$Xhn
pxor $Xhi,$T1 #
+ pxor $Xln,$Xmn
movdqa $T1,$T2 #
- pslldq \$8,$T1
pclmulqdq \$0x11,$Hkey,$Xh
+ pslldq \$8,$T1
psrldq \$8,$T2 #
pxor $T1,$Xi
movdqa .L7_mask(%rip),$T1
pand $Xi,$T1 # 1st phase
pshufb $T1,$T2 #
- pclmulqdq \$0x00,$HK,$Xm
pxor $Xi,$T2 #
+ pclmulqdq \$0x00,$HK,$Xm
psllq \$57,$T2 #
movdqa $T2,$T1 #
pslldq \$8,$T2
pclmulqdq \$0x00,$Hkey2,$Xln
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
pxor $T1,$Xhi #
movdqu 0($inp),$T1
movdqa $Xl,$Xh
pxor $Xm,$Xmn
pshufd \$0b01001110,$Xl,$Xm
- pxor $Xl,$Xm
- pclmulqdq \$0x00,$Hkey3,$Xl
pxor $T2,$Xi #
pxor $T1,$Xhi
+ pxor $Xl,$Xm
+ pclmulqdq \$0x00,$Hkey3,$Xl
psrlq \$1,$Xi #
+ pxor $Xhi,$Xi #
+ movdqa $Xi,$Xhi
pclmulqdq \$0x11,$Hkey3,$Xh
xorps $Xl,$Xln
- pxor $Xhi,$Xi #
+ pshufd \$0b01001110,$Xi,$T1
+ pxor $Xi,$T1
pclmulqdq \$0x00,$HK,$Xm
xorps $Xh,$Xhn
- movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T1
- pxor $Xi,$T1
-
lea 0x40($inp),$inp
sub \$0x40,$len
jnc .Lmod4_loop
.Ltail4x:
pclmulqdq \$0x00,$Hkey4,$Xi
- xorps $Xm,$Xmn
pclmulqdq \$0x11,$Hkey4,$Xhi
- xorps $Xln,$Xi
pclmulqdq \$0x10,$HK,$T1
+ xorps $Xm,$Xmn
+ xorps $Xln,$Xi
xorps $Xhn,$Xhi
pxor $Xi,$Xhi # aggregated Karatsuba post-processing
pxor $Xmn,$T1
pxor $T1,$Xi # Ii+Xi
movdqa $Xln,$Xhn
- pshufd \$0b01001110,$Xln,$T1
- pxor $Xln,$T1
+ pshufd \$0b01001110,$Xln,$Xmn
+ pxor $Xln,$Xmn
pclmulqdq \$0x00,$Hkey,$Xln
pclmulqdq \$0x11,$Hkey,$Xhn
- pclmulqdq \$0x00,$HK,$T1
+ pclmulqdq \$0x00,$HK,$Xmn
lea 32($inp),$inp # i+=2
+ nop
sub \$0x20,$len
jbe .Leven_tail
+ nop
jmp .Lmod_loop
.align 32
.Lmod_loop:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
- movdqu ($inp),$Xhn # Ii
- pshufb $T3,$Xhn
+ movdqu ($inp),$T2 # Ii
+ pxor $Xi,$T1 # aggregated Karatsuba post-processing
+ pshufb $T3,$T2
movdqu 16($inp),$Xln # Ii+1
- pxor $Xi,$T1 # aggregated Karatsuba post-processing
pxor $Xhi,$T1
- pxor $Xhn,$Xhi # "Ii+Xi", consume early
- pxor $T1,$T2
+ pxor $T2,$Xhi # "Ii+Xi", consume early
+ pxor $T1,$Xmn
pshufb $T3,$Xln
- movdqa $T2,$T1 #
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
movdqa $Xln,$Xhn #
movdqa $Xi,$T2 # 1st phase
movdqa $Xi,$T1
psllq \$5,$Xi
- pclmulqdq \$0x00,$Hkey,$Xln #######
pxor $Xi,$T1 #
+ pclmulqdq \$0x00,$Hkey,$Xln #######
psllq \$1,$Xi
pxor $T1,$Xi #
psllq \$57,$Xi #
movdqa $Xi,$T1 #
pslldq \$8,$Xi
- psrldq \$8,$T1 #
+ psrldq \$8,$T1 #
pxor $T2,$Xi
+ pshufd \$0b01001110,$Xhn,$Xmn
pxor $T1,$Xhi #
- pshufd \$0b01001110,$Xhn,$T1
- pxor $Xhn,$T1 #
+ pxor $Xhn,$Xmn #
- pclmulqdq \$0x11,$Hkey,$Xhn #######
movdqa $Xi,$T2 # 2nd phase
psrlq \$1,$Xi
+ pclmulqdq \$0x11,$Hkey,$Xhn #######
pxor $T2,$Xhi #
pxor $Xi,$T2
psrlq \$5,$Xi
pxor $T2,$Xi #
+ lea 32($inp),$inp
psrlq \$1,$Xi #
- pclmulqdq \$0x00,$HK,$T1 #######
+ pclmulqdq \$0x00,$HK,$Xmn #######
pxor $Xhi,$Xi #
- lea 32($inp),$inp
sub \$0x20,$len
ja .Lmod_loop
.Leven_tail:
movdqa $Xi,$Xhi
- pshufd \$0b01001110,$Xi,$T2 #
- pxor $Xi,$T2 #
+ movdqa $Xmn,$T1
+ pshufd \$0b01001110,$Xi,$Xmn #
+ pxor $Xi,$Xmn #
pclmulqdq \$0x00,$Hkey2,$Xi
pclmulqdq \$0x11,$Hkey2,$Xhi
- pclmulqdq \$0x10,$HK,$T2
+ pclmulqdq \$0x10,$HK,$Xmn
pxor $Xln,$Xi # (H*Ii+1) + H^2*(Ii+Xi)
pxor $Xhn,$Xhi
pxor $Xi,$T1
pxor $Xhi,$T1
- pxor $T1,$T2
- movdqa $T2,$T1 #
+ pxor $T1,$Xmn
+ movdqa $Xmn,$T1 #
psrldq \$8,$T1
- pslldq \$8,$T2 #
+ pslldq \$8,$Xmn #
pxor $T1,$Xhi
- pxor $T2,$Xi #
+ pxor $Xmn,$Xi #
___
&reduction_alg9 ($Xhi,$Xi);
$code.=<<___;
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
- lea 24(%rax),%rax # adjust "rsp"
+ lea 48+280(%rax),%rax # adjust "rsp"
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
+ mov -32(%rax),%r13
+ mov -40(%rax),%r14
+ mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
+ mov %r13,224($context) # restore context->R13
+ mov %r14,232($context) # restore context->R14
+ mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov 8(%rax),%rdi