From 7a1a12232a84621271bf808107f3be9a2df5121a Mon Sep 17 00:00:00 2001 From: Andy Polyakov Date: Mon, 9 Sep 2013 21:43:21 +0200 Subject: [PATCH] crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization. Avoid occasional performance drops of up to 8% by keeping the stack frame from aliasing with the key. --- crypto/modes/asm/aesni-gcm-x86_64.pl | 34 +++++++++++++++++++++++----- 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl index 31987146b0..3781933917 100644 --- a/crypto/modes/asm/aesni-gcm-x86_64.pl +++ b/crypto/modes/asm/aesni-gcm-x86_64.pl @@ -21,8 +21,8 @@ # justify. This module is based on combination of Intel submissions, # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max # Locktyukhin of Intel Corp. who verified that it reduces shuffles -# pressure with notable relative improvement on upcoming Haswell -# processor. [Exact performance numbers to be added at launch.] +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor. 
# # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf @@ -422,17 +422,28 @@ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value - sub \$128,%rsp + add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 vmovdqu ($Xip),$Xi # load Xi - and \$-64,%rsp # ensure stack alignment + and \$-128,%rsp # ensure stack alignment vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask lea 0x80($key),$key # size optimization lea 0x20+0x20($Xip),$Xip # size optimization mov 0xf0-0x80($key),$rounds vpshufb $Ii,$Xi,$Xi + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Ldec_no_key_aliasing + cmp \$768,$end0 + jnc .Ldec_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Ldec_no_key_aliasing: + vmovdqu 0x50($inp),$Z3 # I[5] lea ($inp),$in0 vmovdqu 0x40($inp),$Z0 @@ -621,14 +632,25 @@ $code.=<<___; vzeroupper vmovdqu ($ivp),$T1 # input counter value - sub \$128,%rsp + add \$-128,%rsp mov 12($ivp),$counter lea .Lbswap_mask(%rip),$const + lea -0x80($key),$in0 # borrow $in0 + mov \$0xf80,$end0 # borrow $end0 lea 0x80($key),$key # size optimization vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask - and \$-64,%rsp # ensure stack alignment + and \$-128,%rsp # ensure stack alignment mov 0xf0-0x80($key),$rounds + and $end0,$in0 + and %rsp,$end0 + sub $in0,$end0 + jc .Lenc_no_key_aliasing + cmp \$768,$end0 + jnc .Lenc_no_key_aliasing + sub $end0,%rsp # avoid aliasing with key +.Lenc_no_key_aliasing: + lea ($out),$in0 lea -0xc0($out,$len),$end0 shr \$4,$len -- 2.34.1