From 7a1a12232a84621271bf808107f3be9a2df5121a Mon Sep 17 00:00:00 2001
From: Andy Polyakov <appro@openssl.org>
Date: Mon, 9 Sep 2013 21:43:21 +0200
Subject: [PATCH] crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.

Avoid occasional up to 8% performance drops.
---
 crypto/modes/asm/aesni-gcm-x86_64.pl | 34 +++++++++++++++++++++++-----
 1 file changed, 28 insertions(+), 6 deletions(-)

diff --git a/crypto/modes/asm/aesni-gcm-x86_64.pl b/crypto/modes/asm/aesni-gcm-x86_64.pl
index 31987146b0..3781933917 100644
--- a/crypto/modes/asm/aesni-gcm-x86_64.pl
+++ b/crypto/modes/asm/aesni-gcm-x86_64.pl
@@ -21,8 +21,8 @@
 # justify. This module is based on combination of Intel submissions,
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
-# processor. [Exact performance numbers to be added at launch.]
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with 128-bit key on Haswell processor.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
 	vzeroupper
 
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	vmovdqu		($Xip),$Xi		# load Xi
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
 	lea		0x80($key),$key		# size optimization
 	lea		0x20+0x20($Xip),$Xip	# size optimization
 	mov		0xf0-0x80($key),$rounds
 	vpshufb		$Ii,$Xi,$Xi
 
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Ldec_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Ldec_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Ldec_no_key_aliasing:
+
 	vmovdqu		0x50($inp),$Z3		# I[5]
 	lea		($inp),$in0
 	vmovdqu		0x40($inp),$Z0
@@ -621,14 +632,25 @@ $code.=<<___;
 	vzeroupper
 
 	vmovdqu		($ivp),$T1		# input counter value
-	sub		\$128,%rsp
+	add		\$-128,%rsp
 	mov		12($ivp),$counter
 	lea		.Lbswap_mask(%rip),$const
+	lea		-0x80($key),$in0	# borrow $in0
+	mov		\$0xf80,$end0		# borrow $end0
 	lea		0x80($key),$key		# size optimization
 	vmovdqu		($const),$Ii		# borrow $Ii for .Lbswap_mask
-	and		\$-64,%rsp		# ensure stack alignment
+	and		\$-128,%rsp		# ensure stack alignment
 	mov		0xf0-0x80($key),$rounds
 
+	and		$end0,$in0
+	and		%rsp,$end0
+	sub		$in0,$end0
+	jc		.Lenc_no_key_aliasing
+	cmp		\$768,$end0
+	jnc		.Lenc_no_key_aliasing
+	sub		$end0,%rsp		# avoid aliasing with key
+.Lenc_no_key_aliasing:
+
 	lea		($out),$in0
 	lea		-0xc0($out,$len),$end0
 	shr		\$4,$len
-- 
2.34.1