crypto/modes/asm/aesni-gcm-x86_64.pl: minor optimization.
index 31987146b0e1d444d32cfab95a273a430c185881..3781933917227dd127352fa8d7d066804dacba13 100644 (file)
@@ -21,8 +21,8 @@
 # justify. This module is based on combination of Intel submissions,
 # [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
 # Locktyukhin of Intel Corp. who verified that it reduces shuffles
-# pressure with notable relative improvement on upcoming Haswell
-# processor. [Exact performance numbers to be added at launch.]
+# pressure with notable relative improvement, achieving 1.0 cycle per
+# byte processed with a 128-bit key on the Haswell processor.
 #
 # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
 # [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
@@ -422,17 +422,28 @@ $code.=<<___;
        vzeroupper
 
        vmovdqu         ($ivp),$T1              # input counter value
-       sub             \$128,%rsp
+       add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
+       lea             -0x80($key),$in0        # borrow $in0
+       mov             \$0xf80,$end0           # borrow $end0
        vmovdqu         ($Xip),$Xi              # load Xi
-       and             \$-64,%rsp              # ensure stack alignment
+       and             \$-128,%rsp             # ensure stack alignment
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
        lea             0x80($key),$key         # size optimization
        lea             0x20+0x20($Xip),$Xip    # size optimization
        mov             0xf0-0x80($key),$rounds
        vpshufb         $Ii,$Xi,$Xi
 
+       and             $end0,$in0
+       and             %rsp,$end0
+       sub             $in0,$end0
+       jc              .Ldec_no_key_aliasing
+       cmp             \$768,$end0
+       jnc             .Ldec_no_key_aliasing
+       sub             $end0,%rsp              # avoid aliasing with key
+.Ldec_no_key_aliasing:
+
        vmovdqu         0x50($inp),$Z3          # I[5]
        lea             ($inp),$in0
        vmovdqu         0x40($inp),$Z0
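The instructions inserted ahead of .Ldec_no_key_aliasing compare page offsets: $in0 is loaded with $key-0x80 before $key is advanced, $end0 with the mask 0xf80 (address bits 7-11, i.e. the offset within a 4KB page rounded down to 128 bytes), and the sub/jc/cmp/jnc sequence measures how far the freshly 128-byte-aligned stack frame sits above the key schedule in page-offset terms. If that distance is non-negative but below 768 bytes, %rsp is lowered by it so the scratch frame and the expanded key no longer share the same page-offset range ("avoid aliasing with key"), presumably keeping key loads and stack traffic out of the same L1 cache sets / 4K-aliasing window. Below is a minimal C sketch of that arithmetic; the function name and the uintptr_t parameters are illustrative stand-ins for %rsp and the key pointer, not part of the module.

    #include <stdint.h>

    /* Sketch of the check guarding .Ldec_no_key_aliasing (and its
     * .Lenc_no_key_aliasing twin); "rsp" is assumed to be 128-byte
     * aligned already, as it is after "and $-128,%rsp". */
    uintptr_t avoid_key_aliasing(uintptr_t rsp, uintptr_t key)
    {
        uintptr_t key_off   = (key - 0x80) & 0xf80;   /* lea -0x80($key); and $end0,$in0 */
        uintptr_t stack_off = rsp & 0xf80;            /* and %rsp,$end0 */
        uintptr_t dist      = stack_off - key_off;    /* sub $in0,$end0 */

        /* jc:  frame already below the key's page offset, nothing to do.
         * jnc: at least 768 bytes above it, far enough away.            */
        if (stack_off >= key_off && dist < 768)
            rsp -= dist;                              /* sub $end0,%rsp */
        return rsp;
    }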
@@ -621,14 +632,25 @@ $code.=<<___;
        vzeroupper
 
        vmovdqu         ($ivp),$T1              # input counter value
-       sub             \$128,%rsp
+       add             \$-128,%rsp
        mov             12($ivp),$counter
        lea             .Lbswap_mask(%rip),$const
+       lea             -0x80($key),$in0        # borrow $in0
+       mov             \$0xf80,$end0           # borrow $end0
        lea             0x80($key),$key         # size optimization
        vmovdqu         ($const),$Ii            # borrow $Ii for .Lbswap_mask
-       and             \$-64,%rsp              # ensure stack alignment
+       and             \$-128,%rsp             # ensure stack alignment
        mov             0xf0-0x80($key),$rounds
 
+       and             $end0,$in0
+       and             %rsp,$end0
+       sub             $in0,$end0
+       jc              .Lenc_no_key_aliasing
+       cmp             \$768,$end0
+       jnc             .Lenc_no_key_aliasing
+       sub             $end0,%rsp              # avoid aliasing with key
+.Lenc_no_key_aliasing:
+
        lea             ($out),$in0
        lea             -0xc0($out,$len),$end0
        shr             \$4,$len
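The encrypt path applies the identical check, just under the .Lenc_no_key_aliasing label, so the C sketch after the decrypt hunk describes this code as well. Two smaller tweaks appear in both paths: "add \$-128,%rsp" reserves the same 128 bytes as the old "sub \$128,%rsp" but encodes with a sign-extended 8-bit immediate (-128 fits in a signed byte, +128 does not), saving three bytes per site, and the stack alignment is raised from 64 to 128 bytes, which zeroes the low seven bits of %rsp so that the 0xf80 mask above captures its full page offset.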