ghash-x86.pl: engage original MMX version in no-sse2 builds.

[openssl.git] / crypto / modes / asm / ghash-x86.pl
diff --git a/crypto/modes/asm/ghash-x86.pl b/crypto/modes/asm/ghash-x86.pl

index 2805bad0ba93a21d9f672235baae91949c8d7bf8..6b09669d474abbc55c8643b4460f0b6de770eab6 100644 (file)
--- a/crypto/modes/asm/ghash-x86.pl
+++ b/crypto/modes/asm/ghash-x86.pl
@@ -21,17 +21,18 @@
  #
  #              gcc 2.95.3(*)   MMX assembler   x86 assembler
  #
-# Pentium      100/112(**)     -               50
-# PIII         63 /77          12.2            24
-# P4           96 /122         18.0            84(***)
-# Opteron      50 /71          10.1            30
-# Core2                54 /68          8.6             18
+# Pentium      105/111(**)     -               50
+# PIII         68 /75          12.2            24
+# P4           125/125         17.8            84(***)
+# Opteron      66 /70          10.1            30
+# Core2                54 /67          8.4             18
  #
  # (*)  gcc 3.4.x was observed to generate few percent slower code,
  #      which is one of reasons why 2.95.3 results were chosen,
  #      another reason is lack of 3.4.x results for older CPUs;
-#      comparison is not completely fair, because C results are
-#      for vanilla "256B" implementations, not "528B";-)
+#      comparison with MMX results is not completely fair, because C
+#      results are for vanilla "256B" implementation, while
+#      assembler results are for "528B";-)
  # (**) second number is result for code compiled with -fPIC flag,
  #      which is actually more relevant, because assembler code is
  #      position-independent;
@@ -44,7 +45,7 @@
  
  # May 2010
  #
-# Add PCLMULQDQ version performing at 2.13 cycles per processed byte.
+# Add PCLMULQDQ version performing at 2.10 cycles per processed byte.
  # The question is how close is it to theoretical limit? The pclmulqdq
  # instruction latency appears to be 14 cycles and there can't be more
  # than 2 of them executing at any given time. This means that single
@@ -60,38 +61,36 @@
  # Before we proceed to this implementation let's have closer look at
  # the best-performing code suggested by Intel in their white paper.
  # By tracing inter-register dependencies Tmod is estimated as ~19
-# cycles and Naggr is 4, resulting in 2.05 cycles per processed byte.
-# As implied, this is quite optimistic estimate, because it does not
-# account for Karatsuba pre- and post-processing, which for a single
-# multiplication is ~5 cycles. Unfortunately Intel does not provide
-# performance data for GHASH alone, only for fused GCM mode. But
-# we can estimate it by subtracting CTR performance result provided
-# in "AES Instruction Set" white paper: 3.54-1.38=2.16 cycles per
-# processed byte or 5% off the estimate. It should be noted though
-# that 3.54 is GCM result for 16KB block size, while 1.38 is CTR for
-# 1KB block size, meaning that real number is likely to be a bit
-# further from estimate.
+# cycles and Naggr chosen by Intel is 4, resulting in 2.05 cycles per
+# processed byte. As implied, this is quite optimistic estimate,
+# because it does not account for Karatsuba pre- and post-processing,
+# which for a single multiplication is ~5 cycles. Unfortunately Intel
+# does not provide performance data for GHASH alone. But benchmarking
+# AES_GCM_encrypt ripped out of Fig. 15 of the white paper with aadt
+# alone resulted in 2.46 cycles per byte out of 16KB buffer. Note that
+# the result accounts even for pre-computing of degrees of the hash
+# key H, but its portion is negligible at 16KB buffer size.
  #
  # Moving on to the implementation in question. Tmod is estimated as
  # ~13 cycles and Naggr is 2, giving asymptotic performance of ...
  # 2.16. How is it possible that measured performance is better than
  # optimistic theoretical estimate? There is one thing Intel failed
-# to recognize. By fusing GHASH with CTR former's performance is
-# really limited to above (Tmul + Tmod/Naggr) equation. But if GHASH
-# procedure is detached, the modulo-reduction can be interleaved with
-# Naggr-1 multiplications and under ideal conditions even disappear
-# from the equation. So that optimistic theoretical estimate for this
-# implementation is ... 28/16=1.75, and not 2.16. Well, it's probably
-# way too optimistic, at least for such small Naggr. I'd argue that
-# (28+Tproc/Naggr), where Tproc is time required for Karatsuba pre-
-# and post-processing, is more realistic estimate. In this case it
-# gives ... 1.91 cycles per processed byte. Or in other words,
-# depending on how well we can interleave reduction and one of the
-# two multiplications the performance should be betwen 1.91 and 2.16.
-# As already mentioned, this implementation processes one byte [out
-# of 1KB buffer] in 2.13 cycles, while x86_64 counterpart - in 2.07.
-# x86_64 performance is better, because larger register bank allows
-# to interleave reduction and multiplication better.
+# to recognize. By serializing GHASH with CTR in same subroutine
+# former's performance is really limited to above (Tmul + Tmod/Naggr)
+# equation. But if GHASH procedure is detached, the modulo-reduction
+# can be interleaved with Naggr-1 multiplications at instruction level
+# and under ideal conditions even disappear from the equation. So that
+# optimistic theoretical estimate for this implementation is ...
+# 28/16=1.75, and not 2.16. Well, it's probably way too optimistic,
+# at least for such small Naggr. I'd argue that (28+Tproc/Naggr),
+# where Tproc is time required for Karatsuba pre- and post-processing,
+# is more realistic estimate. In this case it gives ... 1.91 cycles.
+# Or in other words, depending on how well we can interleave reduction
+# and one of the two multiplications the performance should be between
+# 1.91 and 2.16. As already mentioned, this implementation processes
+# one byte out of 8KB buffer in 2.10 cycles, while x86_64 counterpart
+# - in 2.02. x86_64 performance is better, because the larger register
+# bank allows reduction and multiplication to be interleaved better.
  #
  # Does it make sense to increase Naggr? To start with it's virtually
  # impossible in 32-bit mode, because of limited register bank
@@ -104,6 +103,16 @@
  # providing access to a Westmere-based system on behalf of Intel
  # Open Source Technology Centre.
  
+# January 2010
+#
+# Tweaked to optimize transitions between integer and FP operations
+# on same XMM register, PCLMULQDQ subroutine was measured to process
+# one byte in 2.07 cycles on Sandy Bridge, and in 2.12 - on Westmere.
+# The minor regression on Westmere is outweighed by ~15% improvement
+# on Sandy Bridge. Strangely enough attempt to modify 64-bit code in
+# similar manner resulted in almost 20% degradation on Sandy Bridge,
+# where original 64-bit code processes one byte in 1.95 cycles.
+
  $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  push(@INC,"${dir}","${dir}../../perlasm");
  require "x86asm.pl";
@@ -322,7 +331,7 @@ if (!$x86only) {{{
  
  &static_label("rem_4bit");
  
-if (0) {{      # "May" MMX version is kept for reference...
+if (!$sse2) {{ # pure-MMX "May" version...
  
  $S=12;         # shift factor for rem_4bit
  
@@ -824,8 +833,8 @@ my ($Xhi,$Xi,$Hkey)=@_;
         &pclmulqdq      ($Xi,$Hkey,0x00);       #######
         &pclmulqdq      ($Xhi,$Hkey,0x11);      #######
         &pclmulqdq      ($T1,$T2,0x00);         #######
-       &pxor           ($T1,$Xi);              #
-       &pxor           ($T1,$Xhi);             #
+       &xorps          ($T1,$Xi);              #
+       &xorps          ($T1,$Xhi);             #
  
         &movdqa         ($T2,$T1);              #
         &psrldq         ($T1,8);
@@ -945,7 +954,7 @@ my ($Xhi,$Xi) = @_;
  
         &movdqu         ($Xi,&QWP(0,$Xip));
         &movdqa         ($T3,&QWP(0,$const));
-       &movdqu         ($Hkey,&QWP(0,$Htbl));
+       &movups         ($Hkey,&QWP(0,$Htbl));
         &pshufb         ($Xi,$T3);
  
         &clmul64x64_T2  ($Xhi,$Xi,$Hkey);
@@ -988,7 +997,7 @@ my ($Xhi,$Xi) = @_;
         &pxor           ($Xi,$T1);              # Ii+Xi
  
         &clmul64x64_T2  ($Xhn,$Xn,$Hkey);       # H*Ii+1
-       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
  
         &lea            ($inp,&DWP(32,$inp));   # i+=2
         &sub            ($len,0x20);
@@ -997,7 +1006,7 @@ my ($Xhi,$Xi) = @_;
  &set_label("mod_loop");
         &clmul64x64_T2  ($Xhi,$Xi,$Hkey);       # H^2*(Ii+Xi)
         &movdqu         ($T1,&QWP(0,$inp));     # Ii
-       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
  
         &pxor           ($Xi,$Xn);              # (H*Ii+1) + H^2*(Ii+Xi)
         &pxor           ($Xhi,$Xhn);
@@ -1038,9 +1047,9 @@ my ($Xhi,$Xi) = @_;
           &pxor         ($Xi,$T2);              #
  
         &pclmulqdq      ($T1,$T3,0x00);         #######
-       &movdqu         ($Hkey,&QWP(16,$Htbl)); # load H^2
-       &pxor           ($T1,$Xn);              #
-       &pxor           ($T1,$Xhn);             #
+       &movups         ($Hkey,&QWP(16,$Htbl)); # load H^2
+       &xorps          ($T1,$Xn);              #
+       &xorps          ($T1,$Xhn);             #
  
         &movdqa         ($T3,$T1);              #
         &psrldq         ($T1,8);
@@ -1064,7 +1073,7 @@ my ($Xhi,$Xi) = @_;
         &test           ($len,$len);
         &jnz            (&label("done"));
  
-       &movdqu         ($Hkey,&QWP(0,$Htbl));  # load H
+       &movups         ($Hkey,&QWP(0,$Htbl));  # load H
  &set_label("odd_tail");
         &movdqu         ($T1,&QWP(0,$inp));     # Ii
         &pshufb         ($T1,$T3);