ghash-ia64.pl: excuse myself from implementing "528B" variant.

[openssl.git] / crypto / modes / asm / ghash-ia64.pl
diff --git a/crypto/modes/asm/ghash-ia64.pl b/crypto/modes/asm/ghash-ia64.pl

index 86c08c64778f518666910048a2a835202c7c4524..299848ff85492aa055021fdd828482da759c6441 100755 (executable)
--- a/crypto/modes/asm/ghash-ia64.pl
+++ b/crypto/modes/asm/ghash-ia64.pl
@@ -9,16 +9,25 @@
  #
  # March 2010
  #
-# The module implements "4-bit" Galois field multiplication and
-# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128 bytes shared table]. Streamed GHASH performance
-# was measured to be 6.35 cycles per processed byte on Itanium 2,
-# which is >90% better than Microsoft compiler generated code. Well,
-# the number should have been ~6.5. The deviation has everything to do
-# with the way performance is measured, as difference between GCM and
-# straightforward 128-bit counter mode. To anchor to something else
-# sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium
-# GHASH should run at ~8.5 cycles per byte.
+# The module implements "4-bit" GCM GHASH function and underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses 256 bytes per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.35 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. Well, the number should have been ~6.5. The deviation has
+# everything to do with the way performance is measured: as the
+# difference between GCM and straightforward 128-bit counter mode. For
+# comparison, the sha1-ia64.pl module processes one byte in 6.0 cycles.
+# On Itanium GHASH should run at ~8.5 cycles per byte.
+
+# Note about the "528B" variant. In the Itanium 2 case it makes less
+# sense to implement it, for the following reason. Because the number
+# of functional units is naturally limited, it's impossible to
+# implement the "528B" loop in 4 cycles, only in 5. This means that the
+# theoretical performance improvement can't exceed 20%; ~15% is more
+# realistic. This is considered below the threshold that would justify
+# implementing new code. Not to mention that on the original Itanium it
+# would actually run slower, spending >9 cycles per byte.
  
  $output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
  
@@ -142,13 +151,13 @@ gcm_ghash_4bit:
         .prologue
  { .mmi;        .save   ar.pfs,prevfs
         alloc   prevfs=ar.pfs,4,4,0,8
-       $ADDP   inp=15,in0                      // &inp[15]
+       $ADDP   inp=15,in2                      // &inp[15]
         mov     rem_4bitp=ip            }
-{ .mmi;        $ADDP   end=in1,in0                     // &inp[len]
-       $ADDP   Xi=15,in2                       // &Xi[15]
+{ .mmi;        $ADDP   end=in3,in2                     // &inp[len]
+       $ADDP   Xi=15,in0                       // &Xi[15]
         .save   ar.lc,prevlc
         mov     prevlc=ar.lc            };;
-{ .mmi;        $ADDP   Htbl=8,in3                      // &Htbl[0].lo
+{ .mmi;        $ADDP   Htbl=8,in1                      // &Htbl[0].lo
         mov     mask0xf0=0xf0
         .save   pr,prevpr
         mov     prevpr=pr               }