#
# March 2010
#
-# The module implements "4-bit" Galois field multiplication and
-# streamed GHASH function. "4-bit" means that it uses 256 bytes
-# per-key table [+128 bytes shared table]. Streamed GHASH performance
-# was measured to be 6.35 cycles per processed byte on Itanium 2,
-# which is >90% better than Microsoft compiler generated code. Well,
-# the number should have been ~6.5. The deviation has everything to do
-# with the way performance is measured, as difference between GCM and
-# straightforward 128-bit counter mode. To anchor to something else
-# sha1-ia64.pl module processes one byte in 6.0 cycles. On Itanium
-# GHASH should run at ~8.5 cycles per byte.
+# The module implements the "4-bit" GCM GHASH function and the underlying
+# single multiplication operation in GF(2^128). "4-bit" means that it
+# uses a 256-byte per-key table [+128 bytes shared table]. Streamed
+# GHASH performance was measured to be 6.35 cycles per processed byte
+# on Itanium 2, which is >90% better than Microsoft compiler generated
+# code. Well, the number should have been ~6.5. The deviation has
+# everything to do with the way performance is measured: as the
+# difference between GCM and straightforward 128-bit counter mode. To
+# anchor to something else, the sha1-ia64.pl module processes one byte
+# in 6.0 cycles. On Itanium GHASH should run at ~8.5 cycles per byte.
+
+# Note about the "528B" variant. In the Itanium 2 case it makes less
+# sense to implement it, for the following reason. Because the number of
+# functional units is naturally limited, it's impossible to implement
+# the "528B" loop in 4 cycles, only in 5. This means that theoretically
+# the performance improvement can't be more than 20%; ~15% is more
+# realistic. This is considered below the justification level for
+# implementing new code. Not to mention that on the original Itanium it
+# would actually run slower, spending >9 cycles per byte.
# Take the first command-line argument as the output file name; when one
# is given (and is a true value), reopen STDOUT onto that file for
# writing, and die with the OS error if the open fails.
$output=shift and (open STDOUT,">$output" or die "can't open $output: $!");
.prologue
{ .mmi; .save ar.pfs,prevfs
alloc prevfs=ar.pfs,4,4,0,8
- $ADDP inp=15,in0 // &inp[15]
+ $ADDP inp=15,in2 // &inp[15]
mov rem_4bitp=ip }
-{ .mmi; $ADDP end=in1,in0 // &inp[len]
- $ADDP Xi=15,in2 // &Xi[15]
+{ .mmi; $ADDP end=in3,in2 // &inp[len]
+ $ADDP Xi=15,in0 // &Xi[15]
.save ar.lc,prevlc
mov prevlc=ar.lc };;
-{ .mmi; $ADDP Htbl=8,in3 // &Htbl[0].lo
+{ .mmi; $ADDP Htbl=8,in1 // &Htbl[0].lo
mov mask0xf0=0xf0
.save pr,prevpr
mov prevpr=pr }