# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# result and therefore is engaged. On z196 it was measured to process
+# 8KB buffer ~7 faster than software implementation. It's not as
+# 8KB buffer ~7x faster than software implementation. It's not as
+# impressive for smaller buffer sizes, and for the smallest 16-byte
+# buffer it's actually almost 2 times slower, which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
$flavour = shift;
if ($flavour =~ /3[12]/) {
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-$softonly=1; # disable hardware support for now
+$softonly=0;
$Zhi="%r0";
$Zlo="%r1";
.align 32
gcm_gmult_4bit:
___
-$code.=<<___ if(!$softonly);
+$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
.align 32
.Lsoft_ghash:
___
-$cdoe.=<<___ if ($flavour =~ /3[12]/);
+$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
- sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)