# remains z/Architecture specific. On z990 it was measured to perform
# 2.8x better than 32-bit code generated by gcc 4.3.
+# March 2011.
+#
+# Support for hardware KIMD-GHASH is verified to produce correct
+# result and therefore is engaged. On z196 it was measured to process
+# 8KB buffer ~7 faster than software implementation. It's not as
+# 8KB buffer ~7x faster than software implementation. It's not as
+# impressive for smaller buffer sizes, and for the smallest 16-byte
+# buffer it's actually almost 2 times slower, which is the reason why
+# KIMD-GHASH is not used in gcm_gmult_4bit.
+
$flavour = shift;
if ($flavour =~ /3[12]/) {
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
-$softonly=1; # disable hardware support for now
+$softonly=0;
$Zhi="%r0";
$Zlo="%r1";
.align 32
gcm_gmult_4bit:
___
-$code.=<<___ if(!$softonly);
+$code.=<<___ if(!$softonly && 0); # hardware is slow for single block...
larl %r1,OPENSSL_s390xcap_P
lg %r0,0(%r1)
tmhl %r0,0x4000 # check for message-security-assist
.align 32
.Lsoft_ghash:
___
-$cdoe.=<<___ if ($flavour =~ /3[12]/);
+$code.=<<___ if ($flavour =~ /3[12]/);
llgfr $len,$len
___
$code.=<<___;
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
j .Lghash_inner
.align 16
.Lghash_inner:
srlg $Zlo,$Zlo,4
- sllg $tmp,$Zhi,60
- xg $Zlo,8($nlo,$Htbl)
srlg $Zhi,$Zhi,4
+ xg $Zlo,8($nlo,$Htbl)
llgc $xi,0($cnt,$Xi)
xg $Zhi,0($nlo,$Htbl)
sllg $nlo,$xi,4
sllg $rem1,$Zlo,3
xgr $Zlo,$tmp
ngr $rem1,$x78
+ sllg $tmp,$Zhi,60
brct $cnt,.Lghash_inner
- sllg $tmp,$Zhi,60
srlg $Zlo,$Zlo,4
srlg $Zhi,$Zhi,4
xg $Zlo,8($nlo,$Htbl)