Fix some assembler generating scripts for better unification

[openssl.git] / crypto / bn / asm / s390x-gf2m.pl
diff --git a/crypto/bn/asm/s390x-gf2m.pl b/crypto/bn/asm/s390x-gf2m.pl

index eb389b323a2188ce249fa43b63700134f31f12b0..1d76c9f416c3f0e6efbe63a09ffb87c36be559f2 100644 (file)
--- a/crypto/bn/asm/s390x-gf2m.pl
+++ b/crypto/bn/asm/s390x-gf2m.pl
@@ -12,17 +12,18 @@
  # The module implements bn_GF2m_mul_2x2 polynomial multiplication used
  # in bn_gf2m.c. It's kind of low-hanging mechanical port from C for
  # the time being... gcc 4.3 appeared to generate poor code, therefore
-# the effort. The module delivers 55%-90% improvement on haviest ECDSA
-# verify and ECDH benchmarks for 163- and 571-bit keys on z990, and
-# 25%-30% - on z196(*). This is for 64-bit build. In 32-bit "highgprs"
-# case improvement is even higher, for example on z990 it was measured
-# 80%-150%. ECDSA sign is modest 9%-12% faster. Keep in mind that
-# these coefficients are not ones for bn_GF2m_mul_2x2 itself, as not
-# all CPU time is burnt in it...
+# the effort. And indeed, the module delivers 55%-90%(*) improvement
+# on heaviest ECDSA verify and ECDH benchmarks for 163- and 571-bit
+# key lengths on z990, 30%-55%(*) - on z10, and 70%-110%(*) - on z196.
+# This is for 64-bit build. In 32-bit "highgprs" case improvement is
+# even higher, for example on z990 it was measured 80%-150%. ECDSA
+# sign is modest 9%-12% faster. Keep in mind that these coefficients
+# are not ones for bn_GF2m_mul_2x2 itself, as not all CPU time is
+# burnt in it...
  #
-# (*)  Though no improvement could be measured if compared to code
-#      generated by gcc 4.1. Keep in mind that z196 is out-of-order
-#      execution core and is better at executing poor code.
+# (*)  gcc 4.1 was observed to deliver better results than gcc 4.3,
+#      so that improvement coefficients can vary from one specific
+#      setup to another.
  
  $flavour = shift;
  
@@ -34,7 +35,7 @@ if ($flavour =~ /3[12]/) {
          $g="g";
  }
  
-while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
+while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
  $stdframe=16*$SIZE_T+4*8;
@@ -171,19 +172,19 @@ ___
  if ($SIZE_T==8) {
  my @r=map("%r$_",(6..9));
  $code.=<<___;
-       bras    $ra,_mul_1x1                    # a1·b1
+       bras    $ra,_mul_1x1                    # a1·b1
         stmg    $lo,$hi,16($rp)
  
         lg      $a,`$stdframe+128+4*$SIZE_T`($sp)
         lg      $b,`$stdframe+128+6*$SIZE_T`($sp)
-       bras    $ra,_mul_1x1                    # a0·b0
+       bras    $ra,_mul_1x1                    # a0·b0
         stmg    $lo,$hi,0($rp)
  
         lg      $a,`$stdframe+128+3*$SIZE_T`($sp)
         lg      $b,`$stdframe+128+5*$SIZE_T`($sp)
         xg      $a,`$stdframe+128+4*$SIZE_T`($sp)
         xg      $b,`$stdframe+128+6*$SIZE_T`($sp)
-       bras    $ra,_mul_1x1                    # (a0+a1)·(b0+b1)
+       bras    $ra,_mul_1x1                    # (a0+a1)·(b0+b1)
         lmg     @r[0],@r[3],0($rp)
  
         xgr     $lo,$hi