s390x assembler pack: adapt for -m31 build, see commentary in Configure

author Andy Polyakov <appro@openssl.org>

Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)

committer Andy Polyakov <appro@openssl.org>

Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)
author Andy Polyakov <appro@openssl.org>
Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)
committer Andy Polyakov <appro@openssl.org>
Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)
diff --git a/Configure b/Configure

index cb4cec619719fe12b4d20805d739d051fa9763c2..e2efb5b864b479e1c2524cf230baf9d2e578d2ee 100755 (executable)
--- a/Configure
+++ b/Configure
@@ -134,7 +134,7 @@ my $sparcv8_asm=":sparcv8.o:des_enc-sparc.o fcrypt_b.o::::::::::::void";
  my $alpha_asm="alphacpuid.o:bn_asm.o alpha-mont.o:::::sha1-alpha.o:::::::ghash-alpha.o:void";
  my $mips32_asm=":bn-mips.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o:::::::";
  my $mips64_asm=":bn-mips.o mips-mont.o::aes_cbc.o aes-mips.o:::sha1-mips.o sha256-mips.o sha512-mips.o:::::::";
-my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:void";
+my $s390x_asm="s390xcap.o s390xcpuid.o:bn-s390x.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o";
  my $armv4_asm=":bn_asm.o armv4-mont.o::aes_cbc.o aes-armv4.o:::sha1-armv4-large.o sha256-armv4.o sha512-armv4.o:::::::ghash-armv4.o:void";
  my $parisc11_asm="pariscid.o:bn_asm.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:32";
  my $parisc20_asm="pariscid.o:pa-risc2W.o parisc-mont.o::aes_core.o aes_cbc.o aes-parisc.o:::sha1-parisc.o sha256-parisc.o sha512-parisc.o::rc4-parisc.o:::::ghash-parisc.o:64";
@@ -356,7 +356,21 @@ my %table=(
  "linux-ia64-ecc","ecc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
  "linux-ia64-icc","icc:-DL_ENDIAN -DTERMIO -O2 -Wall -no_cpprt::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_RISC1 DES_INT:${ia64_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
  "linux-x86_64",        "gcc:-m64 -DL_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_UNROLL:${x86_64_asm}:elf:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
-"linux-s390x", "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+"linux64-s390x",       "gcc:-m64 -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:${s390x_asm}:64:dlfcn:linux-shared:-fPIC:-m64:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::64",
+#### So called "highgprs" target for z/Architecture CPUs
+# "Highgprs" is a kernel feature first implemented in Linux 2.6.32, see
+# /proc/cpuinfo. The idea is to preserve most significant bits of
+# general purpose registers not only upon 32-bit process context
+# switch, but even on asynchronous signal delivery to such process.
+# This makes it possible to deploy 64-bit instructions even in legacy
+# application context and achieve better [or should we say adequate]
+# performance. The build is binary compatible with linux-generic32,
+# and the idea is to be able to install the resulting libcrypto.so
+# alongside generic one, e.g. as /lib/highgprs/libcrypto.so.x.y, for
+# ldconfig and run-time linker to autodiscover. Unfortunately it
+# doesn't work just yet, because of a couple of bugs in glibc
+# sysdep/s390/dl-procinfo.c affecting ldconfig and ld.so.1...
+"linux32-s390x",       "gcc:-m31 -Wa,-mzarch -DB_ENDIAN -DTERMIO -O3 -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL:s390xcap.o s390xcpuid.o:bn_asm.o s390x-mont.o::aes_ctr.o aes-s390x.o:::sha1-s390x.o sha256-s390x.o sha512-s390x.o::rc4-s390x.o:::::ghash-s390x.o:31:dlfcn:linux-shared:-fPIC:-m31:.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR):::/highgprs",
  #### SPARC Linux setups
  # Ray Miller <ray.miller@computing-services.oxford.ac.uk> has patiently
  # assisted with debugging of following two configs.
diff --git a/config b/config

index 965884a6277b4b50599ee3db3fad00fca08ea8e6..bcc725eb18d9b8c8c54789faf55714341c3aa7d9 100755 (executable)
--- a/config
+++ b/config
@@ -629,7 +629,18 @@ case "$GUESSOS" in
    sh*-*-linux2)  OUT="linux-generic32"; options="$options -DL_ENDIAN" ;;
    m68k*-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
    s390-*-linux2) OUT="linux-generic32"; options="$options -DB_ENDIAN" ;;
-  s390x-*-linux2) OUT="linux-s390x" ;;
+  s390x-*-linux2)
+       # To be uncommented when glibc bug is fixed, see Configure...
+       #if egrep -e '^features.* highgprs' /proc/cpuinfo >/dev/null ; then
+       #  echo "WARNING! If you wish to build \"highgprs\" 32-bit library, then you"
+       #  echo "         have to invoke './Configure linux32-s390x' *manually*."
+       #  if [ "$TEST" = "false" -a -t -1 ]; then
+       #    echo "         You have about 5 seconds to press Ctrl-C to abort."
+       #    (trap "stty `stty -g`" 2 0; stty -icanon min 0 time 50; read waste) <&1
+       #  fi
+       #fi
+       OUT="linux64-s390x"
+       ;;
    x86_64-*-linux?) OUT="linux-x86_64" ;;
    *86-*-linux2) OUT="linux-elf"
         if [ "$GCCVER" -gt 28 ]; then
diff --git a/crypto/aes/asm/aes-s390x.pl b/crypto/aes/asm/aes-s390x.pl

index 4be64e3e512f23ce2d25d4e4c1cae0a68144e8a0..db963c9df0fa0fb17794ed353d68f0d4845f3461 100644 (file)
--- a/crypto/aes/asm/aes-s390x.pl
+++ b/crypto/aes/asm/aes-s390x.pl
@@ -60,6 +60,26 @@
  # maximum, but *on average* it would be as much as ~98%. Meaning that
  # worst case is unlike, it's like hitting ravine on plateau.
  
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2x better than code generated by gcc 4.3.
+
+$flavour = shift;  # first command-line argument selects the build flavour
+
+if ($flavour =~ /3[12]/) {  # flavour string contains "31" or "32"
+       $SIZE_T=4;  # scale factor for register-save offsets, e.g. 3*$SIZE_T($sp)
+       $g="";  # no suffix: stm${g}/lm${g}/st${g}/l${g} become stm/lm/st/l
+} else {
+       $SIZE_T=8;  # any other flavour uses 8-byte save slots
+       $g="g";  # "g" suffix: stmg/lmg/stg/lg
+}
+
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
@@ -82,6 +102,8 @@ $rounds="%r13";
  $ra="%r14";
  $sp="%r15";
  
+$stdframe=16*$SIZE_T+4*8;
+
  sub _data_word()
  { my $i;
      while(defined($i=shift)) { $code.=sprintf".long\t0x%08x,0x%08x\n",$i,$i; }
@@ -223,7 +245,7 @@ $code.=<<___ if (!$softonly);
  .Lesoft:
  ___
  $code.=<<___;
-       stmg    %r3,$ra,24($sp)
+       stm${g} %r3,$ra,3*$SIZE_T($sp)
  
         llgf    $s0,0($inp)
         llgf    $s1,4($inp)
@@ -233,20 +255,20 @@ $code.=<<___;
         larl    $tbl,AES_Te
         bras    $ra,_s390x_AES_encrypt
  
-       lg      $out,24($sp)
+       l${g}   $out,3*$SIZE_T($sp)
         st      $s0,0($out)
         st      $s1,4($out)
         st      $s2,8($out)
         st      $s3,12($out)
  
-       lmg     %r6,$ra,48($sp)
+       lm${g}  %r6,$ra,6*$SIZE_T($sp)
         br      $ra
  .size  AES_encrypt,.-AES_encrypt
  
  .type   _s390x_AES_encrypt,\@function
  .align 16
  _s390x_AES_encrypt:
-       stg     $ra,152($sp)
+       st${g}  $ra,`$stdframe-$SIZE_T`($sp)
         x       $s0,0($key)
         x       $s1,4($key)
         x       $s2,8($key)
@@ -410,7 +432,7 @@ _s390x_AES_encrypt:
         or      $s2,$i3
         or      $s3,$t3
  
-       lg      $ra,152($sp)
+       l${g}   $ra,`$stdframe-$SIZE_T`($sp)
         xr      $s0,$t0
         xr      $s1,$t2
         x       $s2,24($key)
@@ -549,7 +571,7 @@ $code.=<<___ if (!$softonly);
  .Ldsoft:
  ___
  $code.=<<___;
-       stmg    %r3,$ra,24($sp)
+       stm${g} %r3,$ra,3*$SIZE_T($sp)
  
         llgf    $s0,0($inp)
         llgf    $s1,4($inp)
@@ -559,20 +581,20 @@ $code.=<<___;
         larl    $tbl,AES_Td
         bras    $ra,_s390x_AES_decrypt
  
-       lg      $out,24($sp)
+       l${g}   $out,3*$SIZE_T($sp)
         st      $s0,0($out)
         st      $s1,4($out)
         st      $s2,8($out)
         st      $s3,12($out)
  
-       lmg     %r6,$ra,48($sp)
+       lm${g}  %r6,$ra,6*$SIZE_T($sp)
         br      $ra
  .size  AES_decrypt,.-AES_decrypt
  
  .type   _s390x_AES_decrypt,\@function
  .align 16
  _s390x_AES_decrypt:
-       stg     $ra,152($sp)
+       st${g}  $ra,`$stdframe-$SIZE_T`($sp)
         x       $s0,0($key)
         x       $s1,4($key)
         x       $s2,8($key)
@@ -716,7 +738,7 @@ _s390x_AES_decrypt:
         nr      $i1,$mask
         nr      $i2,$mask
  
-       lg      $ra,152($sp)
+       l${g}   $ra,`$stdframe-$SIZE_T`($sp)
         or      $s1,$t1
         l       $t0,16($key)
         l       $t1,20($key)
@@ -750,9 +772,9 @@ $code.=<<___;
  .align 16
  AES_set_encrypt_key:
         lghi    $t0,0
-       clgr    $inp,$t0
+       cl${g}r $inp,$t0
         je      .Lminus1
-       clgr    $key,$t0
+       cl${g}r $key,$t0
         je      .Lminus1
  
         lghi    $t0,128
@@ -810,7 +832,7 @@ ___
  $code.=<<___;
  .align 16
  .Lekey_internal:
-       stmg    %r6,%r13,48($sp)        # all non-volatile regs
+       stm${g} %r6,%r13,6*$SIZE_T($sp) # all non-volatile regs
  
         larl    $tbl,AES_Te+2048
  
@@ -871,7 +893,7 @@ $code.=<<___;
         la      $t3,4($t3)              # i++
         brct    $rounds,.L128_loop
         lghi    %r2,0
-       lmg     %r6,%r13,48($sp)
+       lm${g}  %r6,%r13,6*$SIZE_T($sp)
         br      $ra
  
  .align 16
@@ -919,7 +941,7 @@ $code.=<<___;
         st      $s3,36($key)
         brct    $rounds,.L192_continue
         lghi    %r2,0
-       lmg     %r6,%r13,48($sp)
+       lm${g}  %r6,%r13,6*$SIZE_T($sp)
         br      $ra
  
  .align 16
@@ -981,7 +1003,7 @@ $code.=<<___;
         st      $s3,44($key)
         brct    $rounds,.L256_continue
         lghi    %r2,0
-       lmg     %r6,%r13,48($sp)
+       lm${g}  %r6,%r13,6*$SIZE_T($sp)
         br      $ra
  
  .align 16
@@ -1032,11 +1054,11 @@ $code.=<<___;
  .type  AES_set_decrypt_key,\@function
  .align 16
  AES_set_decrypt_key:
-       stg     $key,32($sp)            # I rely on AES_set_encrypt_key to
-       stg     $ra,112($sp)            # save non-volatile registers!
+       st${g}  $key,4*$SIZE_T($sp)     # I rely on AES_set_encrypt_key to
+       st${g}  $ra,14*$SIZE_T($sp)     # save non-volatile registers!
         bras    $ra,AES_set_encrypt_key
-       lg      $key,32($sp)
-       lg      $ra,112($sp)
+       l${g}   $key,4*$SIZE_T($sp)
+       l${g}   $ra,14*$SIZE_T($sp)
         ltgr    %r2,%r2
         bnzr    $ra
  ___
@@ -1051,11 +1073,11 @@ $code.=<<___ if (!$softonly);
  
  .align 16
  .Ldkey_internal:
-       stg     $key,32($sp)
-       stg     $ra,40($sp)
+       st${g}  $key,4*$SIZE_T($sp)
+       st${g}  $ra,14*$SIZE_T($sp)
         bras    $ra,.Lekey_internal
-       lg      $key,32($sp)
-       lg      $ra,40($sp)
+       l${g}   $key,4*$SIZE_T($sp)
+       l${g}   $ra,14*$SIZE_T($sp)
  ___
  $code.=<<___;
  
@@ -1136,7 +1158,7 @@ $code.=<<___;
         la      $key,4($key)
         brct    $rounds,.Lmix
  
-       lmg     %r6,%r13,48($sp)# as was saved by AES_set_encrypt_key!
+       lm${g}  %r6,%r13,6*$SIZE_T($sp)# as was saved by AES_set_encrypt_key!
         lghi    %r2,0
         br      $ra
  .size  AES_set_decrypt_key,.-AES_set_decrypt_key
@@ -1176,7 +1198,7 @@ $code.=<<___ if (!$softonly);
         l       %r0,240($key)   # load kmc code
         lghi    $key,15         # res=len%16, len-=res;
         ngr     $key,$len
-       slgr    $len,$key
+       sl${g}r $len,$key
         la      %r1,16($sp)     # parameter block - ivec || key
         jz      .Lkmc_truncated
         .long   0xb92f0042      # kmc %r4,%r2
@@ -1194,34 +1216,34 @@ $code.=<<___ if (!$softonly);
         tmll    %r0,0x80
         jnz     .Lkmc_truncated_dec
         lghi    %r1,0
-       stg     %r1,128($sp)
-       stg     %r1,136($sp)
+       stg     %r1,16*$SIZE_T($sp)
+       stg     %r1,16*$SIZE_T+8($sp)
         bras    %r1,1f
-       mvc     128(1,$sp),0($inp)
+       mvc     16*$SIZE_T(1,$sp),0($inp)
  1:     ex      $key,0(%r1)
         la      %r1,16($sp)     # restore parameter block
-       la      $inp,128($sp)
+       la      $inp,16*$SIZE_T($sp)
         lghi    $len,16
         .long   0xb92f0042      # kmc %r4,%r2
         j       .Lkmc_done
  .align 16
  .Lkmc_truncated_dec:
-       stg     $out,64($sp)
-       la      $out,128($sp)
+       st${g}  $out,4*$SIZE_T($sp)
+       la      $out,16*$SIZE_T($sp)
         lghi    $len,16
         .long   0xb92f0042      # kmc %r4,%r2
-       lg      $out,64($sp)
+       l${g}   $out,4*$SIZE_T($sp)
         bras    %r1,2f
-       mvc     0(1,$out),128($sp)
+       mvc     0(1,$out),16*$SIZE_T($sp)
  2:     ex      $key,0(%r1)
         j       .Lkmc_done
  .align 16
  .Lcbc_software:
  ___
  $code.=<<___;
-       stmg    $key,$ra,40($sp)
+       stm${g} $key,$ra,5*$SIZE_T($sp)
         lhi     %r0,0
-       cl      %r0,164($sp)
+       cl      %r0,`$stdframe+$SIZE_T-4`($sp)
         je      .Lcbc_decrypt
  
         larl    $tbl,AES_Te
@@ -1232,10 +1254,10 @@ $code.=<<___;
         llgf    $s3,12($ivp)
  
         lghi    $t0,16
-       slgr    $len,$t0
+       sl${g}r $len,$t0
         brc     4,.Lcbc_enc_tail        # if borrow
  .Lcbc_enc_loop:
-       stmg    $inp,$out,16($sp)
+       stm${g} $inp,$out,2*$SIZE_T($sp)
         x       $s0,0($inp)
         x       $s1,4($inp)
         x       $s2,8($inp)
@@ -1244,7 +1266,7 @@ $code.=<<___;
  
         bras    $ra,_s390x_AES_encrypt
  
-       lmg     $inp,$key,16($sp)
+       lm${g}  $inp,$key,2*$SIZE_T($sp)
         st      $s0,0($out)
         st      $s1,4($out)
         st      $s2,8($out)
@@ -1253,33 +1275,33 @@ $code.=<<___;
         la      $inp,16($inp)
         la      $out,16($out)
         lghi    $t0,16
-       ltgr    $len,$len
+       lt${g}r $len,$len
         jz      .Lcbc_enc_done
-       slgr    $len,$t0
+       sl${g}r $len,$t0
         brc     4,.Lcbc_enc_tail        # if borrow
         j       .Lcbc_enc_loop
  .align 16
  .Lcbc_enc_done:
-       lg      $ivp,48($sp)
+       l${g}   $ivp,6*$SIZE_T($sp)
         st      $s0,0($ivp)
         st      $s1,4($ivp)     
         st      $s2,8($ivp)
         st      $s3,12($ivp)
  
-       lmg     %r7,$ra,56($sp)
+       lm${g}  %r7,$ra,7*$SIZE_T($sp)
         br      $ra
  
  .align 16
  .Lcbc_enc_tail:
         aghi    $len,15
         lghi    $t0,0
-       stg     $t0,128($sp)
-       stg     $t0,136($sp)
+       stg     $t0,16*$SIZE_T($sp)
+       stg     $t0,16*$SIZE_T+8($sp)
         bras    $t1,3f
-       mvc     128(1,$sp),0($inp)
+       mvc     16*$SIZE_T(1,$sp),0($inp)
  3:     ex      $len,0($t1)
         lghi    $len,0
-       la      $inp,128($sp)
+       la      $inp,16*$SIZE_T($sp)
         j       .Lcbc_enc_loop
  
  .align 16
@@ -1288,10 +1310,10 @@ $code.=<<___;
  
         lg      $t0,0($ivp)
         lg      $t1,8($ivp)
-       stmg    $t0,$t1,128($sp)
+       stmg    $t0,$t1,16*$SIZE_T($sp)
  
  .Lcbc_dec_loop:
-       stmg    $inp,$out,16($sp)
+       stm${g} $inp,$out,2*$SIZE_T($sp)
         llgf    $s0,0($inp)
         llgf    $s1,4($inp)
         llgf    $s2,8($inp)
@@ -1300,7 +1322,7 @@ $code.=<<___;
  
         bras    $ra,_s390x_AES_decrypt
  
-       lmg     $inp,$key,16($sp)
+       lm${g}  $inp,$key,2*$SIZE_T($sp)
         sllg    $s0,$s0,32
         sllg    $s2,$s2,32
         lr      $s0,$s1
@@ -1308,15 +1330,15 @@ $code.=<<___;
  
         lg      $t0,0($inp)
         lg      $t1,8($inp)
-       xg      $s0,128($sp)
-       xg      $s2,136($sp)
+       xg      $s0,16*$SIZE_T($sp)
+       xg      $s2,16*$SIZE_T+8($sp)
         lghi    $s1,16
-       slgr    $len,$s1
+       sl${g}r $len,$s1
         brc     4,.Lcbc_dec_tail        # if borrow
         brc     2,.Lcbc_dec_done        # if zero
         stg     $s0,0($out)
         stg     $s2,8($out)
-       stmg    $t0,$t1,128($sp)
+       stmg    $t0,$t1,16*$SIZE_T($sp)
  
         la      $inp,16($inp)
         la      $out,16($out)
@@ -1326,7 +1348,7 @@ $code.=<<___;
         stg     $s0,0($out)
         stg     $s2,8($out)
  .Lcbc_dec_exit:
-       lmg     $ivp,$ra,48($sp)
+       lm${g}  %r6,$ra,6*$SIZE_T($sp)
         stmg    $t0,$t1,0($ivp)
  
         br      $ra
@@ -1334,10 +1356,10 @@ $code.=<<___;
  .align 16
  .Lcbc_dec_tail:
         aghi    $len,15
-       stg     $s0,128($sp)
-       stg     $s2,136($sp)
+       stg     $s0,16*$SIZE_T($sp)
+       stg     $s2,16*$SIZE_T+8($sp)
         bras    $s1,4f
-       mvc     0(1,$out),128($sp)
+       mvc     0(1,$out),16*$SIZE_T($sp)
  4:     ex      $len,0($s1)
         j       .Lcbc_dec_exit
  .size  AES_cbc_encrypt,.-AES_cbc_encrypt
@@ -1359,6 +1381,7 @@ $code.=<<___;
  .type  AES_ctr32_encrypt,\@function
  .align 16
  AES_ctr32_encrypt:
+       llgfr   $len,$len       # safe in ctr32 subroutine even in 64-bit case
  ___
  $code.=<<___ if (!$softonly);
         l       %r0,240($key)
@@ -1366,7 +1389,7 @@ $code.=<<___ if (!$softonly);
         clr     %r0,%r1
         jl      .Lctr32_software
  
-       stmg    %r6,$s3,48($sp)
+       stm${g} %r6,$s3,6*$SIZE_T($sp)
  
         slgr    $out,$inp
         la      %r1,0($key)     # %r1 is permanent copy of $key
@@ -1388,14 +1411,14 @@ $code.=<<___ if (!$softonly);
  
         la      $sp,1024($s0)   # alloca
         srlg    $fp,$fp,4       # convert bytes to blocks, minimum 16
-       stg     $s2,0($sp)      # back-chain
-       stg     $fp,8($sp)
+       st${g}  $s2,0($sp)      # back-chain
+       st${g}  $fp,$SIZE_T($sp)
  
         slgr    $len,$fp
         brc     1,.Lctr32_hw_loop       # not zero, no borrow
         algr    $fp,$len        # input is shorter than allocated buffer
         lghi    $len,0
-       stg     $fp,8($sp)
+       st${g}  $fp,$SIZE_T($sp)
  
  .Lctr32_hw_loop:
         la      $s2,16($sp)
@@ -1432,8 +1455,8 @@ $code.=<<___ if (!$softonly);
         lghi    $len,0
         brc     4+1,.Lctr32_hw_loop     # not zero
  
-       lg      $s0,0($sp)
-       lg      $s1,8($sp)
+       l${g}   $s0,0($sp)
+       l${g}   $s1,$SIZE_T($sp)
         la      $s2,16($sp)
  .Lctr32_hw_zap:
         stg     $s0,0($s2)
@@ -1442,30 +1465,30 @@ $code.=<<___ if (!$softonly);
         brct    $s1,.Lctr32_hw_zap
  
         la      $sp,0($s0)
-       lmg     %r6,$s3,48($sp)
+       lm${g}  %r6,$s3,6*$SIZE_T($sp)
         br      $ra
  .align 16
  .Lctr32_software:
  ___
  $code.=<<___;
-       stmg    $key,$ra,40($sp)
-       slgr    $out,$inp
+       stm${g} $key,$ra,5*$SIZE_T($sp)
+       sl${g}r $out,$inp
         larl    $tbl,AES_Te
         llgf    $t1,12($ivp)
  
  .Lctr32_loop:
-       stmg    $inp,$len,16($sp)
+       stm${g} $inp,$len,2*$SIZE_T($sp)
         llgf    $s0,0($ivp)
         llgf    $s1,4($ivp)
         llgf    $s2,8($ivp)
         lgr     $s3,$t1
-       st      $t1,128($sp)
+       st      $t1,16*$SIZE_T($sp)
         lgr     %r4,$key
  
         bras    $ra,_s390x_AES_encrypt
  
-       lmg     $inp,$ivp,16($sp)
-       llgf    $t1,128($sp)
+       lm${g}  $inp,$ivp,2*$SIZE_T($sp)
+       llgf    $t1,16*$SIZE_T($sp)
         x       $s0,0($inp)
         x       $s1,4($inp)
         x       $s2,8($inp)
@@ -1479,7 +1502,7 @@ $code.=<<___;
         ahi     $t1,1           # 32-bit increment
         brct    $len,.Lctr32_loop
  
-       lmg     %r6,$ra,48($sp)
+       lm${g}  %r6,$ra,6*$SIZE_T($sp)
         br      $ra
  .size  AES_ctr32_encrypt,.-AES_ctr32_encrypt
  ___
diff --git a/crypto/bn/asm/s390x-mont.pl b/crypto/bn/asm/s390x-mont.pl

index 502fa2e01fdecf09bd729469c96eddf6be07f289..0c5f0638e10b523364d934e9d32d8a3ec05b7861 100644 (file)
--- a/crypto/bn/asm/s390x-mont.pl
+++ b/crypto/bn/asm/s390x-mont.pl
@@ -32,9 +32,33 @@
  # Reschedule to minimize/avoid Address Generation Interlock hazard,
  # make inner loops counter-based.
  
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. Compatibility with 32-bit BN_ULONG
+# is achieved by swapping words after 64-bit loads, follow _dswap-s.
+# On z990 it was measured to perform 2.6-2.2 times better, less for
+# longer keys...
+
+$flavour = shift;  # first command-line argument selects the build flavour
+
+if ($flavour =~ /3[12]/) {  # flavour string contains "31" or "32"
+       $SIZE_T=4;  # scales $stdframe and save-area offsets; also enables _dswap expansion
+       $g="";  # no suffix: stm${g}/lm${g}/st${g}/l${g}/cl${g} become stm/lm/st/l/cl
+} else {
+       $SIZE_T=8;  # any other flavour uses 8-byte save slots
+       $g="g";  # "g" suffix: stmg/lmg/stg/lg/clg
+}
+
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
+$stdframe=16*$SIZE_T+4*8;
+
  $mn0="%r0";
  $num="%r1";
  
@@ -63,34 +87,44 @@ $code.=<<___;
  .globl bn_mul_mont
  .type  bn_mul_mont,\@function
  bn_mul_mont:
-       lgf     $num,164($sp)   # pull $num
-       sla     $num,3          # $num to enumerate bytes
+       lgf     $num,`$stdframe+$SIZE_T-4`($sp) # pull $num
+       sla     $num,`log($SIZE_T)/log(2)`      # $num to enumerate bytes
         la      $bp,0($num,$bp)
  
-       stg     %r2,16($sp)
+       st${g}  %r2,2*$SIZE_T($sp)
  
         cghi    $num,16         #
         lghi    %r2,0           #
         blr     %r14            # if($num<16) return 0;
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+       tmll    $num,4
+       bnzr    %r14            # if ($num&1) return 0;
+___
+$code.=<<___ if ($flavour !~ /3[12]/);
         cghi    $num,128        #
         bhr     %r14            # if($num>128) return 0;
+___
+$code.=<<___;
+       stm${g} %r3,%r15,3*$SIZE_T($sp)
  
-       stmg    %r3,%r15,24($sp)
-
-       lghi    $rp,-160-8      # leave room for carry bit
+       lghi    $rp,-$stdframe-8        # leave room for carry bit
         lcgr    $j,$num         # -$num
         lgr     %r0,$sp
         la      $rp,0($rp,$sp)
         la      $sp,0($j,$rp)   # alloca
-       stg     %r0,0($sp)      # back chain
+       st${g}  %r0,0($sp)      # back chain
  
         sra     $num,3          # restore $num
         la      $bp,0($j,$bp)   # restore $bp
         ahi     $num,-1         # adjust $num for inner loop
         lg      $n0,0($n0)      # pull n0
+       _dswap  $n0
  
         lg      $bi,0($bp)
+       _dswap  $bi
         lg      $alo,0($ap)
+       _dswap  $alo
         mlgr    $ahi,$bi        # ap[0]*bp[0]
         lgr     $AHI,$ahi
  
@@ -98,6 +132,7 @@ bn_mul_mont:
         msgr    $mn0,$n0
  
         lg      $nlo,0($np)     #
+       _dswap  $nlo
         mlgr    $nhi,$mn0       # np[0]*m1
         algr    $nlo,$alo       # +="tp[0]"
         lghi    $NHI,0
@@ -109,12 +144,14 @@ bn_mul_mont:
  .align 16
  .L1st:
         lg      $alo,0($j,$ap)
+       _dswap  $alo
         mlgr    $ahi,$bi        # ap[j]*bp[0]
         algr    $alo,$AHI
         lghi    $AHI,0
         alcgr   $AHI,$ahi
  
         lg      $nlo,0($j,$np)
+       _dswap  $nlo
         mlgr    $nhi,$mn0       # np[j]*m1
         algr    $nlo,$NHI
         lghi    $NHI,0
@@ -122,22 +159,24 @@ bn_mul_mont:
         algr    $nlo,$alo
         alcgr   $NHI,$nhi
  
-       stg     $nlo,160-8($j,$sp)      # tp[j-1]=
+       stg     $nlo,$stdframe-8($j,$sp)        # tp[j-1]=
         la      $j,8($j)        # j++
         brct    $count,.L1st
  
         algr    $NHI,$AHI
         lghi    $AHI,0
         alcgr   $AHI,$AHI       # upmost overflow bit
-       stg     $NHI,160-8($j,$sp)
-       stg     $AHI,160($j,$sp)
+       stg     $NHI,$stdframe-8($j,$sp)
+       stg     $AHI,$stdframe($j,$sp)
         la      $bp,8($bp)      # bp++
  
  .Louter:
         lg      $bi,0($bp)      # bp[i]
+       _dswap  $bi
         lg      $alo,0($ap)
+       _dswap  $alo
         mlgr    $ahi,$bi        # ap[0]*bp[i]
-       alg     $alo,160($sp)   # +=tp[0]
+       alg     $alo,$stdframe($sp)     # +=tp[0]
         lghi    $AHI,0
         alcgr   $AHI,$ahi
  
@@ -145,6 +184,7 @@ bn_mul_mont:
         msgr    $mn0,$n0        # tp[0]*n0
  
         lg      $nlo,0($np)     # np[0]
+       _dswap  $nlo
         mlgr    $nhi,$mn0       # np[0]*m1
         algr    $nlo,$alo       # +="tp[0]"
         lghi    $NHI,0
@@ -156,14 +196,16 @@ bn_mul_mont:
  .align 16
  .Linner:
         lg      $alo,0($j,$ap)
+       _dswap  $alo
         mlgr    $ahi,$bi        # ap[j]*bp[i]
         algr    $alo,$AHI
         lghi    $AHI,0
         alcgr   $ahi,$AHI
-       alg     $alo,160($j,$sp)# +=tp[j]
+       alg     $alo,$stdframe($j,$sp)# +=tp[j]
         alcgr   $AHI,$ahi
  
         lg      $nlo,0($j,$np)
+       _dswap  $nlo
         mlgr    $nhi,$mn0       # np[j]*m1
         algr    $nlo,$NHI
         lghi    $NHI,0
@@ -171,31 +213,33 @@ bn_mul_mont:
         algr    $nlo,$alo       # +="tp[j]"
         alcgr   $NHI,$nhi
  
-       stg     $nlo,160-8($j,$sp)      # tp[j-1]=
+       stg     $nlo,$stdframe-8($j,$sp)        # tp[j-1]=
         la      $j,8($j)        # j++
         brct    $count,.Linner
  
         algr    $NHI,$AHI
         lghi    $AHI,0
         alcgr   $AHI,$AHI
-       alg     $NHI,160($j,$sp)# accumulate previous upmost overflow bit
+       alg     $NHI,$stdframe($j,$sp)# accumulate previous upmost overflow bit
         lghi    $ahi,0
         alcgr   $AHI,$ahi       # new upmost overflow bit
-       stg     $NHI,160-8($j,$sp)
-       stg     $AHI,160($j,$sp)
+       stg     $NHI,$stdframe-8($j,$sp)
+       stg     $AHI,$stdframe($j,$sp)
  
         la      $bp,8($bp)      # bp++
-       clg     $bp,160+8+32($j,$sp)    # compare to &bp[num]
+       cl${g}  $bp,`$stdframe+8+4*$SIZE_T`($j,$sp)     # compare to &bp[num]
         jne     .Louter
  
-       lg      $rp,160+8+16($j,$sp)    # reincarnate rp
-       la      $ap,160($sp)
+       l${g}   $rp,`$stdframe+8+2*$SIZE_T`($j,$sp)     # reincarnate rp
+       la      $ap,$stdframe($sp)
         ahi     $num,1          # restore $num, incidentally clears "borrow"
  
         la      $j,0(%r0)
         lr      $count,$num
  .Lsub: lg      $alo,0($j,$ap)
-       slbg    $alo,0($j,$np)
+       lg      $nlo,0($j,$np)
+       _dswap  $nlo
+       slbgr   $alo,$nlo
         stg     $alo,0($j,$rp)
         la      $j,8($j)
         brct    $count,.Lsub
@@ -210,19 +254,24 @@ bn_mul_mont:
  
         la      $j,0(%r0)
         lgr     $count,$num
-.Lcopy:        lg      $alo,0($j,$ap)  # copy or in-place refresh
-       stg     $j,160($j,$sp)  # zap tp
+.Lcopy:        lg      $alo,0($j,$ap)          # copy or in-place refresh
+       _dswap  $alo
+       stg     $j,$stdframe($j,$sp)    # zap tp
         stg     $alo,0($j,$rp)
         la      $j,8($j)
         brct    $count,.Lcopy
  
-       la      %r1,160+8+48($j,$sp)
-       lmg     %r6,%r15,0(%r1)
+       la      %r1,`$stdframe+8+6*$SIZE_T`($j,$sp)
+       lm${g}  %r6,%r15,0(%r1)
         lghi    %r2,1           # signal "processed"
         br      %r14
  .size  bn_mul_mont,.-bn_mul_mont
  .string        "Montgomery Multiplication for s390x, CRYPTOGAMS by <appro\@openssl.org>"
  ___
  
-print $code;
+foreach (split("\n",$code)) {  # post-process the generated assembly one line at a time
+       s/\`([^\`]*)\`/eval $1/ge;  # replace each back-ticked Perl expression with its evaluated value
+       s/_dswap\s+(%r[0-9]+)/sprintf("rllg\t%s,%s,32",$1,$1) if($SIZE_T==4)/e;  # _dswap pseudo-op: becomes "rllg reg,reg,32" when $SIZE_T==4; otherwise replaced by an empty string
+       print $_,"\n";  # re-append the newline removed by split
+}
  close STDOUT;
diff --git a/crypto/modes/asm/ghash-s390x.pl b/crypto/modes/asm/ghash-s390x.pl

index d7689de541fefda513f9d359ddfbada7ad5b8a35..16ad034fc12d61c165d7d21adde534ab89d09d3b 100644 (file)
--- a/crypto/modes/asm/ghash-s390x.pl
+++ b/crypto/modes/asm/ghash-s390x.pl
@@ -18,6 +18,26 @@
  # and the result should be close to 12. In the lack of instruction-
  # level profiling data it's impossible to tell why...
  
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 2.8x better than 32-bit code generated by gcc 4.3.
+
+$flavour = shift;  # first command-line argument selects the build flavour
+
+if ($flavour =~ /3[12]/) {  # flavour string contains "31" or "32"
+       $SIZE_T=4;  # scale factor for register-save offsets, e.g. 6*$SIZE_T($sp)
+       $g="";  # no suffix: stm${g}/lm${g} become stm/lm
+} else {
+       $SIZE_T=8;  # any other flavour uses 8-byte save slots
+       $g="g";  # "g" suffix: stmg/lmg
+}
+
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
@@ -74,7 +94,7 @@ $code.=<<___ if(!$softonly);
  .Lsoft_gmult:
  ___
  $code.=<<___;
-       stmg    %r6,%r14,48($sp)
+       stm${g} %r6,%r14,6*$SIZE_T($sp)
  
         aghi    $Xi,-1
         lghi    $len,1
@@ -109,8 +129,11 @@ $code.=<<___ if(!$softonly);
  .align 32
  .Lsoft_ghash:
  ___
+$code.=<<___ if ($flavour =~ /3[12]/);  # 31/32-bit flavour only; must append to $code (was misspelled "$cdoe")
+       llgfr   $len,$len       # clear upper half of the 64-bit length register
+___
  $code.=<<___;
-       stmg    %r6,%r14,48($sp)
+       stm${g} %r6,%r14,6*$SIZE_T($sp)
  
         aghi    $Xi,-1
         srlg    $len,$len,4
@@ -209,7 +232,7 @@ $code.=<<___;
         xgr     $Zhi,$tmp
         stg     $Zlo,8+1($Xi)
         stg     $Zhi,0+1($Xi)
-       lmg     %r6,%r14,48($sp)
+       lm${g}  %r6,%r14,6*$SIZE_T($sp)
         br      %r14
  .type  gcm_ghash_4bit,\@function
  .size  gcm_ghash_4bit,(.-gcm_ghash_4bit)
diff --git a/crypto/rc4/asm/rc4-s390x.pl b/crypto/rc4/asm/rc4-s390x.pl

index f26c515e78c33a4950c29acd00f6b7fbdcde3053..1aa754820c64e905cf75956c27aff6866e172dae 100644 (file)
--- a/crypto/rc4/asm/rc4-s390x.pl
+++ b/crypto/rc4/asm/rc4-s390x.pl
@@ -13,6 +13,26 @@
  # "cluster" Address Generation Interlocks, so that one pipeline stall
  # resolves several dependencies.
  
+# November 2010.
+#
+# Adapt for -m31 build. If kernel supports what's called "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's "z-CPU". Latter implies that the code
+# remains z/Architecture specific. On z990 it was measured to perform
+# 50% better than code generated by gcc 4.3.
+
+$flavour = shift;  # first command-line argument selects the build flavour
+
+if ($flavour =~ /3[12]/) {  # flavour string contains "31" or "32"
+       $SIZE_T=4;  # scale factor for register-save offsets, e.g. 6*$SIZE_T($sp)
+       $g="";  # no suffix: stm${g}/lm${g} become stm/lm
+} else {
+       $SIZE_T=8;  # any other flavour uses 8-byte save slots
+       $g="g";  # "g" suffix: stmg/lmg
+}
+
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
@@ -42,7 +62,12 @@ $code.=<<___;
  .type  RC4,\@function
  .align 64
  RC4:
-       stmg    %r6,%r11,48($sp)
+       stm${g} %r6,%r11,6*$SIZE_T($sp)
+___
+$code.=<<___ if ($flavour =~ /3[12]/);
+       llgfr   $len,$len
+___
+$code.=<<___;
         llgc    $XX[0],0($key)
         llgc    $YY,1($key)
         la      $XX[0],1($XX[0])
@@ -93,7 +118,7 @@ $code.=<<___;
         xgr     $acc,$TX[1]
         stg     $acc,0($out)
         la      $out,8($out)
-       brct    $cnt,.Loop8
+       brctg   $cnt,.Loop8
  
  .Lshort:
         lghi    $acc,7
@@ -125,7 +150,7 @@ $code.=<<___;
         ahi     $XX[0],-1
         stc     $XX[0],0($key)
         stc     $YY,1($key)
-       lmg     %r6,%r11,48($sp)
+       lm${g}  %r6,%r11,6*$SIZE_T($sp)
         br      $rp
  .size  RC4,.-RC4
  .string        "RC4 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
@@ -150,7 +175,7 @@ $code.=<<___;
  .type  RC4_set_key,\@function
  .align 64
  RC4_set_key:
-       stmg    %r6,%r8,48($sp)
+       stm${g} %r6,%r8,6*$SIZE_T($sp)
         lhi     $cnt,256
         la      $idx,0(%r0)
         sth     $idx,0($key)
@@ -183,7 +208,7 @@ RC4_set_key:
         la      $iinp,0(%r0)
         j       .L2ndloop
  .Ldone:
-       lmg     %r6,%r8,48($sp)
+       lm${g}  %r6,%r8,6*$SIZE_T($sp)
         br      $rp
  .size  RC4_set_key,.-RC4_set_key
  
diff --git a/crypto/s390xcpuid.S b/crypto/s390xcpuid.S

index 82312d8b4e9e903e127493e977095d6822fd3839..06815347e6a380bd6229382b49ec3bc014921f04 100644 (file)
--- a/crypto/s390xcpuid.S
+++ b/crypto/s390xcpuid.S
@@ -62,6 +62,9 @@ OPENSSL_wipe_cpu:
  .type  OPENSSL_cleanse,@function
  .align 16
  OPENSSL_cleanse:
+#if !defined(__s390x__) && !defined(__s390x)
+       llgfr   %r3,%r3
+#endif
         lghi    %r4,15
         lghi    %r0,0
         clgr    %r3,%r4
diff --git a/crypto/sha/asm/sha1-s390x.pl b/crypto/sha/asm/sha1-s390x.pl

index 0e38f8e36d4b7187780a3005834a9031505849ee..9193dda45eff974206f24a23480d27d604b8b4e6 100644 (file)
--- a/crypto/sha/asm/sha1-s390x.pl
+++ b/crypto/sha/asm/sha1-s390x.pl
@@ -21,8 +21,27 @@
  # instructions to favour dual-issue z10 pipeline. On z10 hardware is
  # "only" ~2.3x faster than software.
  
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific.
+
  $kimdfunc=1;   # magic function code for kimd instruction
  
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+       $SIZE_T=4;
+       $g="";
+} else {
+       $SIZE_T=8;
+       $g="g";
+}
+
  while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
  open STDOUT,">$output";
  
@@ -42,13 +61,14 @@ $t1="%r11";
  @X=("%r12","%r13","%r14");
  $sp="%r15";
  
-$frame=160+16*4;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*4;
  
  sub Xupdate {
  my $i=shift;
  
  $code.=<<___ if ($i==15);
-       lg      $prefetch,160($sp)      ### Xupdate(16) warm-up
+       lg      $prefetch,$stdframe($sp)        ### Xupdate(16) warm-up
         lr      $X[0],$X[2]
  ___
  return if ($i&1);      # Xupdate is vectorized and executed every 2nd cycle
@@ -58,8 +78,8 @@ $code.=<<___ if ($i<16);
  ___
  $code.=<<___ if ($i>=16);
         xgr     $X[0],$prefetch         ### Xupdate($i)
-       lg      $prefetch,`160+4*(($i+2)%16)`($sp)
-       xg      $X[0],`160+4*(($i+8)%16)`($sp)
+       lg      $prefetch,`$stdframe+4*(($i+2)%16)`($sp)
+       xg      $X[0],`$stdframe+4*(($i+8)%16)`($sp)
         xgr     $X[0],$prefetch
         rll     $X[0],$X[0],1
         rllg    $X[1],$X[0],32
@@ -68,7 +88,7 @@ $code.=<<___ if ($i>=16);
         lr      $X[2],$X[1]             # feedback
  ___
  $code.=<<___ if ($i<=70);
-       stg     $X[0],`160+4*($i%16)`($sp)
+       stg     $X[0],`$stdframe+4*($i%16)`($sp)
  ___
  unshift(@X,pop(@X));
  }
@@ -148,9 +168,9 @@ $code.=<<___ if ($kimdfunc);
         tmhl    %r0,0x4000      # check for message-security assist
         jz      .Lsoftware
         lghi    %r0,0
-       la      %r1,16($sp)
+       la      %r1,`2*$SIZE_T`($sp)
         .long   0xb93e0002      # kimd %r0,%r2
-       lg      %r0,16($sp)
+       lg      %r0,`2*$SIZE_T`($sp)
         tmhh    %r0,`0x8000>>$kimdfunc`
         jz      .Lsoftware
         lghi    %r0,$kimdfunc
@@ -165,11 +185,11 @@ $code.=<<___ if ($kimdfunc);
  ___
  $code.=<<___;
         lghi    %r1,-$frame
-       stg     $ctx,16($sp)
-       stmg    %r6,%r15,48($sp)
+       st${g}  $ctx,`2*$SIZE_T`($sp)
+       stm${g} %r6,%r15,`6*$SIZE_T`($sp)
         lgr     %r0,$sp
         la      $sp,0(%r1,$sp)
-       stg     %r0,0($sp)
+       st${g}  %r0,0($sp)
  
         larl    $t0,Ktable
         llgf    $A,0($ctx)
@@ -199,7 +219,7 @@ ___
  for (;$i<80;$i++)      { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
  $code.=<<___;
  
-       lg      $ctx,`$frame+16`($sp)
+       l${g}   $ctx,`$frame+2*$SIZE_T`($sp)
         la      $inp,64($inp)
         al      $A,0($ctx)
         al      $B,4($ctx)
@@ -211,9 +231,9 @@ $code.=<<___;
         st      $C,8($ctx)
         st      $D,12($ctx)
         st      $E,16($ctx)
-       brct    $len,.Lloop
+       brct${g} $len,.Lloop
  
-       lmg     %r6,%r15,`$frame+48`($sp)
+       lm${g}  %r6,%r15,`$frame+6*$SIZE_T`($sp)
         br      %r14
  .size  sha1_block_data_order,.-sha1_block_data_order
  .string        "SHA1 block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
diff --git a/crypto/sha/asm/sha512-s390x.pl b/crypto/sha/asm/sha512-s390x.pl

index 3a358a486092a6d43a3d9491cdb7313f413f436a..079a3fc78ab4fdaa82a6cc0aa6d29aa4c4f02fb9 100644 (file)
--- a/crypto/sha/asm/sha512-s390x.pl
+++ b/crypto/sha/asm/sha512-s390x.pl
@@ -26,6 +26,26 @@
  # favour dual-issue z10 pipeline. Hardware SHA256/512 is ~4.7x faster
  # than software.
  
+# November 2010.
+#
+# Adapt for -m31 build. If the kernel supports what's called the "highgprs"
+# feature on Linux [see /proc/cpuinfo], it's possible to use 64-bit
+# instructions and achieve "64-bit" performance even in a 31-bit legacy
+# application context. The feature is not specific to any particular
+# processor, as long as it's a "z-CPU". The latter implies that the code
+# remains z/Architecture specific. On z900 SHA256 was measured to
+# perform 2.4x and SHA512 - 13x better than code generated by gcc 4.3.
+
+$flavour = shift;
+
+if ($flavour =~ /3[12]/) {
+       $SIZE_T=4;
+       $g="";
+} else {
+       $SIZE_T=8;
+       $g="g";
+}
+
  $t0="%r0";
  $t1="%r1";
  $ctx="%r2";    $t2="%r2";
@@ -78,7 +98,8 @@ if ($output =~ /512/) {
  }
  $Func="sha${label}_block_data_order";
  $Table="K${label}";
-$frame=160+16*$SZ;
+$stdframe=16*$SIZE_T+4*8;
+$frame=$stdframe+16*$SZ;
  
  sub BODY_00_15 {
  my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
@@ -93,9 +114,9 @@ $code.=<<___;
         xgr     $t0,$t1
         $ROT    $t1,$t1,`$Sigma1[2]-$Sigma1[1]`
          xgr    $t2,$g
-       $ST     $T1,`160+$SZ*($i%16)`($sp)
+       $ST     $T1,`$stdframe+$SZ*($i%16)`($sp)
         xgr     $t0,$t1                 # Sigma1(e)
-       la      $T1,0($T1,$h)           # T1+=h
+       algr    $T1,$h                  # T1+=h
          ngr    $t2,$e
          lgr    $t1,$a
         algr    $T1,$t0                 # T1+=Sigma1(e)
@@ -113,7 +134,7 @@ $code.=<<___;
          ngr    $t2,$b
         algr    $h,$T1                  # h+=T1
          ogr    $t2,$t1                 # Maj(a,b,c)
-       la      $d,0($d,$T1)            # d+=T1
+       algr    $d,$T1                  # d+=T1
         algr    $h,$t2                  # h+=Maj(a,b,c)
  ___
  }
@@ -122,19 +143,19 @@ sub BODY_16_XX {
  my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  
  $code.=<<___;
-       $LD     $T1,`160+$SZ*(($i+1)%16)`($sp)  ### $i
-       $LD     $t1,`160+$SZ*(($i+14)%16)`($sp)
+       $LD     $T1,`$stdframe+$SZ*(($i+1)%16)`($sp)    ### $i
+       $LD     $t1,`$stdframe+$SZ*(($i+14)%16)`($sp)
         $ROT    $t0,$T1,$sigma0[0]
         $SHR    $T1,$sigma0[2]
         $ROT    $t2,$t0,`$sigma0[1]-$sigma0[0]`
         xgr     $T1,$t0
         $ROT    $t0,$t1,$sigma1[0]
-       xgr     $T1,$t2                         # sigma0(X[i+1])
+       xgr     $T1,$t2                                 # sigma0(X[i+1])
         $SHR    $t1,$sigma1[2]
-       $ADD    $T1,`160+$SZ*($i%16)`($sp)      # +=X[i]
+       $ADD    $T1,`$stdframe+$SZ*($i%16)`($sp)        # +=X[i]
         xgr     $t1,$t0
         $ROT    $t0,$t0,`$sigma1[1]-$sigma1[0]`
-       $ADD    $T1,`160+$SZ*(($i+9)%16)`($sp)  # +=X[i+9]
+       $ADD    $T1,`$stdframe+$SZ*(($i+9)%16)`($sp)    # +=X[i+9]
         xgr     $t1,$t0                         # sigma1(X[i+14])
         algr    $T1,$t1                         # +=sigma1(X[i+14])
  ___
@@ -212,6 +233,7 @@ $code.=<<___;
  .globl $Func
  .type  $Func,\@function
  $Func:
+       sllg    $len,$len,`log(16*$SZ)/log(2)`
  ___
  $code.=<<___ if ($kimdfunc);
         larl    %r1,OPENSSL_s390xcap_P
@@ -219,15 +241,15 @@ $code.=<<___ if ($kimdfunc);
         tmhl    %r0,0x4000      # check for message-security assist
         jz      .Lsoftware
         lghi    %r0,0
-       la      %r1,16($sp)
+       la      %r1,`2*$SIZE_T`($sp)
         .long   0xb93e0002      # kimd %r0,%r2
-       lg      %r0,16($sp)
+       lg      %r0,`2*$SIZE_T`($sp)
         tmhh    %r0,`0x8000>>$kimdfunc`
         jz      .Lsoftware
         lghi    %r0,$kimdfunc
         lgr     %r1,$ctx
         lgr     %r2,$inp
-       sllg    %r3,$len,`log(16*$SZ)/log(2)`
+       lgr     %r3,$len
         .long   0xb93e0002      # kimd %r0,%r2
         brc     1,.-4           # pay attention to "partial completion"
         br      %r14
@@ -235,13 +257,12 @@ $code.=<<___ if ($kimdfunc);
  .Lsoftware:
  ___
  $code.=<<___;
-       sllg    $len,$len,`log(16*$SZ)/log(2)`
         lghi    %r1,-$frame
-       agr     $len,$inp
-       stmg    $ctx,%r15,16($sp)
+       la      $len,0($len,$inp)
+       stm${g} $ctx,%r15,`2*$SIZE_T`($sp)
         lgr     %r0,$sp
         la      $sp,0(%r1,$sp)
-       stg     %r0,0($sp)
+       st${g}  %r0,0($sp)
  
         larl    $tbl,$Table
         $LD     $A,`0*$SZ`($ctx)
@@ -265,7 +286,7 @@ $code.=<<___;
         clgr    $len,$t0
         jne     .Lrounds_16_xx
  
-       lg      $ctx,`$frame+16`($sp)
+       l${g}   $ctx,`$frame+2*$SIZE_T`($sp)
         la      $inp,`16*$SZ`($inp)
         $ADD    $A,`0*$SZ`($ctx)
         $ADD    $B,`1*$SZ`($ctx)
@@ -283,10 +304,10 @@ $code.=<<___;
         $ST     $F,`5*$SZ`($ctx)
         $ST     $G,`6*$SZ`($ctx)
         $ST     $H,`7*$SZ`($ctx)
-       clg     $inp,`$frame+32`($sp)
+       cl${g}  $inp,`$frame+4*$SIZE_T`($sp)
         jne     .Lloop
  
-       lmg     %r6,%r15,`$frame+48`($sp)       
+       lm${g}  %r6,%r15,`$frame+6*$SIZE_T`($sp)        
         br      %r14
  .size  $Func,.-$Func
  .string        "SHA${label} block transform for s390x, CRYPTOGAMS by <appro\@openssl.org>"
author	Andy Polyakov <appro@openssl.org>
	Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)
committer	Andy Polyakov <appro@openssl.org>
	Mon, 29 Nov 2010 20:52:43 +0000 (20:52 +0000)
Configure		patch \| blob \| history
config		patch \| blob \| history
crypto/aes/asm/aes-s390x.pl		patch \| blob \| history
crypto/bn/asm/s390x-mont.pl		patch \| blob \| history
crypto/modes/asm/ghash-s390x.pl		patch \| blob \| history
crypto/rc4/asm/rc4-s390x.pl		patch \| blob \| history
crypto/s390xcpuid.S		patch \| blob \| history
crypto/sha/asm/sha1-s390x.pl		patch \| blob \| history
crypto/sha/asm/sha512-s390x.pl		patch \| blob \| history