Make x86_64 modules work under Win64/x64.

[openssl.git] / crypto / sha / asm / sha512-x86_64.pl
diff --git a/crypto/sha/asm/sha512-x86_64.pl b/crypto/sha/asm/sha512-x86_64.pl

index 28937de0facfa8cc7723c5fbb57a50eecc034c45..b6252d31eca20192099be564471f91fb06eea6d3 100755 (executable)
--- a/crypto/sha/asm/sha512-x86_64.pl
+++ b/crypto/sha/asm/sha512-x86_64.pl
@@ -8,7 +8,8 @@
  #
  # sha256/512_block procedure for x86_64.
  #
-# 40% improvement over compiler-generated code on Opteron. No magical
+# 40% improvement over compiler-generated code on Opteron. On EM64T
+# sha256 was observed to run >80% faster and sha512 - >40%. No magical
  # tricks, just straight implementation... I really wonder why gcc
  # [being armed with inline assembler] fails to generate as fast code.
  # The only thing which is cool about this module is that it's very
@@ -34,16 +35,22 @@
  #
  # Special note on Intel EM64T. While Opteron CPU exhibits perfect
  # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
-# [currently available] EM64T CPUs apparently are far from it. 64-bit
-# version, sha512_block, is hardly faster than 32-bit one. This is
-# presumably because 64-bit shifts/rotates apparently are not atomic
-# instructions, but implemented in microcode.
+# [currently available] EM64T CPUs apparently are far from it. On the
+# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+# sha256_block:-( This is presumably because 64-bit shifts/rotates
+# apparently are not atomic instructions, but implemented in microcode.
  
  $output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
  
  if ($output =~ /512/) {
-       $func="sha512_block";
+       $func="sha512_block_data_order";
         $TABLE="K512";
         $SZ=8;
         @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
@@ -55,7 +62,7 @@ if ($output =~ /512/) {
         @sigma1=(19,61, 6);
         $rounds=80;
  } else {
-       $func="sha256_block";
+       $func="sha256_block_data_order";
         $TABLE="K256";
         $SZ=4;
         @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
@@ -76,9 +83,8 @@ $Tbl="%rbp";
  $_ctx="16*$SZ+0*8(%rsp)";
  $_inp="16*$SZ+1*8(%rsp)";
  $_end="16*$SZ+2*8(%rsp)";
-$_ord="16*$SZ+3*8(%rsp)";
-$_rsp="16*$SZ+4*8(%rsp)";
-$framesz="16*$SZ+5*8";
+$_rsp="16*$SZ+3*8(%rsp)";
+$framesz="16*$SZ+4*8";
  
  
  sub ROUND_00_15()
@@ -188,7 +194,6 @@ $func:
         mov     $ctx,$_ctx              # save ctx, 1st arg
         mov     $inp,$_inp              # save inp, 2nd arh
         mov     %rdx,$_end              # save end pointer, "3rd" arg
-       mov     %ecx,$_ord              # save host, 4th arg
         mov     %rbp,$_rsp              # save copy of %rsp
  
         .picmeup $Tbl
@@ -208,25 +213,6 @@ $func:
  .Lloop:
         xor     $round,$round
  ___
-if ($SZ==4) {
-$code.=<<___;
-       cmpl    \$0,$_ord
-       je      .Ldata_order
-.align 16
-.Lhost_order:
-___
-
-       for($i=0;$i<16;$i++) {
-               $code.="        mov     $SZ*$i($inp),$T1\n";
-               &ROUND_00_15($i,@ROT);
-               unshift(@ROT,pop(@ROT));
-       }
-$code.=<<___;
-       jmp     .Lrounds_16_xx
-.align 16
-.Ldata_order:
-___
-} # 256
         for($i=0;$i<16;$i++) {
                 $code.="        mov     $SZ*$i($inp),$T1\n";
                 $code.="        bswap   $T1\n";