#
# sha256/512_block procedure for x86_64.
#
-# 40% improvement over compiler-generated code on Opteron. No magical
+# 40% improvement over compiler-generated code on Opteron. On EM64T
+# sha256 was observed to run >80% faster and sha512 - >40%. No magical
# tricks, just straight implementation... I really wonder why gcc
# [being armed with inline assembler] fails to generate as fast code.
# The only thing which is cool about this module is that it's very
#
# Special note on Intel EM64T. While Opteron CPU exhibits perfect
# perfromance ratio of 1.5 between 64- and 32-bit flavors [see above],
-# [currently available] EM64T CPUs apparently are far from it. 64-bit
-# version, sha512_block, is hardly faster than 32-bit one. This is
-# presumably because 64-bit shifts/rotates apparently are not atomic
-# instructions, but implemented in microcode.
+# [currently available] EM64T CPUs apparently are far from it. On the
+# contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
+# sha256_block:-( This is presumably because 64-bit shifts/rotates
+# apparently are not atomic instructions, but implemented in microcode.
$output=shift;
-open STDOUT,"| $^X ../perlasm/x86_64-xlate.pl $output";
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
+die "can't locate x86_64-xlate.pl";
+
+open STDOUT,"| $^X $xlate $output";
if ($output =~ /512/) {
- $func="sha512_block";
+ $func="sha512_block_data_order";
$TABLE="K512";
$SZ=8;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
@sigma1=(19,61, 6);
$rounds=80;
} else {
- $func="sha256_block";
+ $func="sha256_block_data_order";
$TABLE="K256";
$SZ=4;
@ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
$_ctx="16*$SZ+0*8(%rsp)";
$_inp="16*$SZ+1*8(%rsp)";
$_end="16*$SZ+2*8(%rsp)";
-$_ord="16*$SZ+3*8(%rsp)";
-$_rsp="16*$SZ+4*8(%rsp)";
-$framesz="16*$SZ+5*8";
+$_rsp="16*$SZ+3*8(%rsp)";
+$framesz="16*$SZ+4*8";
sub ROUND_00_15()
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arh
mov %rdx,$_end # save end pointer, "3rd" arg
- mov %ecx,$_ord # save host, 4th arg
mov %rbp,$_rsp # save copy of %rsp
.picmeup $Tbl
.Lloop:
xor $round,$round
___
-if ($SZ==4) {
-$code.=<<___;
- cmpl \$0,$_ord
- je .Ldata_order
-.align 16
-.Lhost_order:
-___
-
- for($i=0;$i<16;$i++) {
- $code.=" mov $SZ*$i($inp),$T1\n";
- &ROUND_00_15($i,@ROT);
- unshift(@ROT,pop(@ROT));
- }
-$code.=<<___;
- jmp .Lrounds_16_xx
-.align 16
-.Ldata_order:
-___
-} # 256
for($i=0;$i<16;$i++) {
$code.=" mov $SZ*$i($inp),$T1\n";
$code.=" bswap $T1\n";