From: Andy Polyakov <appro@openssl.org>
Date: Mon, 21 Jul 2014 13:29:09 +0000 (+0200)
Subject: sha1-ppc.pl: shave off one cycle from BODY_20_39
X-Git-Tag: master-post-reformat~563
X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=commitdiff_plain;h=5c3598307ebbf5a88d1c39fbb2629536e443a5dd

sha1-ppc.pl: shave off one cycle from BODY_20_39
and improve performance by 10% on POWER[78].

Reviewed-by: Kurt Roeckx <kurt@openssl.org>
---

diff --git a/crypto/sha/asm/sha1-ppc.pl b/crypto/sha/asm/sha1-ppc.pl
index 24a5d065d9..df5989610c 100755
--- a/crypto/sha/asm/sha1-ppc.pl
+++ b/crypto/sha/asm/sha1-ppc.pl
@@ -125,31 +125,31 @@ my ($i,$a,$b,$c,$d,$e,$f)=@_;
 my $j=$i+1;
 $code.=<<___ if ($i<79);
 	add	$f,$K,$e
+	xor	$t0,$b,$d
 	rotlwi	$e,$a,5
 	xor	@X[$j%16],@X[$j%16],@X[($j+2)%16]
 	add	$f,$f,@X[$i%16]
-	xor	$t0,$b,$c
+	xor	$t0,$t0,$c
 	xor	@X[$j%16],@X[$j%16],@X[($j+8)%16]
-	add	$f,$f,$e
+	add	$f,$f,$t0
 	rotlwi	$b,$b,30
-	xor	$t0,$t0,$d
 	xor	@X[$j%16],@X[$j%16],@X[($j+13)%16]
-	add	$f,$f,$t0
+	add	$f,$f,$e
 	rotlwi	@X[$j%16],@X[$j%16],1
 ___
 $code.=<<___ if ($i==79);
 	add	$f,$K,$e
+	xor	$t0,$b,$d
 	rotlwi	$e,$a,5
 	lwz	r16,0($ctx)
 	add	$f,$f,@X[$i%16]
-	xor	$t0,$b,$c
+	xor	$t0,$t0,$c
 	lwz	r17,4($ctx)
-	add	$f,$f,$e
+	add	$f,$f,$t0
 	rotlwi	$b,$b,30
 	lwz	r18,8($ctx)
-	xor	$t0,$t0,$d
 	lwz	r19,12($ctx)
-	add	$f,$f,$t0
+	add	$f,$f,$e
 	lwz	r20,16($ctx)
 ___
 }
diff --git a/crypto/sha/asm/sha512p8-ppc.pl b/crypto/sha/asm/sha512p8-ppc.pl
index cb0268c9d6..a316b31a4f 100755
--- a/crypto/sha/asm/sha512p8-ppc.pl
+++ b/crypto/sha/asm/sha512p8-ppc.pl
@@ -13,8 +13,8 @@
 # always virtualized setup with possibly throttled processor.
 # Relative comparison is therefore more informative. This module is
 # ~60% faster than integer-only sha512-ppc.pl. To anchor to something
-# else, SHA256 is 16% slower than sha1-ppc.pl and 2.5x slower than
-# hardware-assisted aes-128-cbc encrypt. SHA512 is 33% faster than
+# else, SHA256 is 24% slower than sha1-ppc.pl and 2.5x slower than
+# hardware-assisted aes-128-cbc encrypt. SHA512 is 20% faster than
 # sha1-ppc.pl and 1.6x slower than aes-128-cbc. Another interesting
 # result is degree of computational resources' utilization. POWER8 is
 # "massively multi-threaded chip" and difference between single- and