X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fsparcv9a-mont.pl;h=8e22a443b7f3a7014da0e89853f58e7031ea94ef;hp=1899ecb3a65812435c13419b73124323d1e89edb;hb=98939a05b6884538ba40fae2606291140f9e5839;hpb=2e21922eb6e0157fb07f4c464679d1b76b7ede5e;ds=sidebyside

diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl
index 1899ecb3a6..8e22a443b7 100755
--- a/crypto/bn/asm/sparcv9a-mont.pl
+++ b/crypto/bn/asm/sparcv9a-mont.pl
@@ -18,8 +18,8 @@
 # implementations from compatibility matrix. But the rest, whole Sun
 # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support
 # VIS extension instructions used in this module. This is considered
-# good enough to recommend HAL SPARC64 users [if any] to simply fall
-# down to no-asm configuration.
+# good enough to not care about HAL SPARC64 users [if any] who have
+# an integer-only pure SPARCv9 module to "fall down" to.
 
 # USI&II cores currently exhibit uniform 2x improvement [over pre-
 # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII
@@ -31,7 +31,7 @@
 # is pipelined, which in turn *might* be impossible to match... On
 # additional note, SPARC64 V implements FP Multiply-Add instruction,
 # which is perfectly usable in this context... In other words, as far
-# as HAL/Fujitsu SPARC64 family goes, talk to the author:-)
+# as Fujitsu SPARC64 V goes, talk to the author:-)
 
 # The implementation implies following "non-natural" limitations on
 # input arguments:
@@ -455,13 +455,18 @@ $fname:
 	add	$tp,8,$tp
 
 .L1stskip:
+	fdtox	$dota,$dota
+	fdtox	$dotb,$dotb
+
 	ldx	[%sp+$bias+$frame+0],%o0
 	ldx	[%sp+$bias+$frame+8],%o1
 	ldx	[%sp+$bias+$frame+16],%o2
 	ldx	[%sp+$bias+$frame+24],%o3
 
 	srlx	%o0,16,%o7
+	std	$dota,[%sp+$bias+$frame+32]
 	add	%o7,%o1,%o1
+	std	$dotb,[%sp+$bias+$frame+40]
 	srlx	%o1,16,%o7
 	add	%o7,%o2,%o2
 	srlx	%o2,16,%o7
@@ -475,33 +480,28 @@ $fname:
 	or	%o1,%o0,%o0
 	or	%o2,%o0,%o0
 	or	%o7,%o0,%o0		! 64-bit result
+	ldx	[%sp+$bias+$frame+32],%o4
 	addcc	%g1,%o0,%o0
+	ldx	[%sp+$bias+$frame+40],%o5
 	srlx	%o3,16,%g1		! 34-bit carry
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 
 	stx	%o0,[$tp]		! tp[j-1]=
 	add	$tp,8,$tp
-
-	fdtox	$dota,$dota
-	fdtox	$dotb,$dotb
-	std	$dota,[%sp+$bias+$frame+32]
-	std	$dotb,[%sp+$bias+$frame+40]
-	ldx	[%sp+$bias+$frame+32],%o0
-	ldx	[%sp+$bias+$frame+40],%o1
 
-	srlx	%o0,16,%o7
-	add	%o7,%o1,%o1
-	and	%o0,$mask,%o0
-	sllx	%o1,16,%o7
-	or	%o7,%o0,%o0
-	addcc	%g1,%o0,%o0
-	srlx	%o1,48,%g1
+	srlx	%o4,16,%o7
+	add	%o7,%o5,%o5
+	and	%o4,$mask,%o4
+	sllx	%o5,16,%o7
+	or	%o7,%o4,%o4
+	addcc	%g1,%o4,%o4
+	srlx	%o5,48,%g1
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
 
 	mov	%g1,$carry
-	stx	%o0,[$tp]		! tp[num-1]=
+	stx	%o4,[$tp]		! tp[num-1]=
 
 	ba	.Louter
 	add	$i,8,$i
@@ -664,7 +664,9 @@ $fname:
 	bz,pn	%icc,.Linnerskip
 	std	$nlod,[%sp+$bias+$frame+24]
 
-.align	32,0x1000000
+	ba	.Linner
+	nop
+.align	32
 .Linner:
 	ldd	[$ap_l+$j],$alo		! load a[j] in double format
 	ldd	[$ap_h+$j],$ahi
@@ -719,12 +721,12 @@ $fname:
 	or	%o7,%o0,%o0		! 64-bit result
 		faddd	$nloc,$nhia,$nloc
 	addcc	%g1,%o0,%o0
+	ldx	[$tp+8],%o7		! tp[j]
 		faddd	$nlod,$nhib,$nlod
 	srlx	%o3,16,%g1		! 34-bit carry
 		fdtox	$nloa,$nloa
 	bcs,a	%xcc,.+8
 	add	%g1,1,%g1
-	ldx	[$tp+8],%o7		! tp[j]
 		fdtox	$nlob,$nlob
 	addcc	%o7,%o0,%o0
 		fdtox	$nloc,$nloc