X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fec%2Fasm%2Fecp_nistz256-sparcv9.pl;h=97201cb271b55e5e1a2be603db22b8a80b78dad5;hp=3c4b0711fc3371e3e6b0d9fc5c1996b8630bb8ad;hb=c74aea8d6ccdf07ce826a9451887739b8aa64096;hpb=085b3860651e2ff55e28f8a28a1f66b1a3fe538f diff --git a/crypto/ec/asm/ecp_nistz256-sparcv9.pl b/crypto/ec/asm/ecp_nistz256-sparcv9.pl index 3c4b0711fc..97201cb271 100755 --- a/crypto/ec/asm/ecp_nistz256-sparcv9.pl +++ b/crypto/ec/asm/ecp_nistz256-sparcv9.pl @@ -1,4 +1,11 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the OpenSSL license (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== # Written by Andy Polyakov for the OpenSSL @@ -94,6 +101,7 @@ my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1"); my ($rp_real,$ap_real)=("%g2","%g3"); $code.=<<___; +.type ecp_nistz256_precomputed,#object .size ecp_nistz256_precomputed,.-ecp_nistz256_precomputed .align 64 .LRR: ! 2^512 mod P precomputed for NIST P256 polynomial @@ -115,6 +123,7 @@ ecp_nistz256_to_mont: nop ret restore +.type ecp_nistz256_to_mont,#function .size ecp_nistz256_to_mont,.-ecp_nistz256_to_mont ! void ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]); @@ -129,6 +138,7 @@ ecp_nistz256_from_mont: nop ret restore +.type ecp_nistz256_from_mont,#function .size ecp_nistz256_from_mont,.-ecp_nistz256_from_mont ! void ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8], @@ -142,6 +152,7 @@ ecp_nistz256_mul_mont: nop ret restore +.type ecp_nistz256_mul_mont,#function .size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont ! void ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i2[8]); @@ -154,6 +165,7 @@ ecp_nistz256_sqr_mont: nop ret restore +.type ecp_nistz256_sqr_mont,#function .size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont ___ @@ -353,6 +365,7 @@ $code.=<<___; st @acc[6],[$rp+24] retl st @acc[7],[$rp+28] +.type __ecp_nistz256_mul_mont,#function .size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont ! void ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8], @@ -372,6 +385,7 @@ ecp_nistz256_add: ld [$ap+28],@acc[7] ret restore +.type ecp_nistz256_add,#function .size ecp_nistz256_add,.-ecp_nistz256_add .align 32 @@ -392,36 +406,48 @@ __ecp_nistz256_add: addccc @acc[5],$t5,@acc[5] addccc @acc[6],$t6,@acc[6] addccc @acc[7],$t7,@acc[7] - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry .Lreduce_by_sub: - ! if a+b carries, subtract modulus. + ! if a+b >= modulus, subtract modulus. ! + ! But since comparison implies subtraction, we subtract + ! modulus and then add it back if subraction borrowed. + + subcc @acc[0],-1,@acc[0] + subccc @acc[1],-1,@acc[1] + subccc @acc[2],-1,@acc[2] + subccc @acc[3], 0,@acc[3] + subccc @acc[4], 0,@acc[4] + subccc @acc[5], 0,@acc[5] + subccc @acc[6], 1,@acc[6] + subccc @acc[7],-1,@acc[7] + subc $carry,0,$carry + ! Note that because mod has special form, i.e. consists of ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by - ! using value of broadcasted borrow and the borrow bit itself. - ! To minimize dependency chain we first broadcast and then - ! extract the bit by negating (follow $bi). + ! using value of borrow and its negative. - subcc @acc[0],$carry,@acc[0] ! 
subtract synthesized modulus - subccc @acc[1],$carry,@acc[1] + addcc @acc[0],$carry,@acc[0] ! add synthesized modulus + addccc @acc[1],$carry,@acc[1] neg $carry,$bi st @acc[0],[$rp] - subccc @acc[2],$carry,@acc[2] + addccc @acc[2],$carry,@acc[2] st @acc[1],[$rp+4] - subccc @acc[3],0,@acc[3] + addccc @acc[3],0,@acc[3] st @acc[2],[$rp+8] - subccc @acc[4],0,@acc[4] + addccc @acc[4],0,@acc[4] st @acc[3],[$rp+12] - subccc @acc[5],0,@acc[5] + addccc @acc[5],0,@acc[5] st @acc[4],[$rp+16] - subccc @acc[6],$bi,@acc[6] + addccc @acc[6],$bi,@acc[6] st @acc[5],[$rp+20] - subc @acc[7],$carry,@acc[7] + addc @acc[7],$carry,@acc[7] st @acc[6],[$rp+24] retl st @acc[7],[$rp+28] +.type __ecp_nistz256_add,#function .size __ecp_nistz256_add,.-__ecp_nistz256_add ! void ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); @@ -440,6 +466,7 @@ ecp_nistz256_mul_by_2: ld [$ap+28],@acc[7] ret restore +.type ecp_nistz256_mul_by_2,#function .size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 .align 32 @@ -453,7 +480,8 @@ __ecp_nistz256_mul_by_2: addccc @acc[6],@acc[6],@acc[6] addccc @acc[7],@acc[7],@acc[7] b .Lreduce_by_sub - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_2,#function .size __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2 ! void ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]); @@ -472,6 +500,7 @@ ecp_nistz256_mul_by_3: ld [$ap+28],@acc[7] ret restore +.type ecp_nistz256_mul_by_3,#function .size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 .align 32 @@ -484,17 +513,27 @@ __ecp_nistz256_mul_by_3: addccc @acc[5],@acc[5],$t5 addccc @acc[6],@acc[6],$t6 addccc @acc[7],@acc[7],$t7 - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry - subcc $t0,$carry,$t0 ! .Lreduce_by_sub but without stores + subcc $t0,-1,$t0 ! .Lreduce_by_sub but without stores + subccc $t1,-1,$t1 + subccc $t2,-1,$t2 + subccc $t3, 0,$t3 + subccc $t4, 0,$t4 + subccc $t5, 0,$t5 + subccc $t6, 1,$t6 + subccc $t7,-1,$t7 + subc $carry,0,$carry + + addcc $t0,$carry,$t0 ! add synthesized modulus + addccc $t1,$carry,$t1 neg $carry,$bi - subccc $t1,$carry,$t1 - subccc $t2,$carry,$t2 - subccc $t3,0,$t3 - subccc $t4,0,$t4 - subccc $t5,0,$t5 - subccc $t6,$bi,$t6 - subc $t7,$carry,$t7 + addccc $t2,$carry,$t2 + addccc $t3,0,$t3 + addccc $t4,0,$t4 + addccc $t5,0,$t5 + addccc $t6,$bi,$t6 + addc $t7,$carry,$t7 addcc $t0,@acc[0],@acc[0] ! 2*a+a=3*a addccc $t1,@acc[1],@acc[1] @@ -505,7 +544,8 @@ __ecp_nistz256_mul_by_3: addccc $t6,@acc[6],@acc[6] addccc $t7,@acc[7],@acc[7] b .Lreduce_by_sub - subc %g0,%g0,$carry ! broadcast carry bit + addc %g0,%g0,$carry +.type __ecp_nistz256_mul_by_3,#function .size __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3 ! void ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8], @@ -525,6 +565,7 @@ ecp_nistz256_sub: ld [$ap+28],@acc[7] ret restore +.type ecp_nistz256_sub,#function .size ecp_nistz256_sub,.-ecp_nistz256_sub ! void ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]); @@ -544,6 +585,7 @@ ecp_nistz256_neg: mov 0,@acc[7] ret restore +.type ecp_nistz256_neg,#function .size ecp_nistz256_neg,.-ecp_nistz256_neg .align 32 @@ -594,6 +636,7 @@ __ecp_nistz256_sub_from: st @acc[6],[$rp+24] retl st @acc[7],[$rp+28] +.type __ecp_nistz256_sub_from,#function .size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from .align 32 @@ -616,6 +659,7 @@ __ecp_nistz256_sub_morf: subccc $t7,@acc[7],@acc[7] b .Lreduce_by_add subc %g0,%g0,$carry ! 
broadcast borrow bit +.type __ecp_nistz256_sub_morf,#function .size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf ! void ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]); @@ -634,6 +678,7 @@ ecp_nistz256_div_by_2: ld [$ap+28],@acc[7] ret restore +.type ecp_nistz256_div_by_2,#function .size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 .align 32 @@ -687,6 +732,7 @@ __ecp_nistz256_div_by_2: st @acc[6],[$rp+24] retl st @acc[7],[$rp+28] +.type __ecp_nistz256_div_by_2,#function .size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 ___ @@ -816,6 +862,7 @@ ecp_nistz256_point_double: ret restore +.type ecp_nistz256_point_double,#function .size ecp_nistz256_point_double,.-ecp_nistz256_point_double ___ } @@ -852,71 +899,39 @@ ecp_nistz256_point_add: mov $ap,$ap_real mov $bp,$bp_real - ld [$bp],@acc[0] ! in2_x - ld [$bp+4],@acc[1] - ld [$bp+8],@acc[2] - ld [$bp+12],@acc[3] - ld [$bp+16],@acc[4] - ld [$bp+20],@acc[5] - ld [$bp+24],@acc[6] - ld [$bp+28],@acc[7] - ld [$bp+32],$t0 ! in2_y - ld [$bp+32+4],$t1 - ld [$bp+32+8],$t2 - ld [$bp+32+12],$t3 - ld [$bp+32+16],$t4 - ld [$bp+32+20],$t5 - ld [$bp+32+24],$t6 - ld [$bp+32+28],$t7 - or @acc[1],@acc[0],@acc[0] - or @acc[3],@acc[2],@acc[2] - or @acc[5],@acc[4],@acc[4] - or @acc[7],@acc[6],@acc[6] - or @acc[2],@acc[0],@acc[0] - or @acc[6],@acc[4],@acc[4] - or @acc[4],@acc[0],@acc[0] + ld [$bp+64],$t0 ! in2_z + ld [$bp+64+4],$t1 + ld [$bp+64+8],$t2 + ld [$bp+64+12],$t3 + ld [$bp+64+16],$t4 + ld [$bp+64+20],$t5 + ld [$bp+64+24],$t6 + ld [$bp+64+28],$t7 or $t1,$t0,$t0 or $t3,$t2,$t2 or $t5,$t4,$t4 or $t7,$t6,$t6 or $t2,$t0,$t0 or $t6,$t4,$t4 - or $t4,$t0,$t0 - or @acc[0],$t0,$t0 ! !in2infty + or $t4,$t0,$t0 ! !in2infty movrnz $t0,-1,$t0 st $t0,[%fp+STACK_BIAS-12] - ld [$ap],@acc[0] ! in1_x - ld [$ap+4],@acc[1] - ld [$ap+8],@acc[2] - ld [$ap+12],@acc[3] - ld [$ap+16],@acc[4] - ld [$ap+20],@acc[5] - ld [$ap+24],@acc[6] - ld [$ap+28],@acc[7] - ld [$ap+32],$t0 ! in1_y - ld [$ap+32+4],$t1 - ld [$ap+32+8],$t2 - ld [$ap+32+12],$t3 - ld [$ap+32+16],$t4 - ld [$ap+32+20],$t5 - ld [$ap+32+24],$t6 - ld [$ap+32+28],$t7 - or @acc[1],@acc[0],@acc[0] - or @acc[3],@acc[2],@acc[2] - or @acc[5],@acc[4],@acc[4] - or @acc[7],@acc[6],@acc[6] - or @acc[2],@acc[0],@acc[0] - or @acc[6],@acc[4],@acc[4] - or @acc[4],@acc[0],@acc[0] + ld [$ap+64],$t0 ! in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 or $t1,$t0,$t0 or $t3,$t2,$t2 or $t5,$t4,$t4 or $t7,$t6,$t6 or $t2,$t0,$t0 or $t6,$t4,$t4 - or $t4,$t0,$t0 - or @acc[0],$t0,$t0 ! !in1infty + or $t4,$t0,$t0 ! !in1infty movrnz $t0,-1,$t0 st $t0,[%fp+STACK_BIAS-16] @@ -1118,6 +1133,7 @@ $code.=<<___; .Ladd_done: ret restore +.type ecp_nistz256_point_add,#function .size ecp_nistz256_point_add,.-ecp_nistz256_point_add ___ } @@ -1153,37 +1169,21 @@ ecp_nistz256_point_add_affine: mov $ap,$ap_real mov $bp,$bp_real - ld [$ap],@acc[0] ! in1_x - ld [$ap+4],@acc[1] - ld [$ap+8],@acc[2] - ld [$ap+12],@acc[3] - ld [$ap+16],@acc[4] - ld [$ap+20],@acc[5] - ld [$ap+24],@acc[6] - ld [$ap+28],@acc[7] - ld [$ap+32],$t0 ! in1_y - ld [$ap+32+4],$t1 - ld [$ap+32+8],$t2 - ld [$ap+32+12],$t3 - ld [$ap+32+16],$t4 - ld [$ap+32+20],$t5 - ld [$ap+32+24],$t6 - ld [$ap+32+28],$t7 - or @acc[1],@acc[0],@acc[0] - or @acc[3],@acc[2],@acc[2] - or @acc[5],@acc[4],@acc[4] - or @acc[7],@acc[6],@acc[6] - or @acc[2],@acc[0],@acc[0] - or @acc[6],@acc[4],@acc[4] - or @acc[4],@acc[0],@acc[0] + ld [$ap+64],$t0 ! 
in1_z + ld [$ap+64+4],$t1 + ld [$ap+64+8],$t2 + ld [$ap+64+12],$t3 + ld [$ap+64+16],$t4 + ld [$ap+64+20],$t5 + ld [$ap+64+24],$t6 + ld [$ap+64+28],$t7 or $t1,$t0,$t0 or $t3,$t2,$t2 or $t5,$t4,$t4 or $t7,$t6,$t6 or $t2,$t0,$t0 or $t6,$t4,$t4 - or $t4,$t0,$t0 - or @acc[0],$t0,$t0 ! !in1infty + or $t4,$t0,$t0 ! !in1infty movrnz $t0,-1,$t0 st $t0,[%fp+STACK_BIAS-16] @@ -1341,6 +1341,7 @@ ___ $code.=<<___; ret restore +.type ecp_nistz256_point_add_affine,#function .size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine ___ } }}} @@ -1416,6 +1417,7 @@ ecp_nistz256_scatter_w5: ret restore +.type ecp_nistz256_scatter_w5,#function .size ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5 ! void ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1, @@ -1513,6 +1515,7 @@ ecp_nistz256_gather_w5: ret restore +.type ecp_nistz256_gather_w5,#function .size ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5 ! void ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1, @@ -1540,6 +1543,7 @@ ecp_nistz256_scatter_w7: ret restore +.type ecp_nistz256_scatter_w7,#function .size ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7 ! void ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1, @@ -1580,6 +1584,7 @@ ecp_nistz256_gather_w7: ret restore +.type ecp_nistz256_gather_w7,#function .size ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7 ___ }}} @@ -1607,6 +1612,7 @@ __ecp_nistz256_mul_by_2_vis3: addxccc $acc3,$acc3,$acc3 b .Lreduce_by_sub_vis3 addxc %g0,%g0,$acc4 ! did it carry? +.type __ecp_nistz256_mul_by_2_vis3,#function .size __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3 .align 32 @@ -1629,17 +1635,19 @@ __ecp_nistz256_add_noload_vis3: addcc $acc0,1,$t0 ! add -modulus, i.e. subtract addxccc $acc1,$poly1,$t1 addxccc $acc2,$minus1,$t2 - addxc $acc3,$poly3,$t3 + addxccc $acc3,$poly3,$t3 + addxc $acc4,$minus1,$acc4 - movrnz $acc4,$t0,$acc0 ! if a+b carried, ret = ret-mod - movrnz $acc4,$t1,$acc1 + movrz $acc4,$t0,$acc0 ! ret = borrow ? ret : ret-modulus + movrz $acc4,$t1,$acc1 stx $acc0,[$rp] - movrnz $acc4,$t2,$acc2 + movrz $acc4,$t2,$acc2 stx $acc1,[$rp+8] - movrnz $acc4,$t3,$acc3 + movrz $acc4,$t3,$acc3 stx $acc2,[$rp+16] retl stx $acc3,[$rp+24] +.type __ecp_nistz256_add_vis3,#function .size __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3 ! Trouble with subtraction is that there is no subtraction with 64-bit @@ -1686,6 +1694,7 @@ __ecp_nistz256_sub_from_vis3: subc %g0,%g0,$acc4 ! did it borrow? b .Lreduce_by_add_vis3 or $acc3,$acc5,$acc3 +.type __ecp_nistz256_sub_from_vis3,#function .size __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3 .align 32 @@ -1744,6 +1753,7 @@ __ecp_nistz256_sub_morf_vis3: stx $acc2,[$rp+16] retl stx $acc3,[$rp+24] +.type __ecp_nistz256_sub_morf_vis3,#function .size __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3 .align 32 @@ -1784,6 +1794,7 @@ __ecp_nistz256_div_by_2_vis3: stx $acc2,[$rp+16] retl stx $acc3,[$rp+24] +.type __ecp_nistz256_div_by_2_vis3,#function .size __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3 ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and @@ -1881,6 +1892,7 @@ $code.=<<___; addxccc $acc4,$t3,$acc3 b .Lmul_final_vis3 ! see below addxc $acc5,%g0,$acc4 +.type __ecp_nistz256_mul_mont_vis3,#function .size __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3 ! 
compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less @@ -2005,6 +2017,7 @@ $code.=<<___; stx $acc2,[$rp+16] retl stx $acc3,[$rp+24] +.type __ecp_nistz256_sqr_mont_vis3,#function .size __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3 ___ @@ -2268,6 +2281,7 @@ ecp_nistz256_point_double_vis3: ret restore +.type ecp_nistz256_point_double_vis3,#function .size ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3 ___ } @@ -2340,16 +2354,6 @@ ecp_nistz256_point_add_vis3: stx $acc2,[%sp+LOCALS64+$in2_y+16] stx $acc3,[%sp+LOCALS64+$in2_y+24] - or $a1,$a0,$a0 - or $a3,$a2,$a2 - or $acc1,$acc0,$acc0 - or $acc3,$acc2,$acc2 - or $a2,$a0,$a0 - or $acc2,$acc0,$acc0 - or $acc0,$a0,$a0 - movrnz $a0,-1,$a0 ! !in2infty - stx $a0,[%fp+STACK_BIAS-8] - ld [$bp+64],$acc0 ! in2_z ld [$bp+64+4],$t0 ld [$bp+64+8],$acc1 @@ -2383,6 +2387,12 @@ ecp_nistz256_point_add_vis3: stx $acc2,[%sp+LOCALS64+$in2_z+16] stx $acc3,[%sp+LOCALS64+$in2_z+24] + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in2infty + stx $acc0,[%fp+STACK_BIAS-8] + or $a0,$t0,$a0 ld [$ap+32],$acc0 ! in1_y or $a1,$t1,$a1 @@ -2412,16 +2422,6 @@ ecp_nistz256_point_add_vis3: stx $acc2,[%sp+LOCALS64+$in1_y+16] stx $acc3,[%sp+LOCALS64+$in1_y+24] - or $a1,$a0,$a0 - or $a3,$a2,$a2 - or $acc1,$acc0,$acc0 - or $acc3,$acc2,$acc2 - or $a2,$a0,$a0 - or $acc2,$acc0,$acc0 - or $acc0,$a0,$a0 - movrnz $a0,-1,$a0 ! !in1infty - stx $a0,[%fp+STACK_BIAS-16] - ldx [%sp+LOCALS64+$in2_z],$a0 ! forward load ldx [%sp+LOCALS64+$in2_z+8],$a1 ldx [%sp+LOCALS64+$in2_z+16],$a2 @@ -2448,6 +2448,12 @@ ecp_nistz256_point_add_vis3: stx $acc2,[%sp+LOCALS64+$in1_z+16] stx $acc3,[%sp+LOCALS64+$in1_z+24] + or $acc1,$acc0,$acc0 + or $acc3,$acc2,$acc2 + or $acc2,$acc0,$acc0 + movrnz $acc0,-1,$acc0 ! !in1infty + stx $acc0,[%fp+STACK_BIAS-16] + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z2sqr, in2_z); add %sp,LOCALS64+$Z2sqr,$rp @@ -2688,6 +2694,7 @@ $code.=<<___; .Ladd_done_vis3: ret restore +.type ecp_nistz256_point_add_vis3,#function .size ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3 ___ } @@ -2808,16 +2815,6 @@ ecp_nistz256_point_add_affine_vis3: stx $acc2,[%sp+LOCALS64+$in1_y+16] stx $acc3,[%sp+LOCALS64+$in1_y+24] - or $a1,$a0,$a0 - or $a3,$a2,$a2 - or $acc1,$acc0,$acc0 - or $acc3,$acc2,$acc2 - or $a2,$a0,$a0 - or $acc2,$acc0,$acc0 - or $acc0,$a0,$a0 - movrnz $a0,-1,$a0 ! !in1infty - stx $a0,[%fp+STACK_BIAS-16] - ld [$ap+64],$a0 ! in1_z ld [$ap+64+4],$t0 ld [$ap+64+8],$a1 @@ -2839,6 +2836,12 @@ ecp_nistz256_point_add_affine_vis3: stx $a2,[%sp+LOCALS64+$in1_z+16] stx $a3,[%sp+LOCALS64+$in1_z+24] + or $a1,$a0,$t0 + or $a3,$a2,$t2 + or $t2,$t0,$t0 + movrnz $t0,-1,$t0 ! !in1infty + stx $t0,[%fp+STACK_BIAS-16] + call __ecp_nistz256_sqr_mont_vis3 ! p256_sqr_mont(Z1sqr, in1_z); add %sp,LOCALS64+$Z1sqr,$rp @@ -3006,6 +3009,7 @@ ___ $code.=<<___; ret restore +.type ecp_nistz256_point_add_affine_vis3,#function .size ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3 .align 64 .Lone_mont_vis3:
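
For readers following the .Lreduce_by_sub change in this patch: below is a minimal C sketch, in plain reference code, of the reduction strategy the patch switches to — add the two inputs, unconditionally subtract the NIST P-256 modulus, and add the modulus back (synthesized from the borrow mask, which the special form of the modulus permits) if the subtraction borrowed past the carry bit. The function name p256_add_ref and the P256 array are illustrative only and are not part of the OpenSSL API; this is a sketch of the technique under the same assumption the assembly makes, namely that both inputs are already fully reduced.

/*
 * Reference sketch of the .Lreduce_by_sub approach: r = (a + b) mod P256.
 * Assumes a, b < P256.  Names are illustrative, not OpenSSL's.
 */
#include <stdint.h>

/* NIST P-256 modulus as eight 32-bit words, least significant word first. */
static const uint32_t P256[8] = {
    0xffffffff, 0xffffffff, 0xffffffff, 0x00000000,
    0x00000000, 0x00000000, 0x00000001, 0xffffffff
};

static void p256_add_ref(uint32_t r[8], const uint32_t a[8], const uint32_t b[8])
{
    uint32_t tmp[8], synth[8], carry, borrow, mask;
    uint64_t t = 0, d;
    int i;

    /* tmp = a + b, keeping the carry out of the top word. */
    for (i = 0; i < 8; i++) {
        t += (uint64_t)a[i] + b[i];
        tmp[i] = (uint32_t)t;
        t >>= 32;
    }
    carry = (uint32_t)t;                    /* 0 or 1 */

    /* tmp -= P256, keeping the borrow out of the top word. */
    borrow = 0;
    for (i = 0; i < 8; i++) {
        d = (uint64_t)tmp[i] - P256[i] - borrow;
        tmp[i] = (uint32_t)d;
        borrow = (uint32_t)((d >> 32) & 1);
    }

    /*
     * mask is all-ones iff a + b < P256, i.e. the subtraction borrowed
     * more than the addition carried; mirrors "subc $carry,0,$carry".
     */
    mask = carry - borrow;

    /*
     * Conditionally add the modulus back.  Because P256 consists only of
     * 0xffffffff, 1 and 0 words, it can be synthesized from mask and its
     * negation, just like the $carry/$bi pair in the assembly.
     */
    synth[0] = mask; synth[1] = mask; synth[2] = mask;
    synth[3] = 0;    synth[4] = 0;    synth[5] = 0;
    synth[6] = (uint32_t)0 - mask;    synth[7] = mask;

    t = 0;
    for (i = 0; i < 8; i++) {
        t += (uint64_t)tmp[i] + synth[i];
        r[i] = (uint32_t)t;
        t >>= 32;                           /* final carry is discarded */
    }
}

The add-back loop may carry out of the top word when the trial subtraction wrapped around; that carry is simply dropped, exactly as the assembly's final addc chain does, because the mathematical result is already known to fit in 256 bits.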