X-Git-Url: https://git.openssl.org/gitweb/?p=openssl.git;a=blobdiff_plain;f=crypto%2Fbn%2Fasm%2Fsparcv9a-mont.pl;h=8e22a443b7f3a7014da0e89853f58e7031ea94ef;hp=1899ecb3a65812435c13419b73124323d1e89edb;hb=98939a05b6884538ba40fae2606291140f9e5839;hpb=2e21922eb6e0157fb07f4c464679d1b76b7ede5e;ds=sidebyside diff --git a/crypto/bn/asm/sparcv9a-mont.pl b/crypto/bn/asm/sparcv9a-mont.pl index 1899ecb3a6..8e22a443b7 100755 --- a/crypto/bn/asm/sparcv9a-mont.pl +++ b/crypto/bn/asm/sparcv9a-mont.pl @@ -18,8 +18,8 @@ # implementations from compatibility matrix. But the rest, whole Sun # UltraSPARC family and brand new Fujitsu's SPARC64 V, all support # VIS extension instructions used in this module. This is considered -# good enough to recommend HAL SPARC64 users [if any] to simply fall -# down to no-asm configuration. +# good enough to not care about HAL SPARC64 users [if any] who have +# integer-only pure SPARCv9 module to "fall down" to. # USI&II cores currently exhibit uniform 2x improvement [over pre- # bn_mul_mont codebase] for all key lengths and benchmarks. On USIII @@ -31,7 +31,7 @@ # is pipelined, which in turn *might* be impossible to match... On # additional note, SPARC64 V implements FP Multiply-Add instruction, # which is perfectly usable in this context... In other words, as far -# as HAL/Fujitsu SPARC64 family goes, talk to the author:-) +# as Fujitsu SPARC64 V goes, talk to the author:-) # The implementation implies following "non-natural" limitations on # input arguments: @@ -455,13 +455,18 @@ $fname: add $tp,8,$tp .L1stskip: + fdtox $dota,$dota + fdtox $dotb,$dotb + ldx [%sp+$bias+$frame+0],%o0 ldx [%sp+$bias+$frame+8],%o1 ldx [%sp+$bias+$frame+16],%o2 ldx [%sp+$bias+$frame+24],%o3 srlx %o0,16,%o7 + std $dota,[%sp+$bias+$frame+32] add %o7,%o1,%o1 + std $dotb,[%sp+$bias+$frame+40] srlx %o1,16,%o7 add %o7,%o2,%o2 srlx %o2,16,%o7 @@ -475,33 +480,28 @@ $fname: or %o1,%o0,%o0 or %o2,%o0,%o0 or %o7,%o0,%o0 ! 64-bit result + ldx [%sp+$bias+$frame+32],%o4 addcc %g1,%o0,%o0 + ldx [%sp+$bias+$frame+40],%o5 srlx %o3,16,%g1 ! 34-bit carry bcs,a %xcc,.+8 add %g1,1,%g1 stx %o0,[$tp] ! tp[j-1]= add $tp,8,$tp - - fdtox $dota,$dota - fdtox $dotb,$dotb - std $dota,[%sp+$bias+$frame+32] - std $dotb,[%sp+$bias+$frame+40] - ldx [%sp+$bias+$frame+32],%o0 - ldx [%sp+$bias+$frame+40],%o1 - srlx %o0,16,%o7 - add %o7,%o1,%o1 - and %o0,$mask,%o0 - sllx %o1,16,%o7 - or %o7,%o0,%o0 - addcc %g1,%o0,%o0 - srlx %o1,48,%g1 + srlx %o4,16,%o7 + add %o7,%o5,%o5 + and %o4,$mask,%o4 + sllx %o5,16,%o7 + or %o7,%o4,%o4 + addcc %g1,%o4,%o4 + srlx %o5,48,%g1 bcs,a %xcc,.+8 add %g1,1,%g1 mov %g1,$carry - stx %o0,[$tp] ! tp[num-1]= + stx %o4,[$tp] ! tp[num-1]= ba .Louter add $i,8,$i @@ -664,7 +664,9 @@ $fname: bz,pn %icc,.Linnerskip std $nlod,[%sp+$bias+$frame+24] -.align 32,0x1000000 + ba .Linner + nop +.align 32 .Linner: ldd [$ap_l+$j],$alo ! load a[j] in double format ldd [$ap_h+$j],$ahi @@ -719,12 +721,12 @@ $fname: or %o7,%o0,%o0 ! 64-bit result faddd $nloc,$nhia,$nloc addcc %g1,%o0,%o0 + ldx [$tp+8],%o7 ! tp[j] faddd $nlod,$nhib,$nlod srlx %o3,16,%g1 ! 34-bit carry fdtox $nloa,$nloa bcs,a %xcc,.+8 add %g1,1,%g1 - ldx [$tp+8],%o7 ! tp[j] fdtox $nlob,$nlob addcc %o7,%o0,%o0 fdtox $nloc,$nloc