Add ec/asm/ecp_nistz256-sparcv9.pl.
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # ECP_NISTZ256 module for SPARCv9.
11 #
12 # February 2015.
13 #
14 # Original ECP_NISTZ256 submission targeting x86_64 is detailed in
15 # http://eprint.iacr.org/2013/816. In the process of adaptation
16 # original .c module was made 32-bit savvy in order to make this
17 # implementation possible.
18 #
19 #                       with/without -DECP_NISTZ256_ASM
20 # UltraSPARC III        +12-18%
21 # SPARC T4              +99-550% (+66-150% on 32-bit Solaris)
22 #
23 # Ranges denote minimum and maximum improvement coefficients depending
24 # on benchmark. Lower coefficients are for ECDSA sign, server-side
25 # operation. Keep in mind that +200% means 3x improvement.
26
27 $code.=<<___;
28 #include "sparc_arch.h"
29
30 #define LOCALS  (STACK_BIAS+STACK_FRAME)
31 #ifdef  __arch64__
32 .register       %g2,#scratch
33 .register       %g3,#scratch
34 # define STACK64_FRAME  STACK_FRAME
35 # define LOCALS64       LOCALS
36 #else
37 # define STACK64_FRAME  (2047+192)
38 # define LOCALS64       STACK64_FRAME
39 #endif
40
41 .section        ".text",#alloc,#execinstr
42 ___
43 ########################################################################
44 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz256_gather_w7
45 #
46 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
47 open TABLE,"<ecp_nistz256_table.c"              or
48 open TABLE,"<${dir}../ecp_nistz256_table.c"     or
49 die "failed to open ecp_nistz256_table.c:",$!;
50
51 use integer;
52
53 foreach(<TABLE>) {
54         s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
55 }
56 close TABLE;
57
58 # See ecp_nistz256_table.c for explanation of why it's 64*16*37.
59 # 64*16*37-1 is because $#arr returns the last valid index of @arr,
60 # not the number of elements.
61 die "insane number of elements" if ($#arr != 64*16*37-1);
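# (That is 37 blocks, one per 7-bit window of the 256-bit scalar, of
# 64 points each, every point being 16 32-bit words, i.e. one 64-byte
# P256_POINT_AFFINE.)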
62
63 $code.=<<___;
64 .globl  ecp_nistz256_precomputed
65 .align  4096
66 ecp_nistz256_precomputed:
67 ___
68 ########################################################################
69 # this conversion smashes P256_POINT_AFFINE into individual bytes at a
70 # 64-byte interval, similar to
71 #       1111222233334444
72 #       1234123412341234
73 for(1..37) {
74         @tbl = splice(@arr,0,64*16);
75         for($i=0;$i<64;$i++) {
76                 undef @line;
77                 for($j=0;$j<64;$j++) {
78                         push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
79                 }
80                 $code.=".byte\t";
81                 $code.=join(',',map { sprintf "0x%02x",$_} @line);
82                 $code.="\n";
83         }
84 }
85
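########################################################################
# A small sanity sketch, illustrative only and never called: given one
# of the 4096-byte blocks emitted above as an array of byte values,
# word $w (0..15) of point $j (0..63) lies scattered at a 64-byte
# stride, which is exactly what ecp_nistz256_gather_w7 further down
# exploits with its ldub loads.
sub gather_w7_word {
	my ($block,$j,$w) = @_;		# $block = ref to 4096 byte values
	my $v = 0;
	for my $b (0..3) {		# byte 4*$w+$b of point $j sits at offset (4*$w+$b)*64+$j
		$v |= $block->[(4*$w+$b)*64+$j] << (8*$b);
	}
	return $v;
}
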
86 {{{
87 my ($rp,$ap,$bp)=map("%i$_",(0..2));
88 my @acc=map("%l$_",(0..7));
89 my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
90 my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
91 my ($rp_real,$ap_real)=("%g2","%g3");
92
93 $code.=<<___;
94 .size   ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
95 .align  64
96 .LRR:   ! 2^512 mod P precomputed for NIST P256 polynomial
97 .long   0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
98 .long   0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
99 .Lone:
100 .long   1,0,0,0,0,0,0,0
101 .asciz  "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
102
103 ! void  ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
104 .globl  ecp_nistz256_to_mont
105 .align  64
106 ecp_nistz256_to_mont:
107         save    %sp,-STACK_FRAME,%sp
108         nop
109 1:      call    .+8
110         add     %o7,.LRR-1b,$bp
111         call    __ecp_nistz256_mul_mont
112         nop
113         ret
114         restore
115 .size   ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
116
117 ! void  ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
118 .globl  ecp_nistz256_from_mont
119 .align  32
120 ecp_nistz256_from_mont:
121         save    %sp,-STACK_FRAME,%sp
122         nop
123 1:      call    .+8
124         add     %o7,.Lone-1b,$bp
125         call    __ecp_nistz256_mul_mont
126         nop
127         ret
128         restore
129 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
130
131 ! void  ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
132 !                                             const BN_ULONG %i2[8]);
133 .globl  ecp_nistz256_mul_mont
134 .align  32
135 ecp_nistz256_mul_mont:
136         save    %sp,-STACK_FRAME,%sp
137         nop
138         call    __ecp_nistz256_mul_mont
139         nop
140         ret
141         restore
142 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
143
144 ! void  ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
145 .globl  ecp_nistz256_sqr_mont
146 .align  32
147 ecp_nistz256_sqr_mont:
148         save    %sp,-STACK_FRAME,%sp
149         mov     $ap,$bp
150         call    __ecp_nistz256_mul_mont
151         nop
152         ret
153         restore
154 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
155 ___
156
157 ########################################################################
158 # A special thing to keep in mind is that $t0-$t7 hold 64-bit values,
159 # while all others are meant to hold 32-bit ones. "Meant to" means that
160 # additions to @acc[0-7] do "contaminate" the upper bits, but they are
161 # cleared before they can affect the outcome (follow 'and' with $mask).
162 # Also keep in mind that addition with carry is addition with a 32-bit
163 # carry, even though the CPU is 64-bit. [Addition with 64-bit carry was
164 # introduced in T3, see below for the VIS3 code paths.]
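# [To be precise, addccc consumes the 32-bit carry in %icc, whereas the
# addxc/addxccc instructions used in the VIS3 paths below consume the
# 64-bit carry in %xcc.]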
165
166 $code.=<<___;
167 .align  32
168 __ecp_nistz256_mul_mont:
169         ld      [$bp+0],$bi             ! b[0]
170         mov     -1,$mask
171         ld      [$ap+0],$a0
172         srl     $mask,0,$mask           ! 0xffffffff
173         ld      [$ap+4],$t1
174         ld      [$ap+8],$t2
175         ld      [$ap+12],$t3
176         ld      [$ap+16],$t4
177         ld      [$ap+20],$t5
178         ld      [$ap+24],$t6
179         ld      [$ap+28],$t7
180         mulx    $a0,$bi,$t0             ! a[0-7]*b[0], 64-bit results
181         mulx    $t1,$bi,$t1
182         mulx    $t2,$bi,$t2
183         mulx    $t3,$bi,$t3
184         mulx    $t4,$bi,$t4
185         mulx    $t5,$bi,$t5
186         mulx    $t6,$bi,$t6
187         mulx    $t7,$bi,$t7
188         srlx    $t0,32,@acc[1]          ! extract high parts
189         srlx    $t1,32,@acc[2]
190         srlx    $t2,32,@acc[3]
191         srlx    $t3,32,@acc[4]
192         srlx    $t4,32,@acc[5]
193         srlx    $t5,32,@acc[6]
194         srlx    $t6,32,@acc[7]
195         srlx    $t7,32,@acc[0]          ! "@acc[8]"
196         mov     0,$carry
197 ___
198 for($i=1;$i<8;$i++) {
199 $code.=<<___;
200         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
201         ld      [$bp+4*$i],$bi          ! b[$i]
202         ld      [$ap+4],$t1             ! re-load a[1-7]
203         addccc  @acc[2],$t2,@acc[2]
204         addccc  @acc[3],$t3,@acc[3]
205         ld      [$ap+8],$t2
206         ld      [$ap+12],$t3
207         addccc  @acc[4],$t4,@acc[4]
208         addccc  @acc[5],$t5,@acc[5]
209         ld      [$ap+16],$t4
210         ld      [$ap+20],$t5
211         addccc  @acc[6],$t6,@acc[6]
212         addccc  @acc[7],$t7,@acc[7]
213         ld      [$ap+24],$t6
214         ld      [$ap+28],$t7
215         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
216         addc    %g0,%g0,$carry
217 ___
218         # A reduction iteration is normally performed by accumulating the
219         # result of multiplying the modulus by the "magic" digit [and
220         # omitting the least significant word, which is guaranteed to
221         # be 0], but thanks to the special form of the modulus and the
222         # "magic" digit being equal to the least significant word, it can
223         # be performed with additions and subtractions alone. Indeed:
224         #
225         #        ffff.0001.0000.0000.0000.ffff.ffff.ffff
226         # *                                         abcd
227         # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
228         #
229         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
230         # rewrite above as:
231         #
232         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
233         # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
234         # -      abcd.0000.0000.0000.0000.0000.0000.abcd
235         #
236         # or marking redundant operations:
237         #
238         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
239         # + abcd.0000.abcd.0000.0000.abcd.----.----.----
240         # -      abcd.----.----.----.----.----.----.----
241
242 $code.=<<___;
243         ! multiplication-less reduction
244         addcc   @acc[3],$t0,@acc[3]     ! r[3]+=r[0]
245         addccc  @acc[4],%g0,@acc[4]     ! r[4]+=0
246          and    @acc[1],$mask,@acc[1]
247          and    @acc[2],$mask,@acc[2]
248         addccc  @acc[5],%g0,@acc[5]     ! r[5]+=0
249         addccc  @acc[6],$t0,@acc[6]     ! r[6]+=r[0]
250          and    @acc[3],$mask,@acc[3]
251          and    @acc[4],$mask,@acc[4]
252         addccc  @acc[7],%g0,@acc[7]     ! r[7]+=0
253         addccc  @acc[0],$t0,@acc[0]     ! r[8]+=r[0]    "@acc[8]"
254          and    @acc[5],$mask,@acc[5]
255          and    @acc[6],$mask,@acc[6]
256         addc    $carry,%g0,$carry       ! top-most carry
257         subcc   @acc[7],$t0,@acc[7]     ! r[7]-=r[0]
258         subccc  @acc[0],%g0,@acc[0]     ! r[8]-=0       "@acc[8]"
259         subc    $carry,%g0,$carry       ! top-most carry
260          and    @acc[7],$mask,@acc[7]
261          and    @acc[0],$mask,@acc[0]   ! "@acc[8]"
262 ___
263         push(@acc,shift(@acc));         # rotate registers to "omit" acc[0]
264 $code.=<<___;
265         mulx    $a0,$bi,$t0             ! a[0-7]*b[$i], 64-bit results
266         mulx    $t1,$bi,$t1
267         mulx    $t2,$bi,$t2
268         mulx    $t3,$bi,$t3
269         mulx    $t4,$bi,$t4
270         mulx    $t5,$bi,$t5
271         mulx    $t6,$bi,$t6
272         mulx    $t7,$bi,$t7
273         add     @acc[0],$t0,$t0         ! accumulate low parts, can't overflow
274         add     @acc[1],$t1,$t1
275         srlx    $t0,32,@acc[1]          ! extract high parts
276         add     @acc[2],$t2,$t2
277         srlx    $t1,32,@acc[2]
278         add     @acc[3],$t3,$t3
279         srlx    $t2,32,@acc[3]
280         add     @acc[4],$t4,$t4
281         srlx    $t3,32,@acc[4]
282         add     @acc[5],$t5,$t5
283         srlx    $t4,32,@acc[5]
284         add     @acc[6],$t6,$t6
285         srlx    $t5,32,@acc[6]
286         add     @acc[7],$t7,$t7
287         srlx    $t6,32,@acc[7]
288         srlx    $t7,32,@acc[0]          ! "@acc[8]"
289 ___
290 }
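
########################################################################
# Optional self-check of the "multiplication-less" reduction identity
# used above; an illustrative sketch only, the ECP_NISTZ256_SELFTEST
# switch is an assumption of this sketch and is not used anywhere else.
# Adding the magic digit d at 32-bit word positions 3, 6 and 8 while
# subtracting it at positions 0 and 7 amounts to adding d*P, so the
# value stays congruent modulo P and its least significant word becomes
# zero.
if ($ENV{ECP_NISTZ256_SELFTEST}) {
	require Math::BigInt;
	my $p = Math::BigInt->new("0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff");
	my $x = Math::BigInt->new("0x0123456789abcdef")->blsft(192)->badd(0xdeadbeef);
	my $d = $x->copy->band(0xffffffff);	# least significant word, the "magic" digit
	my $y = $x + ($d<<96) + ($d<<192) + ($d<<256) - ($d<<224) - $d;
	die "reduction identity broken" if ($y - $x) % $p != 0;
	die "low word not cleared"      if ($y & 0xffffffff) != 0;
}
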
291 $code.=<<___;
292         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
293         addccc  @acc[2],$t2,@acc[2]
294         addccc  @acc[3],$t3,@acc[3]
295         addccc  @acc[4],$t4,@acc[4]
296         addccc  @acc[5],$t5,@acc[5]
297         addccc  @acc[6],$t6,@acc[6]
298         addccc  @acc[7],$t7,@acc[7]
299         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
300         addc    %g0,%g0,$carry
301
302         addcc   @acc[3],$t0,@acc[3]     ! multiplication-less reduction
303         addccc  @acc[4],%g0,@acc[4]
304         addccc  @acc[5],%g0,@acc[5]
305         addccc  @acc[6],$t0,@acc[6]
306         addccc  @acc[7],%g0,@acc[7]
307         addccc  @acc[0],$t0,@acc[0]     ! "@acc[8]"
308         addc    $carry,%g0,$carry
309         subcc   @acc[7],$t0,@acc[7]
310         subccc  @acc[0],%g0,@acc[0]     ! "@acc[8]"
311         subc    $carry,%g0,$carry       ! top-most carry
312 ___
313         push(@acc,shift(@acc));         # rotate registers to omit acc[0]
314 $code.=<<___;
315         ! Final step is "if result > mod, subtract mod", but we do it
316         ! "other way around", namely subtract modulus from result
317         ! and if it borrowed, add modulus back.
318
319         subcc   @acc[0],-1,@acc[0]      ! subtract modulus
320         subccc  @acc[1],-1,@acc[1]
321         subccc  @acc[2],-1,@acc[2]
322         subccc  @acc[3],0,@acc[3]
323         subccc  @acc[4],0,@acc[4]
324         subccc  @acc[5],0,@acc[5]
325         subccc  @acc[6],1,@acc[6]
326         subccc  @acc[7],-1,@acc[7]
327         subc    $carry,0,$carry         ! broadcast borrow bit
328
329         ! Note that because mod has special form, i.e. consists of
330         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
331         ! using value of broadcasted borrow and the borrow bit itself.
332         ! To minimize dependency chain we first broadcast and then
333         ! extract the bit by negating (follow $bi).
334
335         addcc   @acc[0],$carry,@acc[0]  ! add modulus or zero
336         addccc  @acc[1],$carry,@acc[1]
337         neg     $carry,$bi
338         st      @acc[0],[$rp]
339         addccc  @acc[2],$carry,@acc[2]
340         st      @acc[1],[$rp+4]
341         addccc  @acc[3],0,@acc[3]
342         st      @acc[2],[$rp+8]
343         addccc  @acc[4],0,@acc[4]
344         st      @acc[3],[$rp+12]
345         addccc  @acc[5],0,@acc[5]
346         st      @acc[4],[$rp+16]
347         addccc  @acc[6],$bi,@acc[6]
348         st      @acc[5],[$rp+20]
349         addc    @acc[7],$carry,@acc[7]
350         st      @acc[6],[$rp+24]
351         retl
352         st      @acc[7],[$rp+28]
353 .size   __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
354
355 ! void  ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
356 !                                        const BN_ULONG %i2[8]);
357 .globl  ecp_nistz256_add
358 .align  32
359 ecp_nistz256_add:
360         save    %sp,-STACK_FRAME,%sp
361         ld      [$ap],@acc[0]
362         ld      [$ap+4],@acc[1]
363         ld      [$ap+8],@acc[2]
364         ld      [$ap+12],@acc[3]
365         ld      [$ap+16],@acc[4]
366         ld      [$ap+20],@acc[5]
367         ld      [$ap+24],@acc[6]
368         call    __ecp_nistz256_add
369         ld      [$ap+28],@acc[7]
370         ret
371         restore
372 .size   ecp_nistz256_add,.-ecp_nistz256_add
373
374 .align  32
375 __ecp_nistz256_add:
376         ld      [$bp+0],$t0             ! b[0]
377         ld      [$bp+4],$t1
378         ld      [$bp+8],$t2
379         ld      [$bp+12],$t3
380         addcc   @acc[0],$t0,@acc[0]
381         ld      [$bp+16],$t4
382         ld      [$bp+20],$t5
383         addccc  @acc[1],$t1,@acc[1]
384         ld      [$bp+24],$t6
385         ld      [$bp+28],$t7
386         addccc  @acc[2],$t2,@acc[2]
387         addccc  @acc[3],$t3,@acc[3]
388         addccc  @acc[4],$t4,@acc[4]
389         addccc  @acc[5],$t5,@acc[5]
390         addccc  @acc[6],$t6,@acc[6]
391         addccc  @acc[7],$t7,@acc[7]
392         subc    %g0,%g0,$carry          ! broadcast carry bit
393
394 .Lreduce_by_sub:
395
396         ! if a+b carries, subtract modulus.
397         !
398         ! Note that because mod has special form, i.e. consists of
399         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
400         ! using value of broadcasted borrow and the borrow bit itself.
401         ! To minimize dependency chain we first broadcast and then
402         ! extract the bit by negating (follow $bi).
403
404         subcc   @acc[0],$carry,@acc[0]  ! subtract synthesized modulus
405         subccc  @acc[1],$carry,@acc[1]
406         neg     $carry,$bi
407         st      @acc[0],[$rp]
408         subccc  @acc[2],$carry,@acc[2]
409         st      @acc[1],[$rp+4]
410         subccc  @acc[3],0,@acc[3]
411         st      @acc[2],[$rp+8]
412         subccc  @acc[4],0,@acc[4]
413         st      @acc[3],[$rp+12]
414         subccc  @acc[5],0,@acc[5]
415         st      @acc[4],[$rp+16]
416         subccc  @acc[6],$bi,@acc[6]
417         st      @acc[5],[$rp+20]
418         subc    @acc[7],$carry,@acc[7]
419         st      @acc[6],[$rp+24]
420         retl
421         st      @acc[7],[$rp+28]
422 .size   __ecp_nistz256_add,.-__ecp_nistz256_add
423
424 ! void  ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
425 .globl  ecp_nistz256_mul_by_2
426 .align  32
427 ecp_nistz256_mul_by_2:
428         save    %sp,-STACK_FRAME,%sp
429         ld      [$ap],@acc[0]
430         ld      [$ap+4],@acc[1]
431         ld      [$ap+8],@acc[2]
432         ld      [$ap+12],@acc[3]
433         ld      [$ap+16],@acc[4]
434         ld      [$ap+20],@acc[5]
435         ld      [$ap+24],@acc[6]
436         call    __ecp_nistz256_mul_by_2
437         ld      [$ap+28],@acc[7]
438         ret
439         restore
440 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
441
442 .align  32
443 __ecp_nistz256_mul_by_2:
444         addcc   @acc[0],@acc[0],@acc[0] ! a+a=2*a
445         addccc  @acc[1],@acc[1],@acc[1]
446         addccc  @acc[2],@acc[2],@acc[2]
447         addccc  @acc[3],@acc[3],@acc[3]
448         addccc  @acc[4],@acc[4],@acc[4]
449         addccc  @acc[5],@acc[5],@acc[5]
450         addccc  @acc[6],@acc[6],@acc[6]
451         addccc  @acc[7],@acc[7],@acc[7]
452         b       .Lreduce_by_sub
453         subc    %g0,%g0,$carry          ! broadcast carry bit
454 .size   __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
455
456 ! void  ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
457 .globl  ecp_nistz256_mul_by_3
458 .align  32
459 ecp_nistz256_mul_by_3:
460         save    %sp,-STACK_FRAME,%sp
461         ld      [$ap],@acc[0]
462         ld      [$ap+4],@acc[1]
463         ld      [$ap+8],@acc[2]
464         ld      [$ap+12],@acc[3]
465         ld      [$ap+16],@acc[4]
466         ld      [$ap+20],@acc[5]
467         ld      [$ap+24],@acc[6]
468         call    __ecp_nistz256_mul_by_3
469         ld      [$ap+28],@acc[7]
470         ret
471         restore
472 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
473
474 .align  32
475 __ecp_nistz256_mul_by_3:
476         addcc   @acc[0],@acc[0],$t0     ! a+a=2*a
477         addccc  @acc[1],@acc[1],$t1
478         addccc  @acc[2],@acc[2],$t2
479         addccc  @acc[3],@acc[3],$t3
480         addccc  @acc[4],@acc[4],$t4
481         addccc  @acc[5],@acc[5],$t5
482         addccc  @acc[6],@acc[6],$t6
483         addccc  @acc[7],@acc[7],$t7
484         subc    %g0,%g0,$carry          ! broadcast carry bit
485
486         subcc   $t0,$carry,$t0          ! .Lreduce_by_sub but without stores
487         neg     $carry,$bi
488         subccc  $t1,$carry,$t1
489         subccc  $t2,$carry,$t2
490         subccc  $t3,0,$t3
491         subccc  $t4,0,$t4
492         subccc  $t5,0,$t5
493         subccc  $t6,$bi,$t6
494         subc    $t7,$carry,$t7
495
496         addcc   $t0,@acc[0],@acc[0]     ! 2*a+a=3*a
497         addccc  $t1,@acc[1],@acc[1]
498         addccc  $t2,@acc[2],@acc[2]
499         addccc  $t3,@acc[3],@acc[3]
500         addccc  $t4,@acc[4],@acc[4]
501         addccc  $t5,@acc[5],@acc[5]
502         addccc  $t6,@acc[6],@acc[6]
503         addccc  $t7,@acc[7],@acc[7]
504         b       .Lreduce_by_sub
505         subc    %g0,%g0,$carry          ! broadcast carry bit
506 .size   __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
507
508 ! void  ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
509 !                                        const BN_ULONG %i2[8]);
510 .globl  ecp_nistz256_sub
511 .align  32
512 ecp_nistz256_sub:
513         save    %sp,-STACK_FRAME,%sp
514         ld      [$ap],@acc[0]
515         ld      [$ap+4],@acc[1]
516         ld      [$ap+8],@acc[2]
517         ld      [$ap+12],@acc[3]
518         ld      [$ap+16],@acc[4]
519         ld      [$ap+20],@acc[5]
520         ld      [$ap+24],@acc[6]
521         call    __ecp_nistz256_sub_from
522         ld      [$ap+28],@acc[7]
523         ret
524         restore
525 .size   ecp_nistz256_sub,.-ecp_nistz256_sub
526
527 ! void  ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
528 .globl  ecp_nistz256_neg
529 .align  32
530 ecp_nistz256_neg:
531         save    %sp,-STACK_FRAME,%sp
532         mov     $ap,$bp
533         mov     0,@acc[0]
534         mov     0,@acc[1]
535         mov     0,@acc[2]
536         mov     0,@acc[3]
537         mov     0,@acc[4]
538         mov     0,@acc[5]
539         mov     0,@acc[6]
540         call    __ecp_nistz256_sub_from
541         mov     0,@acc[7]
542         ret
543         restore
544 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
545
546 .align  32
547 __ecp_nistz256_sub_from:
548         ld      [$bp+0],$t0             ! b[0]
549         ld      [$bp+4],$t1
550         ld      [$bp+8],$t2
551         ld      [$bp+12],$t3
552         subcc   @acc[0],$t0,@acc[0]
553         ld      [$bp+16],$t4
554         ld      [$bp+20],$t5
555         subccc  @acc[1],$t1,@acc[1]
556         subccc  @acc[2],$t2,@acc[2]
557         ld      [$bp+24],$t6
558         ld      [$bp+28],$t7
559         subccc  @acc[3],$t3,@acc[3]
560         subccc  @acc[4],$t4,@acc[4]
561         subccc  @acc[5],$t5,@acc[5]
562         subccc  @acc[6],$t6,@acc[6]
563         subccc  @acc[7],$t7,@acc[7]
564         subc    %g0,%g0,$carry          ! broadcast borrow bit
565
566 .Lreduce_by_add:
567
568         ! if a-b borrows, add modulus.
569         !
570         ! Note that because mod has special form, i.e. consists of
571         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
572         ! using value of broadcasted borrow and the borrow bit itself.
573         ! To minimize dependency chain we first broadcast and then
574         ! extract the bit by negating (follow $bi).
575
576         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
577         addccc  @acc[1],$carry,@acc[1]
578         neg     $carry,$bi
579         st      @acc[0],[$rp]
580         addccc  @acc[2],$carry,@acc[2]
581         st      @acc[1],[$rp+4]
582         addccc  @acc[3],0,@acc[3]
583         st      @acc[2],[$rp+8]
584         addccc  @acc[4],0,@acc[4]
585         st      @acc[3],[$rp+12]
586         addccc  @acc[5],0,@acc[5]
587         st      @acc[4],[$rp+16]
588         addccc  @acc[6],$bi,@acc[6]
589         st      @acc[5],[$rp+20]
590         addc    @acc[7],$carry,@acc[7]
591         st      @acc[6],[$rp+24]
592         retl
593         st      @acc[7],[$rp+28]
594 .size   __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
595
596 .align  32
597 __ecp_nistz256_sub_morf:
598         ld      [$bp+0],$t0             ! b[0]
599         ld      [$bp+4],$t1
600         ld      [$bp+8],$t2
601         ld      [$bp+12],$t3
602         subcc   $t0,@acc[0],@acc[0]
603         ld      [$bp+16],$t4
604         ld      [$bp+20],$t5
605         subccc  $t1,@acc[1],@acc[1]
606         subccc  $t2,@acc[2],@acc[2]
607         ld      [$bp+24],$t6
608         ld      [$bp+28],$t7
609         subccc  $t3,@acc[3],@acc[3]
610         subccc  $t4,@acc[4],@acc[4]
611         subccc  $t5,@acc[5],@acc[5]
612         subccc  $t6,@acc[6],@acc[6]
613         subccc  $t7,@acc[7],@acc[7]
614         b       .Lreduce_by_add
615         subc    %g0,%g0,$carry          ! broadcast borrow bit
616 .size   __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
617
618 ! void  ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
619 .globl  ecp_nistz256_div_by_2
620 .align  32
621 ecp_nistz256_div_by_2:
622         save    %sp,-STACK_FRAME,%sp
623         ld      [$ap],@acc[0]
624         ld      [$ap+4],@acc[1]
625         ld      [$ap+8],@acc[2]
626         ld      [$ap+12],@acc[3]
627         ld      [$ap+16],@acc[4]
628         ld      [$ap+20],@acc[5]
629         ld      [$ap+24],@acc[6]
630         call    __ecp_nistz256_div_by_2
631         ld      [$ap+28],@acc[7]
632         ret
633         restore
634 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
635
636 .align  32
637 __ecp_nistz256_div_by_2:
638         ! ret = (a is odd ? a+mod : a) >> 1
639
640         and     @acc[0],1,$bi
641         neg     $bi,$carry
642         addcc   @acc[0],$carry,@acc[0]
643         addccc  @acc[1],$carry,@acc[1]
644         addccc  @acc[2],$carry,@acc[2]
645         addccc  @acc[3],0,@acc[3]
646         addccc  @acc[4],0,@acc[4]
647         addccc  @acc[5],0,@acc[5]
648         addccc  @acc[6],$bi,@acc[6]
649         addccc  @acc[7],$carry,@acc[7]
650         addc    %g0,%g0,$carry
651
652         ! ret >>= 1
653
654         srl     @acc[0],1,@acc[0]
655         sll     @acc[1],31,$t0
656         srl     @acc[1],1,@acc[1]
657         or      @acc[0],$t0,@acc[0]
658         sll     @acc[2],31,$t1
659         srl     @acc[2],1,@acc[2]
660         or      @acc[1],$t1,@acc[1]
661         sll     @acc[3],31,$t2
662         st      @acc[0],[$rp]
663         srl     @acc[3],1,@acc[3]
664         or      @acc[2],$t2,@acc[2]
665         sll     @acc[4],31,$t3
666         st      @acc[1],[$rp+4]
667         srl     @acc[4],1,@acc[4]
668         or      @acc[3],$t3,@acc[3]
669         sll     @acc[5],31,$t4
670         st      @acc[2],[$rp+8]
671         srl     @acc[5],1,@acc[5]
672         or      @acc[4],$t4,@acc[4]
673         sll     @acc[6],31,$t5
674         st      @acc[3],[$rp+12]
675         srl     @acc[6],1,@acc[6]
676         or      @acc[5],$t5,@acc[5]
677         sll     @acc[7],31,$t6
678         st      @acc[4],[$rp+16]
679         srl     @acc[7],1,@acc[7]
680         or      @acc[6],$t6,@acc[6]
681         sll     $carry,31,$t7
682         st      @acc[5],[$rp+20]
683         or      @acc[7],$t7,@acc[7]
684         st      @acc[6],[$rp+24]
685         retl
686         st      @acc[7],[$rp+28]
687 .size   __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
688 ___
689
690 ########################################################################
691 # The following subroutines are "literal" implementations of those found
692 # in ecp_nistz256.c.
693 #
694 ########################################################################
695 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
696 #
697 {
698 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
699 # above map() describes stack layout with 4 temporary
700 # 256-bit vectors on top.
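# Concretely, S, M, Zsqr and tmp0 live at %sp+LOCALS+0, +32, +64 and
# +96 respectively, inside the 32*4 bytes of scratch reserved by the
# "save %sp,-STACK_FRAME-32*4,%sp" below.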
701
702 $code.=<<___;
703 #ifdef __PIC__
704 SPARC_PIC_THUNK(%g1)
705 #endif
706
707 .globl  ecp_nistz256_point_double
708 .align  32
709 ecp_nistz256_point_double:
710         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
711         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
712         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
713         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
714         be      ecp_nistz256_point_double_vis3
715         nop
716
717         save    %sp,-STACK_FRAME-32*4,%sp
718
719         mov     $rp,$rp_real
720         mov     $ap,$ap_real
721
722         ld      [$ap+32],@acc[0]
723         ld      [$ap+32+4],@acc[1]
724         ld      [$ap+32+8],@acc[2]
725         ld      [$ap+32+12],@acc[3]
726         ld      [$ap+32+16],@acc[4]
727         ld      [$ap+32+20],@acc[5]
728         ld      [$ap+32+24],@acc[6]
729         ld      [$ap+32+28],@acc[7]
730         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
731         add     %sp,LOCALS+$S,$rp
732
733         add     $ap_real,64,$bp
734         add     $ap_real,64,$ap
735         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
736         add     %sp,LOCALS+$Zsqr,$rp
737
738         add     $ap_real,0,$bp
739         call    __ecp_nistz256_add      ! p256_add(M, Zsqr, in_x);
740         add     %sp,LOCALS+$M,$rp
741
742         add     %sp,LOCALS+$S,$bp
743         add     %sp,LOCALS+$S,$ap
744         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
745         add     %sp,LOCALS+$S,$rp
746
747         ld      [$ap_real],@acc[0]
748         add     %sp,LOCALS+$Zsqr,$bp
749         ld      [$ap_real+4],@acc[1]
750         ld      [$ap_real+8],@acc[2]
751         ld      [$ap_real+12],@acc[3]
752         ld      [$ap_real+16],@acc[4]
753         ld      [$ap_real+20],@acc[5]
754         ld      [$ap_real+24],@acc[6]
755         ld      [$ap_real+28],@acc[7]
756         call    __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
757         add     %sp,LOCALS+$Zsqr,$rp
758
759         add     $ap_real,32,$bp
760         add     $ap_real,64,$ap
761         call    __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
762         add     %sp,LOCALS+$tmp0,$rp
763
764         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
765         add     $rp_real,64,$rp
766
767         add     %sp,LOCALS+$Zsqr,$bp
768         add     %sp,LOCALS+$M,$ap
769         call    __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
770         add     %sp,LOCALS+$M,$rp
771
772         call    __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
773         add     %sp,LOCALS+$M,$rp
774
775         add     %sp,LOCALS+$S,$bp
776         add     %sp,LOCALS+$S,$ap
777         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
778         add     %sp,LOCALS+$tmp0,$rp
779
780         call    __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
781         add     $rp_real,32,$rp
782
783         add     $ap_real,0,$bp
784         add     %sp,LOCALS+$S,$ap
785         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
786         add     %sp,LOCALS+$S,$rp
787
788         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
789         add     %sp,LOCALS+$tmp0,$rp
790
791         add     %sp,LOCALS+$M,$bp
792         add     %sp,LOCALS+$M,$ap
793         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
794         add     $rp_real,0,$rp
795
796         add     %sp,LOCALS+$tmp0,$bp
797         call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
798         add     $rp_real,0,$rp
799
800         add     %sp,LOCALS+$S,$bp
801         call    __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
802         add     %sp,LOCALS+$S,$rp
803
804         add     %sp,LOCALS+$M,$bp
805         add     %sp,LOCALS+$S,$ap
806         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
807         add     %sp,LOCALS+$S,$rp
808
809         add     $rp_real,32,$bp
810         call    __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
811         add     $rp_real,32,$rp
812
813         ret
814         restore
815 .size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
816 ___
817 }
818
819 ########################################################################
820 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
821 #                             const P256_POINT *in2);
822 {
823 my ($res_x,$res_y,$res_z,
824     $H,$Hsqr,$R,$Rsqr,$Hcub,
825     $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
826 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
827
828 # above map() describes stack layout with 12 temporary
829 # 256-bit vectors on top. Then we reserve some space for
830 # !in1infty, !in2infty, result of check for zero and return pointer.
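# The extra 32 bytes of the frame hold, relative to %fp+STACK_BIAS: the
# return pointer at -8, !in2infty at -12, !in1infty at -16 and the
# "is R zero", i.e. S1==S2, flag at -20, matching the stx/st below.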
831
832 my $bp_real=$rp_real;
833
834 $code.=<<___;
835 .globl  ecp_nistz256_point_add
836 .align  32
837 ecp_nistz256_point_add:
838         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
839         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
840         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
841         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
842         be      ecp_nistz256_point_add_vis3
843         nop
844
845         save    %sp,-STACK_FRAME-32*12-32,%sp
846
847         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
848         mov     $ap,$ap_real
849         mov     $bp,$bp_real
850
851         ld      [$bp],@acc[0]           ! in2_x
852         ld      [$bp+4],@acc[1]
853         ld      [$bp+8],@acc[2]
854         ld      [$bp+12],@acc[3]
855         ld      [$bp+16],@acc[4]
856         ld      [$bp+20],@acc[5]
857         ld      [$bp+24],@acc[6]
858         ld      [$bp+28],@acc[7]
859         ld      [$bp+32],$t0            ! in2_y
860         ld      [$bp+32+4],$t1
861         ld      [$bp+32+8],$t2
862         ld      [$bp+32+12],$t3
863         ld      [$bp+32+16],$t4
864         ld      [$bp+32+20],$t5
865         ld      [$bp+32+24],$t6
866         ld      [$bp+32+28],$t7
867         or      @acc[1],@acc[0],@acc[0]
868         or      @acc[3],@acc[2],@acc[2]
869         or      @acc[5],@acc[4],@acc[4]
870         or      @acc[7],@acc[6],@acc[6]
871         or      @acc[2],@acc[0],@acc[0]
872         or      @acc[6],@acc[4],@acc[4]
873         or      @acc[4],@acc[0],@acc[0]
874         or      $t1,$t0,$t0
875         or      $t3,$t2,$t2
876         or      $t5,$t4,$t4
877         or      $t7,$t6,$t6
878         or      $t2,$t0,$t0
879         or      $t6,$t4,$t4
880         or      $t4,$t0,$t0
881         or      @acc[0],$t0,$t0         ! !in2infty
882         movrnz  $t0,-1,$t0
883         st      $t0,[%fp+STACK_BIAS-12]
884
885         ld      [$ap],@acc[0]           ! in1_x
886         ld      [$ap+4],@acc[1]
887         ld      [$ap+8],@acc[2]
888         ld      [$ap+12],@acc[3]
889         ld      [$ap+16],@acc[4]
890         ld      [$ap+20],@acc[5]
891         ld      [$ap+24],@acc[6]
892         ld      [$ap+28],@acc[7]
893         ld      [$ap+32],$t0            ! in1_y
894         ld      [$ap+32+4],$t1
895         ld      [$ap+32+8],$t2
896         ld      [$ap+32+12],$t3
897         ld      [$ap+32+16],$t4
898         ld      [$ap+32+20],$t5
899         ld      [$ap+32+24],$t6
900         ld      [$ap+32+28],$t7
901         or      @acc[1],@acc[0],@acc[0]
902         or      @acc[3],@acc[2],@acc[2]
903         or      @acc[5],@acc[4],@acc[4]
904         or      @acc[7],@acc[6],@acc[6]
905         or      @acc[2],@acc[0],@acc[0]
906         or      @acc[6],@acc[4],@acc[4]
907         or      @acc[4],@acc[0],@acc[0]
908         or      $t1,$t0,$t0
909         or      $t3,$t2,$t2
910         or      $t5,$t4,$t4
911         or      $t7,$t6,$t6
912         or      $t2,$t0,$t0
913         or      $t6,$t4,$t4
914         or      $t4,$t0,$t0
915         or      @acc[0],$t0,$t0         ! !in1infty
916         movrnz  $t0,-1,$t0
917         st      $t0,[%fp+STACK_BIAS-16]
918
919         add     $bp_real,64,$bp
920         add     $bp_real,64,$ap
921         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
922         add     %sp,LOCALS+$Z2sqr,$rp
923
924         add     $ap_real,64,$bp
925         add     $ap_real,64,$ap
926         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
927         add     %sp,LOCALS+$Z1sqr,$rp
928
929         add     $bp_real,64,$bp
930         add     %sp,LOCALS+$Z2sqr,$ap
931         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
932         add     %sp,LOCALS+$S1,$rp
933
934         add     $ap_real,64,$bp
935         add     %sp,LOCALS+$Z1sqr,$ap
936         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
937         add     %sp,LOCALS+$S2,$rp
938
939         add     $ap_real,32,$bp
940         add     %sp,LOCALS+$S1,$ap
941         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
942         add     %sp,LOCALS+$S1,$rp
943
944         add     $bp_real,32,$bp
945         add     %sp,LOCALS+$S2,$ap
946         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
947         add     %sp,LOCALS+$S2,$rp
948
949         add     %sp,LOCALS+$S1,$bp
950         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
951         add     %sp,LOCALS+$R,$rp
952
953         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
954         or      @acc[3],@acc[2],@acc[2]
955         or      @acc[5],@acc[4],@acc[4]
956         or      @acc[7],@acc[6],@acc[6]
957         or      @acc[2],@acc[0],@acc[0]
958         or      @acc[6],@acc[4],@acc[4]
959         or      @acc[4],@acc[0],@acc[0]
960         st      @acc[0],[%fp+STACK_BIAS-20]
961
962         add     $ap_real,0,$bp
963         add     %sp,LOCALS+$Z2sqr,$ap
964         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
965         add     %sp,LOCALS+$U1,$rp
966
967         add     $bp_real,0,$bp
968         add     %sp,LOCALS+$Z1sqr,$ap
969         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
970         add     %sp,LOCALS+$U2,$rp
971
972         add     %sp,LOCALS+$U1,$bp
973         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
974         add     %sp,LOCALS+$H,$rp
975
976         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
977         or      @acc[3],@acc[2],@acc[2]
978         or      @acc[5],@acc[4],@acc[4]
979         or      @acc[7],@acc[6],@acc[6]
980         or      @acc[2],@acc[0],@acc[0]
981         or      @acc[6],@acc[4],@acc[4]
982         orcc    @acc[4],@acc[0],@acc[0]
983
984         bne,pt  %icc,.Ladd_proceed      ! is_equal(U1,U2)?
985         nop
986
987         ld      [%fp+STACK_BIAS-12],$t0
988         ld      [%fp+STACK_BIAS-16],$t1
989         ld      [%fp+STACK_BIAS-20],$t2
990         andcc   $t0,$t1,%g0
991         be,pt   %icc,.Ladd_proceed      ! (in1infty || in2infty)?
992         nop
993         andcc   $t2,$t2,%g0
994         be,pt   %icc,.Ladd_proceed      ! is_equal(S1,S2)?
995         nop
996
997         ldx     [%fp+STACK_BIAS-8],$rp
998         st      %g0,[$rp]
999         st      %g0,[$rp+4]
1000         st      %g0,[$rp+8]
1001         st      %g0,[$rp+12]
1002         st      %g0,[$rp+16]
1003         st      %g0,[$rp+20]
1004         st      %g0,[$rp+24]
1005         st      %g0,[$rp+28]
1006         st      %g0,[$rp+32]
1007         st      %g0,[$rp+32+4]
1008         st      %g0,[$rp+32+8]
1009         st      %g0,[$rp+32+12]
1010         st      %g0,[$rp+32+16]
1011         st      %g0,[$rp+32+20]
1012         st      %g0,[$rp+32+24]
1013         st      %g0,[$rp+32+28]
1014         st      %g0,[$rp+64]
1015         st      %g0,[$rp+64+4]
1016         st      %g0,[$rp+64+8]
1017         st      %g0,[$rp+64+12]
1018         st      %g0,[$rp+64+16]
1019         st      %g0,[$rp+64+20]
1020         st      %g0,[$rp+64+24]
1021         st      %g0,[$rp+64+28]
1022         b       .Ladd_done
1023         nop
1024
1025 .align  16
1026 .Ladd_proceed:
1027         add     %sp,LOCALS+$R,$bp
1028         add     %sp,LOCALS+$R,$ap
1029         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1030         add     %sp,LOCALS+$Rsqr,$rp
1031
1032         add     $ap_real,64,$bp
1033         add     %sp,LOCALS+$H,$ap
1034         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1035         add     %sp,LOCALS+$res_z,$rp
1036
1037         add     %sp,LOCALS+$H,$bp
1038         add     %sp,LOCALS+$H,$ap
1039         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1040         add     %sp,LOCALS+$Hsqr,$rp
1041
1042         add     $bp_real,64,$bp
1043         add     %sp,LOCALS+$res_z,$ap
1044         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
1045         add     %sp,LOCALS+$res_z,$rp
1046
1047         add     %sp,LOCALS+$H,$bp
1048         add     %sp,LOCALS+$Hsqr,$ap
1049         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1050         add     %sp,LOCALS+$Hcub,$rp
1051
1052         add     %sp,LOCALS+$U1,$bp
1053         add     %sp,LOCALS+$Hsqr,$ap
1054         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1055         add     %sp,LOCALS+$U2,$rp
1056
1057         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1058         add     %sp,LOCALS+$Hsqr,$rp
1059
1060         add     %sp,LOCALS+$Rsqr,$bp
1061         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1062         add     %sp,LOCALS+$res_x,$rp
1063
1064         add     %sp,LOCALS+$Hcub,$bp
1065         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1066         add     %sp,LOCALS+$res_x,$rp
1067
1068         add     %sp,LOCALS+$U2,$bp
1069         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1070         add     %sp,LOCALS+$res_y,$rp
1071
1072         add     %sp,LOCALS+$Hcub,$bp
1073         add     %sp,LOCALS+$S1,$ap
1074         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1075         add     %sp,LOCALS+$S2,$rp
1076
1077         add     %sp,LOCALS+$R,$bp
1078         add     %sp,LOCALS+$res_y,$ap
1079         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1080         add     %sp,LOCALS+$res_y,$rp
1081
1082         add     %sp,LOCALS+$S2,$bp
1083         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1084         add     %sp,LOCALS+$res_y,$rp
1085
1086         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1087         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1088         ldx     [%fp+STACK_BIAS-8],$rp
1089 ___
1090 for($i=0;$i<96;$i+=8) {                 # conditional moves
1091 $code.=<<___;
1092         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1093         ld      [%sp+LOCALS+$i+4],@acc[1]
1094         ld      [$bp_real+$i],@acc[2]           ! in2
1095         ld      [$bp_real+$i+4],@acc[3]
1096         ld      [$ap_real+$i],@acc[4]           ! in1
1097         ld      [$ap_real+$i+4],@acc[5]
1098         movrz   $t1,@acc[2],@acc[0]
1099         movrz   $t1,@acc[3],@acc[1]
1100         movrz   $t2,@acc[4],@acc[0]
1101         movrz   $t2,@acc[5],@acc[1]
1102         st      @acc[0],[$rp+$i]
1103         st      @acc[1],[$rp+$i+4]
1104 ___
1105 }
1106 $code.=<<___;
1107 .Ladd_done:
1108         ret
1109         restore
1110 .size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
1111 ___
1112 }
1113
1114 ########################################################################
1115 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1116 #                                    const P256_POINT_AFFINE *in2);
1117 {
1118 my ($res_x,$res_y,$res_z,
1119     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1120 my $Z1sqr = $S2;
1121 # above map() describes stack layout with 10 temporary
1122 # 256-bit vectors on top. Then we reserve some space for
1123 # !in1infty, !in2infty, result of check for zero and return pointer.
1124
1125 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
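# @ONE_mont is the scalar 1 in Montgomery representation, that is
# 2^256 mod P = 2^224-2^192-2^96+1, written as signed 32-bit words with
# the least significant one first; it provides the implicit Z
# coordinate of the affine in2 in the conditional moves at the end.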
1126 my $bp_real=$rp_real;
1127
1128 $code.=<<___;
1129 .globl  ecp_nistz256_point_add_affine
1130 .align  32
1131 ecp_nistz256_point_add_affine:
1132         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1133         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
1134         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1135         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1136         be      ecp_nistz256_point_add_affine_vis3
1137         nop
1138
1139         save    %sp,-STACK_FRAME-32*10-32,%sp
1140
1141         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
1142         mov     $ap,$ap_real
1143         mov     $bp,$bp_real
1144
1145         ld      [$ap],@acc[0]           ! in1_x
1146         ld      [$ap+4],@acc[1]
1147         ld      [$ap+8],@acc[2]
1148         ld      [$ap+12],@acc[3]
1149         ld      [$ap+16],@acc[4]
1150         ld      [$ap+20],@acc[5]
1151         ld      [$ap+24],@acc[6]
1152         ld      [$ap+28],@acc[7]
1153         ld      [$ap+32],$t0            ! in1_y
1154         ld      [$ap+32+4],$t1
1155         ld      [$ap+32+8],$t2
1156         ld      [$ap+32+12],$t3
1157         ld      [$ap+32+16],$t4
1158         ld      [$ap+32+20],$t5
1159         ld      [$ap+32+24],$t6
1160         ld      [$ap+32+28],$t7
1161         or      @acc[1],@acc[0],@acc[0]
1162         or      @acc[3],@acc[2],@acc[2]
1163         or      @acc[5],@acc[4],@acc[4]
1164         or      @acc[7],@acc[6],@acc[6]
1165         or      @acc[2],@acc[0],@acc[0]
1166         or      @acc[6],@acc[4],@acc[4]
1167         or      @acc[4],@acc[0],@acc[0]
1168         or      $t1,$t0,$t0
1169         or      $t3,$t2,$t2
1170         or      $t5,$t4,$t4
1171         or      $t7,$t6,$t6
1172         or      $t2,$t0,$t0
1173         or      $t6,$t4,$t4
1174         or      $t4,$t0,$t0
1175         or      @acc[0],$t0,$t0         ! !in1infty
1176         movrnz  $t0,-1,$t0
1177         st      $t0,[%fp+STACK_BIAS-16]
1178
1179         ld      [$bp],@acc[0]           ! in2_x
1180         ld      [$bp+4],@acc[1]
1181         ld      [$bp+8],@acc[2]
1182         ld      [$bp+12],@acc[3]
1183         ld      [$bp+16],@acc[4]
1184         ld      [$bp+20],@acc[5]
1185         ld      [$bp+24],@acc[6]
1186         ld      [$bp+28],@acc[7]
1187         ld      [$bp+32],$t0            ! in2_y
1188         ld      [$bp+32+4],$t1
1189         ld      [$bp+32+8],$t2
1190         ld      [$bp+32+12],$t3
1191         ld      [$bp+32+16],$t4
1192         ld      [$bp+32+20],$t5
1193         ld      [$bp+32+24],$t6
1194         ld      [$bp+32+28],$t7
1195         or      @acc[1],@acc[0],@acc[0]
1196         or      @acc[3],@acc[2],@acc[2]
1197         or      @acc[5],@acc[4],@acc[4]
1198         or      @acc[7],@acc[6],@acc[6]
1199         or      @acc[2],@acc[0],@acc[0]
1200         or      @acc[6],@acc[4],@acc[4]
1201         or      @acc[4],@acc[0],@acc[0]
1202         or      $t1,$t0,$t0
1203         or      $t3,$t2,$t2
1204         or      $t5,$t4,$t4
1205         or      $t7,$t6,$t6
1206         or      $t2,$t0,$t0
1207         or      $t6,$t4,$t4
1208         or      $t4,$t0,$t0
1209         or      @acc[0],$t0,$t0         ! !in2infty
1210         movrnz  $t0,-1,$t0
1211         st      $t0,[%fp+STACK_BIAS-12]
1212
1213         add     $ap_real,64,$bp
1214         add     $ap_real,64,$ap
1215         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1216         add     %sp,LOCALS+$Z1sqr,$rp
1217
1218         add     $bp_real,0,$bp
1219         add     %sp,LOCALS+$Z1sqr,$ap
1220         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1221         add     %sp,LOCALS+$U2,$rp
1222
1223         add     $ap_real,0,$bp
1224         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1225         add     %sp,LOCALS+$H,$rp
1226
1227         add     $ap_real,64,$bp
1228         add     %sp,LOCALS+$Z1sqr,$ap
1229         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1230         add     %sp,LOCALS+$S2,$rp
1231
1232         add     $ap_real,64,$bp
1233         add     %sp,LOCALS+$H,$ap
1234         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1235         add     %sp,LOCALS+$res_z,$rp
1236
1237         add     $bp_real,32,$bp
1238         add     %sp,LOCALS+$S2,$ap
1239         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1240         add     %sp,LOCALS+$S2,$rp
1241
1242         add     $ap_real,32,$bp
1243         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1244         add     %sp,LOCALS+$R,$rp
1245
1246         add     %sp,LOCALS+$H,$bp
1247         add     %sp,LOCALS+$H,$ap
1248         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1249         add     %sp,LOCALS+$Hsqr,$rp
1250
1251         add     %sp,LOCALS+$R,$bp
1252         add     %sp,LOCALS+$R,$ap
1253         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1254         add     %sp,LOCALS+$Rsqr,$rp
1255
1256         add     %sp,LOCALS+$H,$bp
1257         add     %sp,LOCALS+$Hsqr,$ap
1258         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1259         add     %sp,LOCALS+$Hcub,$rp
1260
1261         add     $ap_real,0,$bp
1262         add     %sp,LOCALS+$Hsqr,$ap
1263         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1264         add     %sp,LOCALS+$U2,$rp
1265
1266         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1267         add     %sp,LOCALS+$Hsqr,$rp
1268
1269         add     %sp,LOCALS+$Rsqr,$bp
1270         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1271         add     %sp,LOCALS+$res_x,$rp
1272
1273         add     %sp,LOCALS+$Hcub,$bp
1274         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1275         add     %sp,LOCALS+$res_x,$rp
1276
1277         add     %sp,LOCALS+$U2,$bp
1278         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1279         add     %sp,LOCALS+$res_y,$rp
1280
1281         add     $ap_real,32,$bp
1282         add     %sp,LOCALS+$Hcub,$ap
1283         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1284         add     %sp,LOCALS+$S2,$rp
1285
1286         add     %sp,LOCALS+$R,$bp
1287         add     %sp,LOCALS+$res_y,$ap
1288         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1289         add     %sp,LOCALS+$res_y,$rp
1290
1291         add     %sp,LOCALS+$S2,$bp
1292         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1293         add     %sp,LOCALS+$res_y,$rp
1294
1295         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1296         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1297         ldx     [%fp+STACK_BIAS-8],$rp
1298 ___
1299 for($i=0;$i<64;$i+=8) {                 # conditional moves
1300 $code.=<<___;
1301         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1302         ld      [%sp+LOCALS+$i+4],@acc[1]
1303         ld      [$bp_real+$i],@acc[2]           ! in2
1304         ld      [$bp_real+$i+4],@acc[3]
1305         ld      [$ap_real+$i],@acc[4]           ! in1
1306         ld      [$ap_real+$i+4],@acc[5]
1307         movrz   $t1,@acc[2],@acc[0]
1308         movrz   $t1,@acc[3],@acc[1]
1309         movrz   $t2,@acc[4],@acc[0]
1310         movrz   $t2,@acc[5],@acc[1]
1311         st      @acc[0],[$rp+$i]
1312         st      @acc[1],[$rp+$i+4]
1313 ___
1314 }
1315 for(;$i<96;$i+=8) {
1316 my $j=($i-64)/4;
1317 $code.=<<___;
1318         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1319         ld      [%sp+LOCALS+$i+4],@acc[1]
1320         ld      [$ap_real+$i],@acc[4]           ! in1
1321         ld      [$ap_real+$i+4],@acc[5]
1322         movrz   $t1,@ONE_mont[$j],@acc[0]
1323         movrz   $t1,@ONE_mont[$j+1],@acc[1]
1324         movrz   $t2,@acc[4],@acc[0]
1325         movrz   $t2,@acc[5],@acc[1]
1326         st      @acc[0],[$rp+$i]
1327         st      @acc[1],[$rp+$i+4]
1328 ___
1329 }
1330 $code.=<<___;
1331         ret
1332         restore
1333 .size   ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1334 ___
1335 }                                                               }}}
1336 {{{
1337 my ($out,$inp,$index)=map("%i$_",(0..2));
1338 my $mask="%o0";
1339
1340 $code.=<<___;
1341 ! void  ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1342 !                                         int %i2);
1343 .globl  ecp_nistz256_scatter_w5
1344 .align  32
1345 ecp_nistz256_scatter_w5:
1346         save    %sp,-STACK_FRAME,%sp
1347
1348         sll     $index,2,$index
1349         add     $out,$index,$out
1350
1351         ld      [$inp],%l0              ! X
1352         ld      [$inp+4],%l1
1353         ld      [$inp+8],%l2
1354         ld      [$inp+12],%l3
1355         ld      [$inp+16],%l4
1356         ld      [$inp+20],%l5
1357         ld      [$inp+24],%l6
1358         ld      [$inp+28],%l7
1359         add     $inp,32,$inp
1360         st      %l0,[$out+64*0-4]
1361         st      %l1,[$out+64*1-4]
1362         st      %l2,[$out+64*2-4]
1363         st      %l3,[$out+64*3-4]
1364         st      %l4,[$out+64*4-4]
1365         st      %l5,[$out+64*5-4]
1366         st      %l6,[$out+64*6-4]
1367         st      %l7,[$out+64*7-4]
1368         add     $out,64*8,$out
1369
1370         ld      [$inp],%l0              ! Y
1371         ld      [$inp+4],%l1
1372         ld      [$inp+8],%l2
1373         ld      [$inp+12],%l3
1374         ld      [$inp+16],%l4
1375         ld      [$inp+20],%l5
1376         ld      [$inp+24],%l6
1377         ld      [$inp+28],%l7
1378         add     $inp,32,$inp
1379         st      %l0,[$out+64*0-4]
1380         st      %l1,[$out+64*1-4]
1381         st      %l2,[$out+64*2-4]
1382         st      %l3,[$out+64*3-4]
1383         st      %l4,[$out+64*4-4]
1384         st      %l5,[$out+64*5-4]
1385         st      %l6,[$out+64*6-4]
1386         st      %l7,[$out+64*7-4]
1387         add     $out,64*8,$out
1388
1389         ld      [$inp],%l0              ! Z
1390         ld      [$inp+4],%l1
1391         ld      [$inp+8],%l2
1392         ld      [$inp+12],%l3
1393         ld      [$inp+16],%l4
1394         ld      [$inp+20],%l5
1395         ld      [$inp+24],%l6
1396         ld      [$inp+28],%l7
1397         st      %l0,[$out+64*0-4]
1398         st      %l1,[$out+64*1-4]
1399         st      %l2,[$out+64*2-4]
1400         st      %l3,[$out+64*3-4]
1401         st      %l4,[$out+64*4-4]
1402         st      %l5,[$out+64*5-4]
1403         st      %l6,[$out+64*6-4]
1404         st      %l7,[$out+64*7-4]
1405
1406         ret
1407         restore
1408 .size   ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1409
1410 ! void  ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1411 !                                              int %i2);
1412 .globl  ecp_nistz256_gather_w5
1413 .align  32
1414 ecp_nistz256_gather_w5:
1415         save    %sp,-STACK_FRAME,%sp
1416
1417         neg     $index,$mask
1418         srax    $mask,63,$mask
1419
1420         add     $index,$mask,$index
1421         sll     $index,2,$index
1422         add     $inp,$index,$inp
1423
1424         ld      [$inp+64*0],%l0
1425         ld      [$inp+64*1],%l1
1426         ld      [$inp+64*2],%l2
1427         ld      [$inp+64*3],%l3
1428         ld      [$inp+64*4],%l4
1429         ld      [$inp+64*5],%l5
1430         ld      [$inp+64*6],%l6
1431         ld      [$inp+64*7],%l7
1432         add     $inp,64*8,$inp
1433         and     %l0,$mask,%l0
1434         and     %l1,$mask,%l1
1435         st      %l0,[$out]              ! X
1436         and     %l2,$mask,%l2
1437         st      %l1,[$out+4]
1438         and     %l3,$mask,%l3
1439         st      %l2,[$out+8]
1440         and     %l4,$mask,%l4
1441         st      %l3,[$out+12]
1442         and     %l5,$mask,%l5
1443         st      %l4,[$out+16]
1444         and     %l6,$mask,%l6
1445         st      %l5,[$out+20]
1446         and     %l7,$mask,%l7
1447         st      %l6,[$out+24]
1448         st      %l7,[$out+28]
1449         add     $out,32,$out
1450
1451         ld      [$inp+64*0],%l0
1452         ld      [$inp+64*1],%l1
1453         ld      [$inp+64*2],%l2
1454         ld      [$inp+64*3],%l3
1455         ld      [$inp+64*4],%l4
1456         ld      [$inp+64*5],%l5
1457         ld      [$inp+64*6],%l6
1458         ld      [$inp+64*7],%l7
1459         add     $inp,64*8,$inp
1460         and     %l0,$mask,%l0
1461         and     %l1,$mask,%l1
1462         st      %l0,[$out]              ! Y
1463         and     %l2,$mask,%l2
1464         st      %l1,[$out+4]
1465         and     %l3,$mask,%l3
1466         st      %l2,[$out+8]
1467         and     %l4,$mask,%l4
1468         st      %l3,[$out+12]
1469         and     %l5,$mask,%l5
1470         st      %l4,[$out+16]
1471         and     %l6,$mask,%l6
1472         st      %l5,[$out+20]
1473         and     %l7,$mask,%l7
1474         st      %l6,[$out+24]
1475         st      %l7,[$out+28]
1476         add     $out,32,$out
1477
1478         ld      [$inp+64*0],%l0
1479         ld      [$inp+64*1],%l1
1480         ld      [$inp+64*2],%l2
1481         ld      [$inp+64*3],%l3
1482         ld      [$inp+64*4],%l4
1483         ld      [$inp+64*5],%l5
1484         ld      [$inp+64*6],%l6
1485         ld      [$inp+64*7],%l7
1486         and     %l0,$mask,%l0
1487         and     %l1,$mask,%l1
1488         st      %l0,[$out]              ! Z
1489         and     %l2,$mask,%l2
1490         st      %l1,[$out+4]
1491         and     %l3,$mask,%l3
1492         st      %l2,[$out+8]
1493         and     %l4,$mask,%l4
1494         st      %l3,[$out+12]
1495         and     %l5,$mask,%l5
1496         st      %l4,[$out+16]
1497         and     %l6,$mask,%l6
1498         st      %l5,[$out+20]
1499         and     %l7,$mask,%l7
1500         st      %l6,[$out+24]
1501         st      %l7,[$out+28]
1502
1503         ret
1504         restore
1505 .size   ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1506
1507 ! void  ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1508 !                                         int %i2);
1509 .globl  ecp_nistz256_scatter_w7
1510 .align  32
1511 ecp_nistz256_scatter_w7:
1512         save    %sp,-STACK_FRAME,%sp
1513         nop
1514         add     $out,$index,$out
1515         mov     64/4,$index
1516 .Loop_scatter_w7:
1517         ld      [$inp],%l0
1518         add     $inp,4,$inp
1519         subcc   $index,1,$index
1520         stb     %l0,[$out+64*0-1]
1521         srl     %l0,8,%l1
1522         stb     %l1,[$out+64*1-1]
1523         srl     %l0,16,%l2
1524         stb     %l2,[$out+64*2-1]
1525         srl     %l0,24,%l3
1526         stb     %l3,[$out+64*3-1]
1527         bne     .Loop_scatter_w7
1528         add     $out,64*4,$out
1529
1530         ret
1531         restore
1532 .size   ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1533
1534 ! void  ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1535 !                                                     int %i2);
1536 .globl  ecp_nistz256_gather_w7
1537 .align  32
1538 ecp_nistz256_gather_w7:
1539         save    %sp,-STACK_FRAME,%sp
1540
1541         neg     $index,$mask
1542         srax    $mask,63,$mask
1543
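	! as in gather_w5: a zero $index gives a zero $mask, so the output is all zeros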
1544         add     $index,$mask,$index
1545         add     $inp,$index,$inp
1546         mov     64/4,$index
1547
1548 .Loop_gather_w7:
1549         ldub    [$inp+64*0],%l0
1550         prefetch [$inp+3840+64*0],1
1551         subcc   $index,1,$index
1552         ldub    [$inp+64*1],%l1
1553         prefetch [$inp+3840+64*1],1
1554         ldub    [$inp+64*2],%l2
1555         prefetch [$inp+3840+64*2],1
1556         ldub    [$inp+64*3],%l3
1557         prefetch [$inp+3840+64*3],1
1558         add     $inp,64*4,$inp
1559         sll     %l1,8,%l1
1560         sll     %l2,16,%l2
1561         or      %l0,%l1,%l0
1562         sll     %l3,24,%l3
1563         or      %l0,%l2,%l0
1564         or      %l0,%l3,%l0
1565         and     %l0,$mask,%l0
1566         st      %l0,[$out]
1567         bne     .Loop_gather_w7
1568         add     $out,4,$out
1569
1570         ret
1571         restore
1572 .size   ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1573 ___
1574 }}}
1575 {{{
1576 ########################################################################
1577 # Following subroutines are VIS3 counterparts of those above that
1578 # implement ones found in ecp_nistz256.c. Key difference is that they
1579 # use 128-bit multiplication and addition with 64-bit carry, and in order
1580 # to do that they perform conversion from uint32_t[8] to uint64_t[4] upon
1581 # entry and vice versa on return.
1582 #
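# For reference (illustrative only, not used by the build): the entry
# conversion amounts to a64[i] = (uint64_t)a32[2*i] | ((uint64_t)a32[2*i+1] << 32),
# and the reverse split is performed before results are stored.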
1583 my ($rp,$ap,$bp)=map("%i$_",(0..2));
1584 my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1585 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1586 my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1587 my ($rp_real,$ap_real)=("%g2","%g3");
1588 my ($acc6,$acc7)=($bp,$bi);     # used in squaring
1589
1590 $code.=<<___;
1591 .align  32
1592 __ecp_nistz256_mul_by_2_vis3:
1593         addcc   $acc0,$acc0,$acc0
1594         addxccc $acc1,$acc1,$acc1
1595         addxccc $acc2,$acc2,$acc2
1596         addxccc $acc3,$acc3,$acc3
1597         b       .Lreduce_by_sub_vis3
1598         addxc   %g0,%g0,$acc4           ! did it carry?
1599 .size   __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1600
1601 .align  32
1602 __ecp_nistz256_add_vis3:
1603         ldx     [$bp+0],$t0
1604         ldx     [$bp+8],$t1
1605         ldx     [$bp+16],$t2
1606         ldx     [$bp+24],$t3
1607
1608 __ecp_nistz256_add_noload_vis3:
1609
1610         addcc   $t0,$acc0,$acc0
1611         addxccc $t1,$acc1,$acc1
1612         addxccc $t2,$acc2,$acc2
1613         addxccc $t3,$acc3,$acc3
1614         addxc   %g0,%g0,$acc4           ! did it carry?
1615
1616 .Lreduce_by_sub_vis3:
1617
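	! -P in 64-bit limbs, least significant first, is
	! { 1, 0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE },
	! hence the constants $poly1, $minus1 and $poly3 below.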
1618         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1619         addxccc $acc1,$poly1,$t1
1620         addxccc $acc2,$minus1,$t2
1621         addxc   $acc3,$poly3,$t3
1622
1623         movrnz  $acc4,$t0,$acc0         ! if a+b carried, ret = ret-mod
1624         movrnz  $acc4,$t1,$acc1
1625         stx     $acc0,[$rp]
1626         movrnz  $acc4,$t2,$acc2
1627         stx     $acc1,[$rp+8]
1628         movrnz  $acc4,$t3,$acc3
1629         stx     $acc2,[$rp+16]
1630         retl
1631         stx     $acc3,[$rp+24]
1632 .size   __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1633
1634 ! Trouble with subtraction is that there is no subtraction with 64-bit
1635 ! borrow, only with a 32-bit one. For this reason we "decompose" 64-bit
1636 ! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1637 ! recall that SPARC is big-endian, which is why you'll observe that
1638 ! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we
1639 ! "collect" the result back to 64-bit $acc0-$acc3.
1640 .align  32
1641 __ecp_nistz256_sub_from_vis3:
1642         ld      [$bp+4],$t0
1643         ld      [$bp+0],$t1
1644         ld      [$bp+12],$t2
1645         ld      [$bp+8],$t3
1646
1647         srlx    $acc0,32,$acc4
1648         not     $poly1,$poly1
1649         srlx    $acc1,32,$acc5
1650         subcc   $acc0,$t0,$acc0
1651         ld      [$bp+20],$t0
1652         subccc  $acc4,$t1,$acc4
1653         ld      [$bp+16],$t1
1654         subccc  $acc1,$t2,$acc1
1655         ld      [$bp+28],$t2
1656         and     $acc0,$poly1,$acc0
1657         subccc  $acc5,$t3,$acc5
1658         ld      [$bp+24],$t3
1659         sllx    $acc4,32,$acc4
1660         and     $acc1,$poly1,$acc1
1661         sllx    $acc5,32,$acc5
1662         or      $acc0,$acc4,$acc0
1663         srlx    $acc2,32,$acc4
1664         or      $acc1,$acc5,$acc1
1665         srlx    $acc3,32,$acc5
1666         subccc  $acc2,$t0,$acc2
1667         subccc  $acc4,$t1,$acc4
1668         subccc  $acc3,$t2,$acc3
1669         and     $acc2,$poly1,$acc2
1670         subccc  $acc5,$t3,$acc5
1671         sllx    $acc4,32,$acc4
1672         and     $acc3,$poly1,$acc3
1673         sllx    $acc5,32,$acc5
1674         or      $acc2,$acc4,$acc2
1675         subc    %g0,%g0,$acc4           ! did it borrow?
1676         b       .Lreduce_by_add_vis3
1677         or      $acc3,$acc5,$acc3
1678 .size   __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1679
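! Same as __ecp_nistz256_sub_from_vis3, but with the operand order reversed:
! it computes b[4]-acc[4] instead of acc[4]-b[4] ("morf" is "from" backwards).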
1680 .align  32
1681 __ecp_nistz256_sub_morf_vis3:
1682         ld      [$bp+4],$t0
1683         ld      [$bp+0],$t1
1684         ld      [$bp+12],$t2
1685         ld      [$bp+8],$t3
1686
1687         srlx    $acc0,32,$acc4
1688         not     $poly1,$poly1
1689         srlx    $acc1,32,$acc5
1690         subcc   $t0,$acc0,$acc0
1691         ld      [$bp+20],$t0
1692         subccc  $t1,$acc4,$acc4
1693         ld      [$bp+16],$t1
1694         subccc  $t2,$acc1,$acc1
1695         ld      [$bp+28],$t2
1696         and     $acc0,$poly1,$acc0
1697         subccc  $t3,$acc5,$acc5
1698         ld      [$bp+24],$t3
1699         sllx    $acc4,32,$acc4
1700         and     $acc1,$poly1,$acc1
1701         sllx    $acc5,32,$acc5
1702         or      $acc0,$acc4,$acc0
1703         srlx    $acc2,32,$acc4
1704         or      $acc1,$acc5,$acc1
1705         srlx    $acc3,32,$acc5
1706         subccc  $t0,$acc2,$acc2
1707         subccc  $t1,$acc4,$acc4
1708         subccc  $t2,$acc3,$acc3
1709         and     $acc2,$poly1,$acc2
1710         subccc  $t3,$acc5,$acc5
1711         sllx    $acc4,32,$acc4
1712         and     $acc3,$poly1,$acc3
1713         sllx    $acc5,32,$acc5
1714         or      $acc2,$acc4,$acc2
1715         subc    %g0,%g0,$acc4           ! did it borrow?
1716         or      $acc3,$acc5,$acc3
1717
1718 .Lreduce_by_add_vis3:
1719
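	! P in 64-bit limbs, least significant first, is
	! { 0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFF, 0, 0xFFFFFFFF00000001 };
	! note that $poly1 still holds its bit-wise inverse here [courtesy of
	! the subtraction code above] and is flipped back below.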
1720         addcc   $acc0,-1,$t0            ! add modulus
1721         not     $poly3,$t3
1722         addxccc $acc1,$poly1,$t1
1723         not     $poly1,$poly1           ! restore $poly1
1724         addxccc $acc2,%g0,$t2
1725         addxc   $acc3,$t3,$t3
1726
1727         movrnz  $acc4,$t0,$acc0         ! if a-b borrowed, ret = ret+mod
1728         movrnz  $acc4,$t1,$acc1
1729         stx     $acc0,[$rp]
1730         movrnz  $acc4,$t2,$acc2
1731         stx     $acc1,[$rp+8]
1732         movrnz  $acc4,$t3,$acc3
1733         stx     $acc2,[$rp+16]
1734         retl
1735         stx     $acc3,[$rp+24]
1736 .size   __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1737
1738 .align  32
1739 __ecp_nistz256_div_by_2_vis3:
1740         ! ret = (a is odd ? a+mod : a) >> 1
1741
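	! a+mod can be 257 bits wide, hence the extra carry word $acc4, which
	! re-enters as bit 63 of the top limb after the shift below.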
1742         not     $poly1,$t1
1743         not     $poly3,$t3
1744         and     $acc0,1,$acc5
1745         addcc   $acc0,-1,$t0            ! add modulus
1746         addxccc $acc1,$t1,$t1
1747         addxccc $acc2,%g0,$t2
1748         addxccc $acc3,$t3,$t3
1749         addxc   %g0,%g0,$acc4           ! carry bit
1750
1751         movrnz  $acc5,$t0,$acc0
1752         movrnz  $acc5,$t1,$acc1
1753         movrnz  $acc5,$t2,$acc2
1754         movrnz  $acc5,$t3,$acc3
1755         movrz   $acc5,%g0,$acc4
1756
1757         ! ret >>= 1
1758
1759         srlx    $acc0,1,$acc0
1760         sllx    $acc1,63,$t0
1761         srlx    $acc1,1,$acc1
1762         or      $acc0,$t0,$acc0
1763         sllx    $acc2,63,$t1
1764         srlx    $acc2,1,$acc2
1765         or      $acc1,$t1,$acc1
1766         sllx    $acc3,63,$t2
1767         stx     $acc0,[$rp]
1768         srlx    $acc3,1,$acc3
1769         or      $acc2,$t2,$acc2
1770         sllx    $acc4,63,$t3            ! don't forget carry bit
1771         stx     $acc1,[$rp+8]
1772         or      $acc3,$t3,$acc3
1773         stx     $acc2,[$rp+16]
1774         retl
1775         stx     $acc3,[$rp+24]
1776 .size   __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1777
1778 ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1779 ! 4x faster [on T4]...
1780 .align  32
1781 __ecp_nistz256_mul_mont_vis3:
1782         mulx    $a0,$bi,$acc0
1783         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1784         umulxhi $a0,$bi,$t0
1785         mulx    $a1,$bi,$acc1
1786         umulxhi $a1,$bi,$t1
1787         mulx    $a2,$bi,$acc2
1788         umulxhi $a2,$bi,$t2
1789         mulx    $a3,$bi,$acc3
1790         umulxhi $a3,$bi,$t3
1791         ldx     [$bp+8],$bi             ! b[1]
1792
1793         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication
1794          sllx   $acc0,32,$t0
1795         addxccc $acc2,$t1,$acc2
1796          srlx   $acc0,32,$t1
1797         addxccc $acc3,$t2,$acc3
1798         addxc   %g0,$t3,$acc4
1799         mov     0,$acc5
1800 ___
1801 for($i=1;$i<4;$i++) {
1802         # Reduction iteration is normally performed by accumulating
1803         # result of multiplication of modulus by "magic" digit [and
1804         # omitting least significant word, which is guaranteed to
1805         # be 0], but thanks to special form of modulus and "magic"
1806         # digit being equal to least significant word, it can be
1807         # performed with additions and subtractions alone. Indeed:
1808         #
1809         #            ffff0001.00000000.0000ffff.ffffffff
1810         # *                                     abcdefgh
1811         # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1812         #
1813         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1814         # rewrite above as:
1815         #
1816         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1817         # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1818         # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1819         #
1820         # or marking redundant operations:
1821         #
1822         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1823         # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1824         # - 0000abcd.efgh0000.--------.--------.--------
1825         #   ^^^^^^^^ but this word is calculated with umulxhi, because
1826         #            there is no subtract with 64-bit borrow:-(
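	# [The "magic" digit equals the least significant word because
	# P = -1 mod 2^64, so -P^-1 = 1 mod 2^64.]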
1827
1828 $code.=<<___;
1829         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1830         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1831         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1832         mulx    $a0,$bi,$t0
1833         addxccc $acc2,$t1,$acc1
1834         mulx    $a1,$bi,$t1
1835         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1836         mulx    $a2,$bi,$t2
1837         addxccc $acc4,$t3,$acc3
1838         mulx    $a3,$bi,$t3
1839         addxc   $acc5,%g0,$acc4
1840
1841         addcc   $acc0,$t0,$acc0         ! accumulate low parts of multiplication
1842         umulxhi $a0,$bi,$t0
1843         addxccc $acc1,$t1,$acc1
1844         umulxhi $a1,$bi,$t1
1845         addxccc $acc2,$t2,$acc2
1846         umulxhi $a2,$bi,$t2
1847         addxccc $acc3,$t3,$acc3
1848         umulxhi $a3,$bi,$t3
1849         addxc   $acc4,%g0,$acc4
1850 ___
1851 $code.=<<___    if ($i<3);
1852         ldx     [$bp+8*($i+1)],$bi      ! bp[$i+1]
1853 ___
1854 $code.=<<___;
1855         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication 
1856          sllx   $acc0,32,$t0
1857         addxccc $acc2,$t1,$acc2
1858          srlx   $acc0,32,$t1
1859         addxccc $acc3,$t2,$acc3
1860         addxccc $acc4,$t3,$acc4
1861         addxc   %g0,%g0,$acc5
1862 ___
1863 }
1864 $code.=<<___;
1865         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1866         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1867         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1868         addxccc $acc2,$t1,$acc1
1869         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1870         addxccc $acc4,$t3,$acc3
1871         b       .Lmul_final_vis3        ! see below
1872         addxc   $acc5,%g0,$acc4
1873 .size   __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1874
1875 ! compared to __ecp_nistz256_mul_mont_vis3 above it's 21% fewer
1876 ! instructions, but only 14% faster [on T4]...
1877 .align  32
1878 __ecp_nistz256_sqr_mont_vis3:
1879         !  |  |  |  |  |  |a1*a0|  |
1880         !  |  |  |  |  |a2*a0|  |  |
1881         !  |  |a3*a2|a3*a0|  |  |  |
1882         !  |  |  |  |a2*a1|  |  |  |
1883         !  |  |  |a3*a1|  |  |  |  |
1884         ! *|  |  |  |  |  |  |  | 2|
1885         ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1886         !  |--+--+--+--+--+--+--+--|
1887         !  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is $accx, i.e. follow $accx
1888         !
1889         !  "can't overflow" remarks below mark carrying into the high part
1890         !  of a multiplication result, which can't overflow, because that
1891         !  part can never be all ones.
1892
1893         mulx    $a1,$a0,$acc1           ! a[1]*a[0]
1894         umulxhi $a1,$a0,$t1
1895         mulx    $a2,$a0,$acc2           ! a[2]*a[0]
1896         umulxhi $a2,$a0,$t2
1897         mulx    $a3,$a0,$acc3           ! a[3]*a[0]
1898         umulxhi $a3,$a0,$acc4
1899
1900         addcc   $acc2,$t1,$acc2         ! accumulate high parts of multiplication
1901         mulx    $a2,$a1,$t0             ! a[2]*a[1]
1902         umulxhi $a2,$a1,$t1
1903         addxccc $acc3,$t2,$acc3
1904         mulx    $a3,$a1,$t2             ! a[3]*a[1]
1905         umulxhi $a3,$a1,$t3
1906         addxc   $acc4,%g0,$acc4         ! can't overflow
1907
1908         mulx    $a3,$a2,$acc5           ! a[3]*a[2]
1909         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1910         umulxhi $a3,$a2,$acc6
1911
1912         addcc   $t2,$t1,$t1             ! accumulate high parts of multiplication
1913         mulx    $a0,$a0,$acc0           ! a[0]*a[0]
1914         addxc   $t3,%g0,$t2             ! can't overflow
1915
1916         addcc   $acc3,$t0,$acc3         ! accumulate low parts of multiplication
1917         umulxhi $a0,$a0,$a0
1918         addxccc $acc4,$t1,$acc4
1919         mulx    $a1,$a1,$t1             ! a[1]*a[1]
1920         addxccc $acc5,$t2,$acc5
1921         umulxhi $a1,$a1,$a1
1922         addxc   $acc6,%g0,$acc6         ! can't overflow
1923
1924         addcc   $acc1,$acc1,$acc1       ! acc[1-6]*=2
1925         mulx    $a2,$a2,$t2             ! a[2]*a[2]
1926         addxccc $acc2,$acc2,$acc2
1927         umulxhi $a2,$a2,$a2
1928         addxccc $acc3,$acc3,$acc3
1929         mulx    $a3,$a3,$t3             ! a[3]*a[3]
1930         addxccc $acc4,$acc4,$acc4
1931         umulxhi $a3,$a3,$a3
1932         addxccc $acc5,$acc5,$acc5
1933         addxccc $acc6,$acc6,$acc6
1934         addxc   %g0,%g0,$acc7
1935
1936         addcc   $acc1,$a0,$acc1         ! +a[i]*a[i]
1937         addxccc $acc2,$t1,$acc2
1938         addxccc $acc3,$a1,$acc3
1939         addxccc $acc4,$t2,$acc4
1940          sllx   $acc0,32,$t0
1941         addxccc $acc5,$a2,$acc5
1942          srlx   $acc0,32,$t1
1943         addxccc $acc6,$t3,$acc6
1944          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1945         addxc   $acc7,$a3,$acc7
1946 ___
1947 for($i=0;$i<3;$i++) {                   # reductions, see commentary
1948                                         # in multiplication for details
1949 $code.=<<___;
1950         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1951         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1952          sllx   $acc0,32,$t0
1953         addxccc $acc2,$t1,$acc1
1954          srlx   $acc0,32,$t1
1955         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1956          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1957         addxc   %g0,$t3,$acc3           ! can't overflow
1958 ___
1959 }
1960 $code.=<<___;
1961         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1962         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1963         addxccc $acc2,$t1,$acc1
1964         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1965         addxc   %g0,$t3,$acc3           ! can't overflow
1966
1967         addcc   $acc0,$acc4,$acc0       ! accumulate upper half
1968         addxccc $acc1,$acc5,$acc1
1969         addxccc $acc2,$acc6,$acc2
1970         addxccc $acc3,$acc7,$acc3
1971         addxc   %g0,%g0,$acc4
1972
1973 .Lmul_final_vis3:
1974
1975         ! Final step is "if result > mod, subtract mod", but as comparison
1976         ! means subtraction, we do the subtraction and then copy outcome
1977         ! if it didn't borrow. But note that as we [have to] replace
1978         ! subtraction with addition with negative, carry/borrow logic is
1979         ! inverse.
1980
1981         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1982         not     $poly3,$poly3           ! restore 0x00000000FFFFFFFE
1983         addxccc $acc1,$poly1,$t1
1984         addxccc $acc2,$minus1,$t2
1985         addxccc $acc3,$poly3,$t3
1986         addxccc $acc4,$minus1,%g0       ! did it carry?
1987
1988         movcs   %xcc,$t0,$acc0
1989         movcs   %xcc,$t1,$acc1
1990         stx     $acc0,[$rp]
1991         movcs   %xcc,$t2,$acc2
1992         stx     $acc1,[$rp+8]
1993         movcs   %xcc,$t3,$acc3
1994         stx     $acc2,[$rp+16]
1995         retl
1996         stx     $acc3,[$rp+24]
1997 .size   __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
1998 ___
1999
2000 ########################################################################
2001 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2002 #
2003 {
2004 my ($res_x,$res_y,$res_z,
2005     $in_x,$in_y,$in_z,
2006     $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2007 # above map() describes stack layout with 10 temporary
2008 # 256-bit vectors on top.
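# For orientation, the call sequence below evaluates the usual Jacobian
# doubling formulas:
#       S = 4*X*Y^2, M = 3*(X+Z^2)*(X-Z^2),
#       X3 = M^2-2*S, Y3 = M*(S-X3)-8*Y^4, Z3 = 2*Y*Z.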
2009
2010 $code.=<<___;
2011 .align  32
2012 ecp_nistz256_point_double_vis3:
2013         save    %sp,-STACK64_FRAME-32*10,%sp
2014
2015         mov     $rp,$rp_real
2016         mov     -1,$minus1
2017         mov     -2,$poly3
2018         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2019         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2020
2021         ! convert input to uint64_t[4]
2022         ld      [$ap],$a0                       ! in_x
2023         ld      [$ap+4],$t0
2024         ld      [$ap+8],$a1
2025         ld      [$ap+12],$t1
2026         ld      [$ap+16],$a2
2027         ld      [$ap+20],$t2
2028         ld      [$ap+24],$a3
2029         ld      [$ap+28],$t3
2030         sllx    $t0,32,$t0
2031         sllx    $t1,32,$t1
2032         ld      [$ap+32],$acc0                  ! in_y
2033         or      $a0,$t0,$a0
2034         ld      [$ap+32+4],$t0
2035         sllx    $t2,32,$t2
2036         ld      [$ap+32+8],$acc1
2037         or      $a1,$t1,$a1
2038         ld      [$ap+32+12],$t1
2039         sllx    $t3,32,$t3
2040         ld      [$ap+32+16],$acc2
2041         or      $a2,$t2,$a2
2042         ld      [$ap+32+20],$t2
2043         or      $a3,$t3,$a3
2044         ld      [$ap+32+24],$acc3
2045         sllx    $t0,32,$t0
2046         ld      [$ap+32+28],$t3
2047         sllx    $t1,32,$t1
2048         stx     $a0,[%sp+LOCALS64+$in_x]
2049         sllx    $t2,32,$t2
2050         stx     $a1,[%sp+LOCALS64+$in_x+8]
2051         sllx    $t3,32,$t3
2052         stx     $a2,[%sp+LOCALS64+$in_x+16]
2053         or      $acc0,$t0,$acc0
2054         stx     $a3,[%sp+LOCALS64+$in_x+24]
2055         or      $acc1,$t1,$acc1
2056         stx     $acc0,[%sp+LOCALS64+$in_y]
2057         or      $acc2,$t2,$acc2
2058         stx     $acc1,[%sp+LOCALS64+$in_y+8]
2059         or      $acc3,$t3,$acc3
2060         stx     $acc2,[%sp+LOCALS64+$in_y+16]
2061         stx     $acc3,[%sp+LOCALS64+$in_y+24]
2062
2063         ld      [$ap+64],$a0                    ! in_z
2064         ld      [$ap+64+4],$t0
2065         ld      [$ap+64+8],$a1
2066         ld      [$ap+64+12],$t1
2067         ld      [$ap+64+16],$a2
2068         ld      [$ap+64+20],$t2
2069         ld      [$ap+64+24],$a3
2070         ld      [$ap+64+28],$t3
2071         sllx    $t0,32,$t0
2072         sllx    $t1,32,$t1
2073         or      $a0,$t0,$a0
2074         sllx    $t2,32,$t2
2075         or      $a1,$t1,$a1
2076         sllx    $t3,32,$t3
2077         or      $a2,$t2,$a2
2078         or      $a3,$t3,$a3
2079         sllx    $t0,32,$t0
2080         sllx    $t1,32,$t1
2081         stx     $a0,[%sp+LOCALS64+$in_z]
2082         sllx    $t2,32,$t2
2083         stx     $a1,[%sp+LOCALS64+$in_z+8]
2084         sllx    $t3,32,$t3
2085         stx     $a2,[%sp+LOCALS64+$in_z+16]
2086         stx     $a3,[%sp+LOCALS64+$in_z+24]
2087
2088         ! in_y is still in $acc0-$acc3
2089         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(S, in_y);
2090         add     %sp,LOCALS64+$S,$rp
2091
2092         ! in_z is still in $a0-$a3
2093         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Zsqr, in_z);
2094         add     %sp,LOCALS64+$Zsqr,$rp
2095
2096         mov     $acc0,$a0                       ! put Zsqr aside
2097         mov     $acc1,$a1
2098         mov     $acc2,$a2
2099         mov     $acc3,$a3
2100
2101         add     %sp,LOCALS64+$in_x,$bp
2102         call    __ecp_nistz256_add_vis3         ! p256_add(M, Zsqr, in_x);
2103         add     %sp,LOCALS64+$M,$rp
2104
2105         mov     $a0,$acc0                       ! restore Zsqr
2106         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2107         mov     $a1,$acc1
2108         ldx     [%sp+LOCALS64+$S+8],$a1
2109         mov     $a2,$acc2
2110         ldx     [%sp+LOCALS64+$S+16],$a2
2111         mov     $a3,$acc3
2112         ldx     [%sp+LOCALS64+$S+24],$a3
2113
2114         add     %sp,LOCALS64+$in_x,$bp
2115         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(Zsqr, in_x, Zsqr);
2116         add     %sp,LOCALS64+$Zsqr,$rp
2117
2118         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(S, S);
2119         add     %sp,LOCALS64+$S,$rp
2120
2121         ldx     [%sp+LOCALS64+$in_z],$bi
2122         ldx     [%sp+LOCALS64+$in_y],$a0
2123         ldx     [%sp+LOCALS64+$in_y+8],$a1
2124         ldx     [%sp+LOCALS64+$in_y+16],$a2
2125         ldx     [%sp+LOCALS64+$in_y+24],$a3
2126         add     %sp,LOCALS64+$in_z,$bp
2127         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(tmp0, in_z, in_y);
2128         add     %sp,LOCALS64+$tmp0,$rp
2129
2130         ldx     [%sp+LOCALS64+$M],$bi           ! forward load
2131         ldx     [%sp+LOCALS64+$Zsqr],$a0
2132         ldx     [%sp+LOCALS64+$Zsqr+8],$a1
2133         ldx     [%sp+LOCALS64+$Zsqr+16],$a2
2134         ldx     [%sp+LOCALS64+$Zsqr+24],$a3
2135
2136         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(res_z, tmp0);
2137         add     %sp,LOCALS64+$res_z,$rp
2138
2139         add     %sp,LOCALS64+$M,$bp
2140         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(M, M, Zsqr);
2141         add     %sp,LOCALS64+$M,$rp
2142
2143         mov     $acc0,$a0                       ! put aside M
2144         mov     $acc1,$a1
2145         mov     $acc2,$a2
2146         mov     $acc3,$a3
2147         call    __ecp_nistz256_mul_by_2_vis3
2148         add     %sp,LOCALS64+$M,$rp
2149         mov     $a0,$t0                         ! copy M
2150         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2151         mov     $a1,$t1
2152         ldx     [%sp+LOCALS64+$S+8],$a1
2153         mov     $a2,$t2
2154         ldx     [%sp+LOCALS64+$S+16],$a2
2155         mov     $a3,$t3
2156         ldx     [%sp+LOCALS64+$S+24],$a3
2157         call    __ecp_nistz256_add_noload_vis3  ! p256_mul_by_3(M, M);
2158         add     %sp,LOCALS64+$M,$rp
2159
2160         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(tmp0, S);
2161         add     %sp,LOCALS64+$tmp0,$rp
2162
2163         ldx     [%sp+LOCALS64+$S],$bi           ! forward load
2164         ldx     [%sp+LOCALS64+$in_x],$a0
2165         ldx     [%sp+LOCALS64+$in_x+8],$a1
2166         ldx     [%sp+LOCALS64+$in_x+16],$a2
2167         ldx     [%sp+LOCALS64+$in_x+24],$a3
2168
2169         call    __ecp_nistz256_div_by_2_vis3    ! p256_div_by_2(res_y, tmp0);
2170         add     %sp,LOCALS64+$res_y,$rp
2171
2172         add     %sp,LOCALS64+$S,$bp
2173         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, in_x);
2174         add     %sp,LOCALS64+$S,$rp
2175
2176         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2177         ldx     [%sp+LOCALS64+$M+8],$a1
2178         ldx     [%sp+LOCALS64+$M+16],$a2
2179         ldx     [%sp+LOCALS64+$M+24],$a3
2180
2181         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(tmp0, S);
2182         add     %sp,LOCALS64+$tmp0,$rp
2183
2184         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(res_x, M);
2185         add     %sp,LOCALS64+$res_x,$rp
2186
2187         add     %sp,LOCALS64+$tmp0,$bp
2188         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_x, res_x, tmp0);
2189         add     %sp,LOCALS64+$res_x,$rp
2190
2191         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2192         ldx     [%sp+LOCALS64+$M+8],$a1
2193         ldx     [%sp+LOCALS64+$M+16],$a2
2194         ldx     [%sp+LOCALS64+$M+24],$a3
2195
2196         add     %sp,LOCALS64+$S,$bp
2197         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(S, S, res_x);
2198         add     %sp,LOCALS64+$S,$rp
2199
2200         mov     $acc0,$bi
2201         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, M);
2202         add     %sp,LOCALS64+$S,$rp
2203
2204         ldx     [%sp+LOCALS64+$res_x],$a0       ! forward load
2205         ldx     [%sp+LOCALS64+$res_x+8],$a1
2206         ldx     [%sp+LOCALS64+$res_x+16],$a2
2207         ldx     [%sp+LOCALS64+$res_x+24],$a3
2208
2209         add     %sp,LOCALS64+$res_y,$bp
2210         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, S, res_y);
2211         add     %sp,LOCALS64+$res_y,$rp
2212
2213         ! convert output to uint32_t[8]
2214         srlx    $a0,32,$t0
2215         srlx    $a1,32,$t1
2216         st      $a0,[$rp_real]                  ! res_x
2217         srlx    $a2,32,$t2
2218         st      $t0,[$rp_real+4]
2219         srlx    $a3,32,$t3
2220         st      $a1,[$rp_real+8]
2221         st      $t1,[$rp_real+12]
2222         st      $a2,[$rp_real+16]
2223         st      $t2,[$rp_real+20]
2224         st      $a3,[$rp_real+24]
2225         st      $t3,[$rp_real+28]
2226
2227         ldx     [%sp+LOCALS64+$res_z],$a0       ! forward load
2228         srlx    $acc0,32,$t0
2229         ldx     [%sp+LOCALS64+$res_z+8],$a1
2230         srlx    $acc1,32,$t1
2231         ldx     [%sp+LOCALS64+$res_z+16],$a2
2232         srlx    $acc2,32,$t2
2233         ldx     [%sp+LOCALS64+$res_z+24],$a3
2234         srlx    $acc3,32,$t3
2235         st      $acc0,[$rp_real+32]             ! res_y
2236         st      $t0,  [$rp_real+32+4]
2237         st      $acc1,[$rp_real+32+8]
2238         st      $t1,  [$rp_real+32+12]
2239         st      $acc2,[$rp_real+32+16]
2240         st      $t2,  [$rp_real+32+20]
2241         st      $acc3,[$rp_real+32+24]
2242         st      $t3,  [$rp_real+32+28]
2243
2244         srlx    $a0,32,$t0
2245         srlx    $a1,32,$t1
2246         st      $a0,[$rp_real+64]               ! res_z
2247         srlx    $a2,32,$t2
2248         st      $t0,[$rp_real+64+4]
2249         srlx    $a3,32,$t3
2250         st      $a1,[$rp_real+64+8]
2251         st      $t1,[$rp_real+64+12]
2252         st      $a2,[$rp_real+64+16]
2253         st      $t2,[$rp_real+64+20]
2254         st      $a3,[$rp_real+64+24]
2255         st      $t3,[$rp_real+64+28]
2256
2257         ret
2258         restore
2259 .size   ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2260 ___
2261 }
2262 ########################################################################
2263 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2264 #                             const P256_POINT *in2);
2265 {
2266 my ($res_x,$res_y,$res_z,
2267     $in1_x,$in1_y,$in1_z,
2268     $in2_x,$in2_y,$in2_z,
2269     $H,$Hsqr,$R,$Rsqr,$Hcub,
2270     $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2271 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2272
2273 # above map() describes stack layout with 18 temporary
2274 # 256-bit vectors on top. Then we reserve some space for
2275 # !in1infty, !in2infty and result of check for zero.
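# For orientation, the call sequence below evaluates the usual Jacobian
# addition formulas:
#       U1 = X1*Z2^2, S1 = Y1*Z2^3, U2 = X2*Z1^2, S2 = Y2*Z1^3,
#       H = U2-U1, R = S2-S1, X3 = R^2-H^3-2*U1*H^2,
#       Y3 = R*(U1*H^2-X3)-S1*H^3, Z3 = Z1*Z2*H.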
2276
2277 $code.=<<___;
2278 .globl  ecp_nistz256_point_add_vis3
2279 .align  32
2280 ecp_nistz256_point_add_vis3:
2281         save    %sp,-STACK64_FRAME-32*18-32,%sp
2282
2283         mov     $rp,$rp_real
2284         mov     -1,$minus1
2285         mov     -2,$poly3
2286         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2287         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2288
2289         ! convert input to uint64_t[4]
2290         ld      [$bp],$a0                       ! in2_x
2291         ld      [$bp+4],$t0
2292         ld      [$bp+8],$a1
2293         ld      [$bp+12],$t1
2294         ld      [$bp+16],$a2
2295         ld      [$bp+20],$t2
2296         ld      [$bp+24],$a3
2297         ld      [$bp+28],$t3
2298         sllx    $t0,32,$t0
2299         sllx    $t1,32,$t1
2300         ld      [$bp+32],$acc0                  ! in2_y
2301         or      $a0,$t0,$a0
2302         ld      [$bp+32+4],$t0
2303         sllx    $t2,32,$t2
2304         ld      [$bp+32+8],$acc1
2305         or      $a1,$t1,$a1
2306         ld      [$bp+32+12],$t1
2307         sllx    $t3,32,$t3
2308         ld      [$bp+32+16],$acc2
2309         or      $a2,$t2,$a2
2310         ld      [$bp+32+20],$t2
2311         or      $a3,$t3,$a3
2312         ld      [$bp+32+24],$acc3
2313         sllx    $t0,32,$t0
2314         ld      [$bp+32+28],$t3
2315         sllx    $t1,32,$t1
2316         stx     $a0,[%sp+LOCALS64+$in2_x]
2317         sllx    $t2,32,$t2
2318         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2319         sllx    $t3,32,$t3
2320         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2321         or      $acc0,$t0,$acc0
2322         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2323         or      $acc1,$t1,$acc1
2324         stx     $acc0,[%sp+LOCALS64+$in2_y]
2325         or      $acc2,$t2,$acc2
2326         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2327         or      $acc3,$t3,$acc3
2328         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2329         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2330
2331         or      $a1,$a0,$a0
2332         or      $a3,$a2,$a2
2333         or      $acc1,$acc0,$acc0
2334         or      $acc3,$acc2,$acc2
2335         or      $a2,$a0,$a0
2336         or      $acc2,$acc0,$acc0
2337         or      $acc0,$a0,$a0
2338         movrnz  $a0,-1,$a0                      ! !in2infty
2339         stx     $a0,[%fp+STACK_BIAS-8]
2340
2341         ld      [$bp+64],$acc0                  ! in2_z
2342         ld      [$bp+64+4],$t0
2343         ld      [$bp+64+8],$acc1
2344         ld      [$bp+64+12],$t1
2345         ld      [$bp+64+16],$acc2
2346         ld      [$bp+64+20],$t2
2347         ld      [$bp+64+24],$acc3
2348         ld      [$bp+64+28],$t3
2349         sllx    $t0,32,$t0
2350         sllx    $t1,32,$t1
2351         ld      [$ap],$a0                       ! in1_x
2352         or      $acc0,$t0,$acc0
2353         ld      [$ap+4],$t0
2354         sllx    $t2,32,$t2
2355         ld      [$ap+8],$a1
2356         or      $acc1,$t1,$acc1
2357         ld      [$ap+12],$t1
2358         sllx    $t3,32,$t3
2359         ld      [$ap+16],$a2
2360         or      $acc2,$t2,$acc2
2361         ld      [$ap+20],$t2
2362         or      $acc3,$t3,$acc3
2363         ld      [$ap+24],$a3
2364         sllx    $t0,32,$t0
2365         ld      [$ap+28],$t3
2366         sllx    $t1,32,$t1
2367         stx     $acc0,[%sp+LOCALS64+$in2_z]
2368         sllx    $t2,32,$t2
2369         stx     $acc1,[%sp+LOCALS64+$in2_z+8]
2370         sllx    $t3,32,$t3
2371         stx     $acc2,[%sp+LOCALS64+$in2_z+16]
2372         stx     $acc3,[%sp+LOCALS64+$in2_z+24]
2373
2374         or      $a0,$t0,$a0
2375         ld      [$ap+32],$acc0                  ! in1_y
2376         or      $a1,$t1,$a1
2377         ld      [$ap+32+4],$t0
2378         or      $a2,$t2,$a2
2379         ld      [$ap+32+8],$acc1
2380         or      $a3,$t3,$a3
2381         ld      [$ap+32+12],$t1
2382         ld      [$ap+32+16],$acc2
2383         ld      [$ap+32+20],$t2
2384         ld      [$ap+32+24],$acc3
2385         sllx    $t0,32,$t0
2386         ld      [$ap+32+28],$t3
2387         sllx    $t1,32,$t1
2388         stx     $a0,[%sp+LOCALS64+$in1_x]
2389         sllx    $t2,32,$t2
2390         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2391         sllx    $t3,32,$t3
2392         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2393         or      $acc0,$t0,$acc0
2394         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2395         or      $acc1,$t1,$acc1
2396         stx     $acc0,[%sp+LOCALS64+$in1_y]
2397         or      $acc2,$t2,$acc2
2398         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2399         or      $acc3,$t3,$acc3
2400         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2401         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2402
2403         or      $a1,$a0,$a0
2404         or      $a3,$a2,$a2
2405         or      $acc1,$acc0,$acc0
2406         or      $acc3,$acc2,$acc2
2407         or      $a2,$a0,$a0
2408         or      $acc2,$acc0,$acc0
2409         or      $acc0,$a0,$a0
2410         movrnz  $a0,-1,$a0                      ! !in1infty
2411         stx     $a0,[%fp+STACK_BIAS-16]
2412
2413         ldx     [%sp+LOCALS64+$in2_z],$a0       ! forward load
2414         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2415         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2416         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2417
2418         ld      [$ap+64],$acc0                  ! in1_z
2419         ld      [$ap+64+4],$t0
2420         ld      [$ap+64+8],$acc1
2421         ld      [$ap+64+12],$t1
2422         ld      [$ap+64+16],$acc2
2423         ld      [$ap+64+20],$t2
2424         ld      [$ap+64+24],$acc3
2425         ld      [$ap+64+28],$t3
2426         sllx    $t0,32,$t0
2427         sllx    $t1,32,$t1
2428         or      $acc0,$t0,$acc0
2429         sllx    $t2,32,$t2
2430         or      $acc1,$t1,$acc1
2431         sllx    $t3,32,$t3
2432         stx     $acc0,[%sp+LOCALS64+$in1_z]
2433         or      $acc2,$t2,$acc2
2434         stx     $acc1,[%sp+LOCALS64+$in1_z+8]
2435         or      $acc3,$t3,$acc3
2436         stx     $acc2,[%sp+LOCALS64+$in1_z+16]
2437         stx     $acc3,[%sp+LOCALS64+$in1_z+24]
2438
2439         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z2sqr, in2_z);
2440         add     %sp,LOCALS64+$Z2sqr,$rp
2441
2442         ldx     [%sp+LOCALS64+$in1_z],$a0
2443         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2444         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2445         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2446         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2447         add     %sp,LOCALS64+$Z1sqr,$rp
2448
2449         ldx     [%sp+LOCALS64+$Z2sqr],$bi
2450         ldx     [%sp+LOCALS64+$in2_z],$a0
2451         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2452         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2453         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2454         add     %sp,LOCALS64+$Z2sqr,$bp
2455         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, Z2sqr, in2_z);
2456         add     %sp,LOCALS64+$S1,$rp
2457
2458         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2459         ldx     [%sp+LOCALS64+$in1_z],$a0
2460         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2461         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2462         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2463         add     %sp,LOCALS64+$Z1sqr,$bp
2464         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2465         add     %sp,LOCALS64+$S2,$rp
2466
2467         ldx     [%sp+LOCALS64+$S1],$bi
2468         ldx     [%sp+LOCALS64+$in1_y],$a0
2469         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2470         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2471         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2472         add     %sp,LOCALS64+$S1,$bp
2473         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, S1, in1_y);
2474         add     %sp,LOCALS64+$S1,$rp
2475
2476         ldx     [%sp+LOCALS64+$S2],$bi
2477         ldx     [%sp+LOCALS64+$in2_y],$a0
2478         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2479         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2480         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2481         add     %sp,LOCALS64+$S2,$bp
2482         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2483         add     %sp,LOCALS64+$S2,$rp
2484
2485         ldx     [%sp+LOCALS64+$Z2sqr],$bi       ! forward load
2486         ldx     [%sp+LOCALS64+$in1_x],$a0
2487         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2488         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2489         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2490
2491         add     %sp,LOCALS64+$S1,$bp
2492         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, S1);
2493         add     %sp,LOCALS64+$R,$rp
2494
2495         or      $acc1,$acc0,$acc0               ! see if result is zero
2496         or      $acc3,$acc2,$acc2
2497         or      $acc2,$acc0,$acc0
2498         stx     $acc0,[%fp+STACK_BIAS-24]
2499
2500         add     %sp,LOCALS64+$Z2sqr,$bp
2501         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U1, in1_x, Z2sqr);
2502         add     %sp,LOCALS64+$U1,$rp
2503
2504         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2505         ldx     [%sp+LOCALS64+$in2_x],$a0
2506         ldx     [%sp+LOCALS64+$in2_x+8],$a1
2507         ldx     [%sp+LOCALS64+$in2_x+16],$a2
2508         ldx     [%sp+LOCALS64+$in2_x+24],$a3
2509         add     %sp,LOCALS64+$Z1sqr,$bp
2510         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in2_x, Z1sqr);
2511         add     %sp,LOCALS64+$U2,$rp
2512
2513         ldx     [%sp+LOCALS64+$R],$a0           ! forward load
2514         ldx     [%sp+LOCALS64+$R+8],$a1
2515         ldx     [%sp+LOCALS64+$R+16],$a2
2516         ldx     [%sp+LOCALS64+$R+24],$a3
2517
2518         add     %sp,LOCALS64+$U1,$bp
2519         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, U1);
2520         add     %sp,LOCALS64+$H,$rp
2521
2522         or      $acc1,$acc0,$acc0               ! see if result is zero
2523         or      $acc3,$acc2,$acc2
2524         orcc    $acc2,$acc0,$acc0
2525
2526         bne,pt  %xcc,.Ladd_proceed_vis3         ! is_equal(U1,U2)?
2527         nop
2528
2529         ldx     [%fp+STACK_BIAS-8],$t0
2530         ldx     [%fp+STACK_BIAS-16],$t1
2531         ldx     [%fp+STACK_BIAS-24],$t2
2532         andcc   $t0,$t1,%g0
2533         be,pt   %xcc,.Ladd_proceed_vis3         ! (in1infty || in2infty)?
2534         nop
2535         andcc   $t2,$t2,%g0
2536         be,pt   %xcc,.Ladd_proceed_vis3         ! is_equal(S1,S2)?
2537         nop
2538
2539         st      %g0,[$rp_real]
2540         st      %g0,[$rp_real+4]
2541         st      %g0,[$rp_real+8]
2542         st      %g0,[$rp_real+12]
2543         st      %g0,[$rp_real+16]
2544         st      %g0,[$rp_real+20]
2545         st      %g0,[$rp_real+24]
2546         st      %g0,[$rp_real+28]
2547         st      %g0,[$rp_real+32]
2548         st      %g0,[$rp_real+32+4]
2549         st      %g0,[$rp_real+32+8]
2550         st      %g0,[$rp_real+32+12]
2551         st      %g0,[$rp_real+32+16]
2552         st      %g0,[$rp_real+32+20]
2553         st      %g0,[$rp_real+32+24]
2554         st      %g0,[$rp_real+32+28]
2555         st      %g0,[$rp_real+64]
2556         st      %g0,[$rp_real+64+4]
2557         st      %g0,[$rp_real+64+8]
2558         st      %g0,[$rp_real+64+12]
2559         st      %g0,[$rp_real+64+16]
2560         st      %g0,[$rp_real+64+20]
2561         st      %g0,[$rp_real+64+24]
2562         st      %g0,[$rp_real+64+28]
2563         b       .Ladd_done_vis3
2564         nop
2565
2566 .align  16
2567 .Ladd_proceed_vis3:
2568         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2569         add     %sp,LOCALS64+$Rsqr,$rp
2570
2571         ldx     [%sp+LOCALS64+$H],$bi
2572         ldx     [%sp+LOCALS64+$in1_z],$a0
2573         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2574         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2575         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2576         add     %sp,LOCALS64+$H,$bp
2577         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2578         add     %sp,LOCALS64+$res_z,$rp
2579
2580         ldx     [%sp+LOCALS64+$H],$a0
2581         ldx     [%sp+LOCALS64+$H+8],$a1
2582         ldx     [%sp+LOCALS64+$H+16],$a2
2583         ldx     [%sp+LOCALS64+$H+24],$a3
2584         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2585         add     %sp,LOCALS64+$Hsqr,$rp
2586
2587         ldx     [%sp+LOCALS64+$res_z],$bi
2588         ldx     [%sp+LOCALS64+$in2_z],$a0
2589         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2590         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2591         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2592         add     %sp,LOCALS64+$res_z,$bp
2593         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, res_z, in2_z);
2594         add     %sp,LOCALS64+$res_z,$rp
2595
2596         ldx     [%sp+LOCALS64+$H],$bi
2597         ldx     [%sp+LOCALS64+$Hsqr],$a0
2598         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2599         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2600         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2601         add     %sp,LOCALS64+$H,$bp
2602         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2603         add     %sp,LOCALS64+$Hcub,$rp
2604
2605         ldx     [%sp+LOCALS64+$U1],$bi
2606         ldx     [%sp+LOCALS64+$Hsqr],$a0
2607         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2608         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2609         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2610         add     %sp,LOCALS64+$U1,$bp
2611         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, U1, Hsqr);
2612         add     %sp,LOCALS64+$U2,$rp
2613
2614         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2615         add     %sp,LOCALS64+$Hsqr,$rp
2616
2617         add     %sp,LOCALS64+$Rsqr,$bp
2618         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2619         add     %sp,LOCALS64+$res_x,$rp
2620
2621         add     %sp,LOCALS64+$Hcub,$bp
2622         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2623         add     %sp,LOCALS64+$res_x,$rp
2624
2625         ldx     [%sp+LOCALS64+$S1],$bi          ! forward load
2626         ldx     [%sp+LOCALS64+$Hcub],$a0
2627         ldx     [%sp+LOCALS64+$Hcub+8],$a1
2628         ldx     [%sp+LOCALS64+$Hcub+16],$a2
2629         ldx     [%sp+LOCALS64+$Hcub+24],$a3
2630
2631         add     %sp,LOCALS64+$U2,$bp
2632         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2633         add     %sp,LOCALS64+$res_y,$rp
2634
2635         add     %sp,LOCALS64+$S1,$bp
2636         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S1, Hcub);
2637         add     %sp,LOCALS64+$S2,$rp
2638
2639         ldx     [%sp+LOCALS64+$R],$bi
2640         ldx     [%sp+LOCALS64+$res_y],$a0
2641         ldx     [%sp+LOCALS64+$res_y+8],$a1
2642         ldx     [%sp+LOCALS64+$res_y+16],$a2
2643         ldx     [%sp+LOCALS64+$res_y+24],$a3
2644         add     %sp,LOCALS64+$R,$bp
2645         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2646         add     %sp,LOCALS64+$res_y,$rp
2647
2648         add     %sp,LOCALS64+$S2,$bp
2649         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2650         add     %sp,LOCALS64+$res_y,$rp
2651
2652         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2653         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
2654 ___
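# Branchless result selection: if in1 was the point at infinity copy in2,
# if in2 was the point at infinity copy in1, otherwise keep the computed
# sum; movrz acts on the saved !in1infty/!in2infty flags.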
2655 for($i=0;$i<96;$i+=16) {                        # conditional moves
2656 $code.=<<___;
2657         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2658         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2659         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2660         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2661         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2662         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2663         movrz   $t1,$acc2,$acc0
2664         movrz   $t1,$acc3,$acc1
2665         movrz   $t2,$acc4,$acc0
2666         movrz   $t2,$acc5,$acc1
2667         srlx    $acc0,32,$acc2
2668         srlx    $acc1,32,$acc3
2669         st      $acc0,[$rp_real+$i]
2670         st      $acc2,[$rp_real+$i+4]
2671         st      $acc1,[$rp_real+$i+8]
2672         st      $acc3,[$rp_real+$i+12]
2673 ___
2674 }
2675 $code.=<<___;
2676 .Ladd_done_vis3:
2677         ret
2678         restore
2679 .size   ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2680 ___
2681 }
2682 ########################################################################
2683 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2684 #                                    const P256_POINT_AFFINE *in2);
2685 {
2686 my ($res_x,$res_y,$res_z,
2687     $in1_x,$in1_y,$in1_z,
2688     $in2_x,$in2_y,
2689     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2690 my $Z1sqr = $S2;
2691 # above map() describes stack layout with 15 temporary
2692 # 256-bit vectors on top. Then we reserve some space for
2693 # !in1infty and !in2infty.
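# For orientation, with in2 affine (Z2=1) the call sequence below computes:
#       U2 = X2*Z1^2, S2 = Y2*Z1^3, H = U2-X1, R = S2-Y1,
#       X3 = R^2-H^3-2*X1*H^2, Y3 = R*(X1*H^2-X3)-Y1*H^3, Z3 = Z1*H.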
2694
2695 $code.=<<___;
2696 .align  32
2697 ecp_nistz256_point_add_affine_vis3:
2698         save    %sp,-STACK64_FRAME-32*15-32,%sp
2699
2700         mov     $rp,$rp_real
2701         mov     -1,$minus1
2702         mov     -2,$poly3
2703         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2704         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2705
2706         ! convert input to uint64_t[4]
2707         ld      [$bp],$a0                       ! in2_x
2708         ld      [$bp+4],$t0
2709         ld      [$bp+8],$a1
2710         ld      [$bp+12],$t1
2711         ld      [$bp+16],$a2
2712         ld      [$bp+20],$t2
2713         ld      [$bp+24],$a3
2714         ld      [$bp+28],$t3
2715         sllx    $t0,32,$t0
2716         sllx    $t1,32,$t1
2717         ld      [$bp+32],$acc0                  ! in2_y
2718         or      $a0,$t0,$a0
2719         ld      [$bp+32+4],$t0
2720         sllx    $t2,32,$t2
2721         ld      [$bp+32+8],$acc1
2722         or      $a1,$t1,$a1
2723         ld      [$bp+32+12],$t1
2724         sllx    $t3,32,$t3
2725         ld      [$bp+32+16],$acc2
2726         or      $a2,$t2,$a2
2727         ld      [$bp+32+20],$t2
2728         or      $a3,$t3,$a3
2729         ld      [$bp+32+24],$acc3
2730         sllx    $t0,32,$t0
2731         ld      [$bp+32+28],$t3
2732         sllx    $t1,32,$t1
2733         stx     $a0,[%sp+LOCALS64+$in2_x]
2734         sllx    $t2,32,$t2
2735         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2736         sllx    $t3,32,$t3
2737         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2738         or      $acc0,$t0,$acc0
2739         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2740         or      $acc1,$t1,$acc1
2741         stx     $acc0,[%sp+LOCALS64+$in2_y]
2742         or      $acc2,$t2,$acc2
2743         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2744         or      $acc3,$t3,$acc3
2745         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2746         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2747
2748         or      $a1,$a0,$a0
2749         or      $a3,$a2,$a2
2750         or      $acc1,$acc0,$acc0
2751         or      $acc3,$acc2,$acc2
2752         or      $a2,$a0,$a0
2753         or      $acc2,$acc0,$acc0
2754         or      $acc0,$a0,$a0
2755         movrnz  $a0,-1,$a0                      ! !in2infty
2756         stx     $a0,[%fp+STACK_BIAS-8]
2757
2758         ld      [$ap],$a0                       ! in1_x
2759         ld      [$ap+4],$t0
2760         ld      [$ap+8],$a1
2761         ld      [$ap+12],$t1
2762         ld      [$ap+16],$a2
2763         ld      [$ap+20],$t2
2764         ld      [$ap+24],$a3
2765         ld      [$ap+28],$t3
2766         sllx    $t0,32,$t0
2767         sllx    $t1,32,$t1
2768         ld      [$ap+32],$acc0                  ! in1_y
2769         or      $a0,$t0,$a0
2770         ld      [$ap+32+4],$t0
2771         sllx    $t2,32,$t2
2772         ld      [$ap+32+8],$acc1
2773         or      $a1,$t1,$a1
2774         ld      [$ap+32+12],$t1
2775         sllx    $t3,32,$t3
2776         ld      [$ap+32+16],$acc2
2777         or      $a2,$t2,$a2
2778         ld      [$ap+32+20],$t2
2779         or      $a3,$t3,$a3
2780         ld      [$ap+32+24],$acc3
2781         sllx    $t0,32,$t0
2782         ld      [$ap+32+28],$t3
2783         sllx    $t1,32,$t1
2784         stx     $a0,[%sp+LOCALS64+$in1_x]
2785         sllx    $t2,32,$t2
2786         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2787         sllx    $t3,32,$t3
2788         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2789         or      $acc0,$t0,$acc0
2790         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2791         or      $acc1,$t1,$acc1
2792         stx     $acc0,[%sp+LOCALS64+$in1_y]
2793         or      $acc2,$t2,$acc2
2794         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2795         or      $acc3,$t3,$acc3
2796         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2797         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2798
2799         or      $a1,$a0,$a0
2800         or      $a3,$a2,$a2
2801         or      $acc1,$acc0,$acc0
2802         or      $acc3,$acc2,$acc2
2803         or      $a2,$a0,$a0
2804         or      $acc2,$acc0,$acc0
2805         or      $acc0,$a0,$a0
2806         movrnz  $a0,-1,$a0                      ! !in1infty
2807         stx     $a0,[%fp+STACK_BIAS-16]
2808
2809         ld      [$ap+64],$a0                    ! in1_z
2810         ld      [$ap+64+4],$t0
2811         ld      [$ap+64+8],$a1
2812         ld      [$ap+64+12],$t1
2813         ld      [$ap+64+16],$a2
2814         ld      [$ap+64+20],$t2
2815         ld      [$ap+64+24],$a3
2816         ld      [$ap+64+28],$t3
2817         sllx    $t0,32,$t0
2818         sllx    $t1,32,$t1
2819         or      $a0,$t0,$a0
2820         sllx    $t2,32,$t2
2821         or      $a1,$t1,$a1
2822         sllx    $t3,32,$t3
2823         stx     $a0,[%sp+LOCALS64+$in1_z]
2824         or      $a2,$t2,$a2
2825         stx     $a1,[%sp+LOCALS64+$in1_z+8]
2826         or      $a3,$t3,$a3
2827         stx     $a2,[%sp+LOCALS64+$in1_z+16]
2828         stx     $a3,[%sp+LOCALS64+$in1_z+24]
2829
2830         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2831         add     %sp,LOCALS64+$Z1sqr,$rp
2832
2833         ldx     [%sp+LOCALS64+$in2_x],$bi
2834         mov     $acc0,$a0
2835         mov     $acc1,$a1
2836         mov     $acc2,$a2
2837         mov     $acc3,$a3
2838         add     %sp,LOCALS64+$in2_x,$bp
2839         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, Z1sqr, in2_x);
2840         add     %sp,LOCALS64+$U2,$rp
2841
2842         ldx     [%sp+LOCALS64+$Z1sqr],$bi       ! forward load
2843         ldx     [%sp+LOCALS64+$in1_z],$a0
2844         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2845         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2846         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2847
2848         add     %sp,LOCALS64+$in1_x,$bp
2849         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, in1_x);
2850         add     %sp,LOCALS64+$H,$rp
2851
2852         add     %sp,LOCALS64+$Z1sqr,$bp
2853         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2854         add     %sp,LOCALS64+$S2,$rp
2855
2856         ldx     [%sp+LOCALS64+$H],$bi
2857         ldx     [%sp+LOCALS64+$in1_z],$a0
2858         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2859         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2860         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2861         add     %sp,LOCALS64+$H,$bp
2862         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2863         add     %sp,LOCALS64+$res_z,$rp
2864
2865         ldx     [%sp+LOCALS64+$S2],$bi
2866         ldx     [%sp+LOCALS64+$in2_y],$a0
2867         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2868         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2869         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2870         add     %sp,LOCALS64+$S2,$bp
2871         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2872         add     %sp,LOCALS64+$S2,$rp
2873
2874         ldx     [%sp+LOCALS64+$H],$a0           ! forward load
2875         ldx     [%sp+LOCALS64+$H+8],$a1
2876         ldx     [%sp+LOCALS64+$H+16],$a2
2877         ldx     [%sp+LOCALS64+$H+24],$a3
2878
2879         add     %sp,LOCALS64+$in1_y,$bp
2880         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, in1_y);
2881         add     %sp,LOCALS64+$R,$rp
2882
2883         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2884         add     %sp,LOCALS64+$Hsqr,$rp
2885
2886         ldx     [%sp+LOCALS64+$R],$a0
2887         ldx     [%sp+LOCALS64+$R+8],$a1
2888         ldx     [%sp+LOCALS64+$R+16],$a2
2889         ldx     [%sp+LOCALS64+$R+24],$a3
2890         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2891         add     %sp,LOCALS64+$Rsqr,$rp
2892
2893         ldx     [%sp+LOCALS64+$H],$bi
2894         ldx     [%sp+LOCALS64+$Hsqr],$a0
2895         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2896         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2897         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2898         add     %sp,LOCALS64+$H,$bp
2899         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2900         add     %sp,LOCALS64+$Hcub,$rp
2901
2902         ldx     [%sp+LOCALS64+$Hsqr],$bi
2903         ldx     [%sp+LOCALS64+$in1_x],$a0
2904         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2905         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2906         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2907         add     %sp,LOCALS64+$Hsqr,$bp
2908         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in1_x, Hsqr);
2909         add     %sp,LOCALS64+$U2,$rp
2910
2911         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2912         add     %sp,LOCALS64+$Hsqr,$rp
2913
2914         add     %sp,LOCALS64+$Rsqr,$bp
2915         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2916         add     %sp,LOCALS64+$res_x,$rp
2917
2918         add     %sp,LOCALS64+$Hcub,$bp
2919         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2920         add     %sp,LOCALS64+$res_x,$rp
2921
2922         ldx     [%sp+LOCALS64+$Hcub],$bi        ! forward load
2923         ldx     [%sp+LOCALS64+$in1_y],$a0
2924         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2925         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2926         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2927
2928         add     %sp,LOCALS64+$U2,$bp
2929         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2930         add     %sp,LOCALS64+$res_y,$rp
2931
2932         add     %sp,LOCALS64+$Hcub,$bp
2933         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, in1_y, Hcub);
2934         add     %sp,LOCALS64+$S2,$rp
2935
2936         ldx     [%sp+LOCALS64+$R],$bi
2937         ldx     [%sp+LOCALS64+$res_y],$a0
2938         ldx     [%sp+LOCALS64+$res_y+8],$a1
2939         ldx     [%sp+LOCALS64+$res_y+16],$a2
2940         ldx     [%sp+LOCALS64+$res_y+24],$a3
2941         add     %sp,LOCALS64+$R,$bp
2942         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2943         add     %sp,LOCALS64+$res_y,$rp
2944
2945         add     %sp,LOCALS64+$S2,$bp
2946         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2947         add     %sp,LOCALS64+$res_y,$rp
2948
2949         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2950         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
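        ! "call .+8" deposits the address of the call itself in %o7, so
        ! adding the assembly-time offset .Lone_mont_vis3-1b yields the
        ! address of .Lone_mont_vis3 without an external relocation.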
2951 1:      call    .+8
2952         add     %o7,.Lone_mont_vis3-1b,$bp
2953 ___
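########################################################################
# Select the output without branching: if in1 is the point at infinity
# take in2, if in2 is the point at infinity take in1, otherwise keep the
# freshly computed result. $t1/$t2 hold the boolean !in1infty/!in2infty
# flags, so movrz (move if register is zero) performs the selection, and
# each 64-bit limb is split with srlx into two 32-bit stores.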
2954 for($i=0;$i<64;$i+=16) {                        # conditional moves
2955 $code.=<<___;
2956         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2957         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2958         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2959         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2960         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2961         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2962         movrz   $t1,$acc2,$acc0
2963         movrz   $t1,$acc3,$acc1
2964         movrz   $t2,$acc4,$acc0
2965         movrz   $t2,$acc5,$acc1
2966         srlx    $acc0,32,$acc2
2967         srlx    $acc1,32,$acc3
2968         st      $acc0,[$rp_real+$i]
2969         st      $acc2,[$rp_real+$i+4]
2970         st      $acc1,[$rp_real+$i+8]
2971         st      $acc3,[$rp_real+$i+12]
2972 ___
2973 }
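# The remaining 32 bytes are the Z coordinate. The affine input in2 carries
# an implicit Z=1, so its limbs are taken from .Lone_mont_vis3 (1 in the
# Montgomery domain) via $bp instead of from the stack copy of in2.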
2974 for(;$i<96;$i+=16) {
2975 $code.=<<___;
2976         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2977         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2978         ldx     [$bp+$i-64],$acc2               ! "in2"
2979         ldx     [$bp+$i-64+8],$acc3
2980         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2981         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2982         movrz   $t1,$acc2,$acc0
2983         movrz   $t1,$acc3,$acc1
2984         movrz   $t2,$acc4,$acc0
2985         movrz   $t2,$acc5,$acc1
2986         srlx    $acc0,32,$acc2
2987         srlx    $acc1,32,$acc3
2988         st      $acc0,[$rp_real+$i]
2989         st      $acc2,[$rp_real+$i+4]
2990         st      $acc1,[$rp_real+$i+8]
2991         st      $acc3,[$rp_real+$i+12]
2992 ___
2993 }
2994 $code.=<<___;
2995         ret
2996         restore
2997 .size   ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
2998 .align  64
2999 .Lone_mont_vis3:        ! 1 in Montgomery domain, i.e. 2^256 mod P for NIST P256 polynomial
3000 .long   0x00000000,0x00000001, 0xffffffff,0x00000000
3001 .long   0xffffffff,0xffffffff, 0x00000000,0xfffffffe
3002 .align  64
3003 ___
3004 }                                                               }}}
3005 \f
3006 # The purpose of this subroutine is to explicitly encode VIS3 instructions,
3007 # so that the module can be compiled without specifying VIS extensions on
3008 # the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
3009 # keep open the option of producing a "universal" binary and to let the
3010 # program detect at run-time whether the current CPU is VIS3-capable.
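#
# A worked example, using the opf table below and the usual SPARC register
# numbering (%g=0-7, %o=8-15, %l=16-23, %i=24-31): "umulxhi %o0,%o1,%o2"
# has rs1=8, rs2=9, rd=10 and opf=0x016, so it is emitted as
# ".word 0x95b202c9", i.e. 0x81b00000|10<<25|8<<14|0x016<<5|9.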
3011 sub unvis3 {
3012 my ($mnemonic,$rs1,$rs2,$rd)=@_;
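# %bias maps a register-name prefix to its hardware number base:
# %g0-%g7 -> 0-7, %o0-%o7 -> 8-15, %l0-%l7 -> 16-23, %i0-%i7 -> 24-31.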
3013 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
3014 my ($ref,$opf);
3015 my %visopf = (  "addxc"         => 0x011,
3016                 "addxccc"       => 0x013,
3017                 "umulxhi"       => 0x016        );
3018
3019     $ref = "$mnemonic\t$rs1,$rs2,$rd";
3020
3021     if ($opf=$visopf{$mnemonic}) {
3022         foreach ($rs1,$rs2,$rd) {
3023             return $ref if (!/%([goli])([0-9])/);
3024             $_=$bias{$1}+$2;
3025         }
3026
3027         return  sprintf ".word\t0x%08x !%s",
3028                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
3029                         $ref;
3030     } else {
3031         return $ref;
3032     }
3033 }
3034
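# Post-process the generated code: evaluate any `...` expressions and
# rewrite VIS3 mnemonics as explicit .word encodings via unvis3() before
# printing.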
3035 foreach (split("\n",$code)) {
3036         s/\`([^\`]*)\`/eval $1/ge;
3037
3038         s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
3039                 &unvis3($1,$2,$3,$4)
3040          /ge;
3041
3042         print $_,"\n";
3043 }
3044
3045 close STDOUT;