1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # ECP_NISTZ256 module for SPARCv9.
11 #
12 # February 2015.
13 #
14 # The original ECP_NISTZ256 submission targeting x86_64 is detailed in
15 # http://eprint.iacr.org/2013/816. In the process of adaptation the
16 # original .c module was made 32-bit savvy in order to make this
17 # implementation possible.
18 #
19 #                       with/without -DECP_NISTZ256_ASM
20 # UltraSPARC III        +12-18%
21 # SPARC T4              +99-550% (+66-150% on 32-bit Solaris)
22 #
23 # Ranges denote minimum and maximum improvement coefficients depending
24 # on benchmark. Lower coefficients are for ECDSA sign, server-side
25 # operation. Keep in mind that +200% means 3x improvement.
26
27 $output = pop;
28 open STDOUT,">$output" or die "can't open $output: $!";
29
30 $code.=<<___;
31 #include "sparc_arch.h"
32
33 #define LOCALS  (STACK_BIAS+STACK_FRAME)
34 #ifdef  __arch64__
35 .register       %g2,#scratch
36 .register       %g3,#scratch
37 # define STACK64_FRAME  STACK_FRAME
38 # define LOCALS64       LOCALS
39 #else
40 # define STACK64_FRAME  (2047+192)
41 # define LOCALS64       STACK64_FRAME
42 #endif
43
44 .section        ".text",#alloc,#execinstr
45 ___
46 ########################################################################
47 # Convert ecp_nistz256_table.c to layout expected by ecp_nistz_gather_w7
48 #
49 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
50 open TABLE,"<ecp_nistz256_table.c"              or
51 open TABLE,"<${dir}../ecp_nistz256_table.c"     or
52 die "failed to open ecp_nistz256_table.c:",$!;
53
54 use integer;
55
56 foreach(<TABLE>) {
57         s/TOBN\(\s*(0x[0-9a-f]+),\s*(0x[0-9a-f]+)\s*\)/push @arr,hex($2),hex($1)/geo;
58 }
59 close TABLE;
60
61 # See ecp_nistz256_table.c for an explanation of why it's 64*16*37.
62 # 64*16*37-1 is because $#arr returns the last valid index of @arr, not
63 # the number of elements.
64 die "insane number of elements" if ($#arr != 64*16*37-1);
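# (For reference: the layout is 37 windows - an assumed ceil(256/7) for the
# 7-bit windowing served by ecp_nistz_gather_w7 - times 64 precomputed points
# per window, times 16 32-bit words per P256_POINT_AFFINE.)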
65
66 $code.=<<___;
67 .globl  ecp_nistz256_precomputed
68 .align  4096
69 ecp_nistz256_precomputed:
70 ___
71 ########################################################################
72 # this conversion smashes P256_POINT_AFFINE into individual bytes at a
73 # 64-byte interval, similar to
74 #       1111222233334444
75 #       1234123412341234
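# In other words, byte k (0..63) of the 64-byte point j (0..63) ends up at
# offset k*64+j inside each 4096-byte block, so ecp_nistz_gather_w7 below can
# reconstruct point j by loading one byte from every 64-byte column (see
# .Loop_gather_w7).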
76 for(1..37) {
77         @tbl = splice(@arr,0,64*16);
78         for($i=0;$i<64;$i++) {
79                 undef @line;
80                 for($j=0;$j<64;$j++) {
81                         push @line,(@tbl[$j*16+$i/4]>>(($i%4)*8))&0xff;
82                 }
83                 $code.=".byte\t";
84                 $code.=join(',',map { sprintf "0x%02x",$_} @line);
85                 $code.="\n";
86         }
87 }
88
89 {{{
90 my ($rp,$ap,$bp)=map("%i$_",(0..2));
91 my @acc=map("%l$_",(0..7));
92 my ($t0,$t1,$t2,$t3,$t4,$t5,$t6,$t7)=(map("%o$_",(0..5)),"%g4","%g5");
93 my ($bi,$a0,$mask,$carry)=(map("%i$_",(3..5)),"%g1");
94 my ($rp_real,$ap_real)=("%g2","%g3");
95
96 $code.=<<___;
97 .size   ecp_nistz256_precomputed,.-ecp_nistz256_precomputed
98 .align  64
99 .LRR:   ! 2^512 mod P precomputed for NIST P256 polynomial
100 .long   0x00000003, 0x00000000, 0xffffffff, 0xfffffffb
101 .long   0xfffffffe, 0xffffffff, 0xfffffffd, 0x00000004
102 .Lone:
103 .long   1,0,0,0,0,0,0,0
104 .asciz  "ECP_NISTZ256 for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
105
106 ! void  ecp_nistz256_to_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
107 .globl  ecp_nistz256_to_mont
108 .align  64
109 ecp_nistz256_to_mont:
110         save    %sp,-STACK_FRAME,%sp
111         nop
112 1:      call    .+8
113         add     %o7,.LRR-1b,$bp
114         call    __ecp_nistz256_mul_mont
115         nop
116         ret
117         restore
118 .size   ecp_nistz256_to_mont,.-ecp_nistz256_to_mont
119
120 ! void  ecp_nistz256_from_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
121 .globl  ecp_nistz256_from_mont
122 .align  32
123 ecp_nistz256_from_mont:
124         save    %sp,-STACK_FRAME,%sp
125         nop
126 1:      call    .+8
127         add     %o7,.Lone-1b,$bp
128         call    __ecp_nistz256_mul_mont
129         nop
130         ret
131         restore
132 .size   ecp_nistz256_from_mont,.-ecp_nistz256_from_mont
133
134 ! void  ecp_nistz256_mul_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8],
135 !                                             const BN_ULONG %i2[8]);
136 .globl  ecp_nistz256_mul_mont
137 .align  32
138 ecp_nistz256_mul_mont:
139         save    %sp,-STACK_FRAME,%sp
140         nop
141         call    __ecp_nistz256_mul_mont
142         nop
143         ret
144         restore
145 .size   ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
146
147 ! void  ecp_nistz256_sqr_mont(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
148 .globl  ecp_nistz256_sqr_mont
149 .align  32
150 ecp_nistz256_sqr_mont:
151         save    %sp,-STACK_FRAME,%sp
152         mov     $ap,$bp
153         call    __ecp_nistz256_mul_mont
154         nop
155         ret
156         restore
157 .size   ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
158 ___
159
160 ########################################################################
161 # A special thing to keep in mind is that $t0-$t7 hold 64-bit values,
162 # while all others are meant to hold 32-bit ones. "Meant to" means that
163 # additions to @acc[0-7] do "contaminate" the upper bits, but they are
164 # cleared before they can affect the outcome (follow 'and' with $mask).
165 # Also keep in mind that addition with carry is addition with a 32-bit
166 # carry, even though the CPU is 64-bit. [Addition with 64-bit carry was
167 # introduced in T3, see below for VIS3 code paths.]
168
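# For example, adding two words that both hold 0xffffffff with addcc leaves
# 0x1fffffffe in the 64-bit destination and sets the 32-bit carry; the later
# 'and' with $mask trims the word back to 0xfffffffe before it is added into
# the next 64-bit product, whose high half is then extracted with srlx.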
169 $code.=<<___;
170 .align  32
171 __ecp_nistz256_mul_mont:
172         ld      [$bp+0],$bi             ! b[0]
173         mov     -1,$mask
174         ld      [$ap+0],$a0
175         srl     $mask,0,$mask           ! 0xffffffff
176         ld      [$ap+4],$t1
177         ld      [$ap+8],$t2
178         ld      [$ap+12],$t3
179         ld      [$ap+16],$t4
180         ld      [$ap+20],$t5
181         ld      [$ap+24],$t6
182         ld      [$ap+28],$t7
183         mulx    $a0,$bi,$t0             ! a[0-7]*b[0], 64-bit results
184         mulx    $t1,$bi,$t1
185         mulx    $t2,$bi,$t2
186         mulx    $t3,$bi,$t3
187         mulx    $t4,$bi,$t4
188         mulx    $t5,$bi,$t5
189         mulx    $t6,$bi,$t6
190         mulx    $t7,$bi,$t7
191         srlx    $t0,32,@acc[1]          ! extract high parts
192         srlx    $t1,32,@acc[2]
193         srlx    $t2,32,@acc[3]
194         srlx    $t3,32,@acc[4]
195         srlx    $t4,32,@acc[5]
196         srlx    $t5,32,@acc[6]
197         srlx    $t6,32,@acc[7]
198         srlx    $t7,32,@acc[0]          ! "@acc[8]"
199         mov     0,$carry
200 ___
201 for($i=1;$i<8;$i++) {
202 $code.=<<___;
203         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
204         ld      [$bp+4*$i],$bi          ! b[$i]
205         ld      [$ap+4],$t1             ! re-load a[1-7]
206         addccc  @acc[2],$t2,@acc[2]
207         addccc  @acc[3],$t3,@acc[3]
208         ld      [$ap+8],$t2
209         ld      [$ap+12],$t3
210         addccc  @acc[4],$t4,@acc[4]
211         addccc  @acc[5],$t5,@acc[5]
212         ld      [$ap+16],$t4
213         ld      [$ap+20],$t5
214         addccc  @acc[6],$t6,@acc[6]
215         addccc  @acc[7],$t7,@acc[7]
216         ld      [$ap+24],$t6
217         ld      [$ap+28],$t7
218         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
219         addc    %g0,%g0,$carry
220 ___
221         # A reduction iteration is normally performed by accumulating the
222         # result of multiplying the modulus by the "magic" digit [and
223         # omitting the least significant word, which is guaranteed to
224         # be 0], but thanks to the special form of the modulus, and the
225         # "magic" digit being equal to the least significant word, it can
226         # be performed with additions and subtractions alone. Indeed:
227         #
228         #        ffff.0001.0000.0000.0000.ffff.ffff.ffff
229         # *                                         abcd
230         # + xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
231         #
232         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
233         # rewrite above as:
234         #
235         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.abcd
236         # + abcd.0000.abcd.0000.0000.abcd.0000.0000.0000
237         # -      abcd.0000.0000.0000.0000.0000.0000.abcd
238         #
239         # or marking redundant operations:
240         #
241         #   xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.xxxx.----
242         # + abcd.0000.abcd.0000.0000.abcd.----.----.----
243         # -      abcd.----.----.----.----.----.----.----
244
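	# A rough word-level sketch of one such step, with r[0..8] the current
	# accumulator words and r[0] the "magic" digit (illustration only, not
	# part of the generated code):
	#
	#   r[3] += r[0]; r[6] += r[0]; r[8] += r[0];   carries ripple upwards
	#   r[7] -= r[0];                               borrow ripples into r[8]
	#
	# r[0] itself cancels against the subtraction in the least significant
	# word and is simply dropped by rotating the register array.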
245 $code.=<<___;
246         ! multiplication-less reduction
247         addcc   @acc[3],$t0,@acc[3]     ! r[3]+=r[0]
248         addccc  @acc[4],%g0,@acc[4]     ! r[4]+=0
249          and    @acc[1],$mask,@acc[1]
250          and    @acc[2],$mask,@acc[2]
251         addccc  @acc[5],%g0,@acc[5]     ! r[5]+=0
252         addccc  @acc[6],$t0,@acc[6]     ! r[6]+=r[0]
253          and    @acc[3],$mask,@acc[3]
254          and    @acc[4],$mask,@acc[4]
255         addccc  @acc[7],%g0,@acc[7]     ! r[7]+=0
256         addccc  @acc[0],$t0,@acc[0]     ! r[8]+=r[0]    "@acc[8]"
257          and    @acc[5],$mask,@acc[5]
258          and    @acc[6],$mask,@acc[6]
259         addc    $carry,%g0,$carry       ! top-most carry
260         subcc   @acc[7],$t0,@acc[7]     ! r[7]-=r[0]
261         subccc  @acc[0],%g0,@acc[0]     ! r[8]-=0       "@acc[8]"
262         subc    $carry,%g0,$carry       ! top-most carry
263          and    @acc[7],$mask,@acc[7]
264          and    @acc[0],$mask,@acc[0]   ! "@acc[8]"
265 ___
266         push(@acc,shift(@acc));         # rotate registers to "omit" acc[0]
267 $code.=<<___;
268         mulx    $a0,$bi,$t0             ! a[0-7]*b[$i], 64-bit results
269         mulx    $t1,$bi,$t1
270         mulx    $t2,$bi,$t2
271         mulx    $t3,$bi,$t3
272         mulx    $t4,$bi,$t4
273         mulx    $t5,$bi,$t5
274         mulx    $t6,$bi,$t6
275         mulx    $t7,$bi,$t7
276         add     @acc[0],$t0,$t0         ! accumulate low parts, can't overflow
277         add     @acc[1],$t1,$t1
278         srlx    $t0,32,@acc[1]          ! extract high parts
279         add     @acc[2],$t2,$t2
280         srlx    $t1,32,@acc[2]
281         add     @acc[3],$t3,$t3
282         srlx    $t2,32,@acc[3]
283         add     @acc[4],$t4,$t4
284         srlx    $t3,32,@acc[4]
285         add     @acc[5],$t5,$t5
286         srlx    $t4,32,@acc[5]
287         add     @acc[6],$t6,$t6
288         srlx    $t5,32,@acc[6]
289         add     @acc[7],$t7,$t7
290         srlx    $t6,32,@acc[7]
291         srlx    $t7,32,@acc[0]          ! "@acc[8]"
292 ___
293 }
294 $code.=<<___;
295         addcc   @acc[1],$t1,@acc[1]     ! accumulate high parts
296         addccc  @acc[2],$t2,@acc[2]
297         addccc  @acc[3],$t3,@acc[3]
298         addccc  @acc[4],$t4,@acc[4]
299         addccc  @acc[5],$t5,@acc[5]
300         addccc  @acc[6],$t6,@acc[6]
301         addccc  @acc[7],$t7,@acc[7]
302         addccc  @acc[0],$carry,@acc[0]  ! "@acc[8]"
303         addc    %g0,%g0,$carry
304
305         addcc   @acc[3],$t0,@acc[3]     ! multiplication-less reduction
306         addccc  @acc[4],%g0,@acc[4]
307         addccc  @acc[5],%g0,@acc[5]
308         addccc  @acc[6],$t0,@acc[6]
309         addccc  @acc[7],%g0,@acc[7]
310         addccc  @acc[0],$t0,@acc[0]     ! "@acc[8]"
311         addc    $carry,%g0,$carry
312         subcc   @acc[7],$t0,@acc[7]
313         subccc  @acc[0],%g0,@acc[0]     ! "@acc[8]"
314         subc    $carry,%g0,$carry       ! top-most carry
315 ___
316         push(@acc,shift(@acc));         # rotate registers to omit acc[0]
317 $code.=<<___;
318         ! Final step is "if result > mod, subtract mod", but we do it
319         ! "other way around", namely subtract modulus from result
320         ! and if it borrowed, add modulus back.
321
322         subcc   @acc[0],-1,@acc[0]      ! subtract modulus
323         subccc  @acc[1],-1,@acc[1]
324         subccc  @acc[2],-1,@acc[2]
325         subccc  @acc[3],0,@acc[3]
326         subccc  @acc[4],0,@acc[4]
327         subccc  @acc[5],0,@acc[5]
328         subccc  @acc[6],1,@acc[6]
329         subccc  @acc[7],-1,@acc[7]
330         subc    $carry,0,$carry         ! broadcast borrow bit
331
332         ! Note that because mod has special form, i.e. consists of
333         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
334         ! using value of broadcasted borrow and the borrow bit itself.
335         ! To minimize dependency chain we first broadcast and then
336         ! extract the bit by negating (follow $bi).
337
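	! Worked example of the synthesis described above: if the subtraction
	! borrowed, $carry is all ones and $bi is 1, so the words added back
	! are 0xffffffff,0xffffffff,0xffffffff,0,0,0,1,0xffffffff, which is
	! exactly the modulus; with no borrow both are 0 and nothing is added.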
338         addcc   @acc[0],$carry,@acc[0]  ! add modulus or zero
339         addccc  @acc[1],$carry,@acc[1]
340         neg     $carry,$bi
341         st      @acc[0],[$rp]
342         addccc  @acc[2],$carry,@acc[2]
343         st      @acc[1],[$rp+4]
344         addccc  @acc[3],0,@acc[3]
345         st      @acc[2],[$rp+8]
346         addccc  @acc[4],0,@acc[4]
347         st      @acc[3],[$rp+12]
348         addccc  @acc[5],0,@acc[5]
349         st      @acc[4],[$rp+16]
350         addccc  @acc[6],$bi,@acc[6]
351         st      @acc[5],[$rp+20]
352         addc    @acc[7],$carry,@acc[7]
353         st      @acc[6],[$rp+24]
354         retl
355         st      @acc[7],[$rp+28]
356 .size   __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont
357
358 ! void  ecp_nistz256_add(BN_ULONG %i0[8],const BN_ULONG %i1[8],
359 !                                        const BN_ULONG %i2[8]);
360 .globl  ecp_nistz256_add
361 .align  32
362 ecp_nistz256_add:
363         save    %sp,-STACK_FRAME,%sp
364         ld      [$ap],@acc[0]
365         ld      [$ap+4],@acc[1]
366         ld      [$ap+8],@acc[2]
367         ld      [$ap+12],@acc[3]
368         ld      [$ap+16],@acc[4]
369         ld      [$ap+20],@acc[5]
370         ld      [$ap+24],@acc[6]
371         call    __ecp_nistz256_add
372         ld      [$ap+28],@acc[7]
373         ret
374         restore
375 .size   ecp_nistz256_add,.-ecp_nistz256_add
376
377 .align  32
378 __ecp_nistz256_add:
379         ld      [$bp+0],$t0             ! b[0]
380         ld      [$bp+4],$t1
381         ld      [$bp+8],$t2
382         ld      [$bp+12],$t3
383         addcc   @acc[0],$t0,@acc[0]
384         ld      [$bp+16],$t4
385         ld      [$bp+20],$t5
386         addccc  @acc[1],$t1,@acc[1]
387         ld      [$bp+24],$t6
388         ld      [$bp+28],$t7
389         addccc  @acc[2],$t2,@acc[2]
390         addccc  @acc[3],$t3,@acc[3]
391         addccc  @acc[4],$t4,@acc[4]
392         addccc  @acc[5],$t5,@acc[5]
393         addccc  @acc[6],$t6,@acc[6]
394         addccc  @acc[7],$t7,@acc[7]
395         subc    %g0,%g0,$carry          ! broadcast carry bit
396
397 .Lreduce_by_sub:
398
399         ! if a+b carries, subtract modulus.
400         !
401         ! Note that because mod has special form, i.e. consists of
402         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
403         ! using value of broadcasted carry and the carry bit itself.
404         ! To minimize dependency chain we first broadcast and then
405         ! extract the bit by negating (follow $bi).
406
407         subcc   @acc[0],$carry,@acc[0]  ! subtract synthesized modulus
408         subccc  @acc[1],$carry,@acc[1]
409         neg     $carry,$bi
410         st      @acc[0],[$rp]
411         subccc  @acc[2],$carry,@acc[2]
412         st      @acc[1],[$rp+4]
413         subccc  @acc[3],0,@acc[3]
414         st      @acc[2],[$rp+8]
415         subccc  @acc[4],0,@acc[4]
416         st      @acc[3],[$rp+12]
417         subccc  @acc[5],0,@acc[5]
418         st      @acc[4],[$rp+16]
419         subccc  @acc[6],$bi,@acc[6]
420         st      @acc[5],[$rp+20]
421         subc    @acc[7],$carry,@acc[7]
422         st      @acc[6],[$rp+24]
423         retl
424         st      @acc[7],[$rp+28]
425 .size   __ecp_nistz256_add,.-__ecp_nistz256_add
426
427 ! void  ecp_nistz256_mul_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
428 .globl  ecp_nistz256_mul_by_2
429 .align  32
430 ecp_nistz256_mul_by_2:
431         save    %sp,-STACK_FRAME,%sp
432         ld      [$ap],@acc[0]
433         ld      [$ap+4],@acc[1]
434         ld      [$ap+8],@acc[2]
435         ld      [$ap+12],@acc[3]
436         ld      [$ap+16],@acc[4]
437         ld      [$ap+20],@acc[5]
438         ld      [$ap+24],@acc[6]
439         call    __ecp_nistz256_mul_by_2
440         ld      [$ap+28],@acc[7]
441         ret
442         restore
443 .size   ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2
444
445 .align  32
446 __ecp_nistz256_mul_by_2:
447         addcc   @acc[0],@acc[0],@acc[0] ! a+a=2*a
448         addccc  @acc[1],@acc[1],@acc[1]
449         addccc  @acc[2],@acc[2],@acc[2]
450         addccc  @acc[3],@acc[3],@acc[3]
451         addccc  @acc[4],@acc[4],@acc[4]
452         addccc  @acc[5],@acc[5],@acc[5]
453         addccc  @acc[6],@acc[6],@acc[6]
454         addccc  @acc[7],@acc[7],@acc[7]
455         b       .Lreduce_by_sub
456         subc    %g0,%g0,$carry          ! broadcast carry bit
457 .size   __ecp_nistz256_mul_by_2,.-__ecp_nistz256_mul_by_2
458
459 ! void  ecp_nistz256_mul_by_3(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
460 .globl  ecp_nistz256_mul_by_3
461 .align  32
462 ecp_nistz256_mul_by_3:
463         save    %sp,-STACK_FRAME,%sp
464         ld      [$ap],@acc[0]
465         ld      [$ap+4],@acc[1]
466         ld      [$ap+8],@acc[2]
467         ld      [$ap+12],@acc[3]
468         ld      [$ap+16],@acc[4]
469         ld      [$ap+20],@acc[5]
470         ld      [$ap+24],@acc[6]
471         call    __ecp_nistz256_mul_by_3
472         ld      [$ap+28],@acc[7]
473         ret
474         restore
475 .size   ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3
476
477 .align  32
478 __ecp_nistz256_mul_by_3:
479         addcc   @acc[0],@acc[0],$t0     ! a+a=2*a
480         addccc  @acc[1],@acc[1],$t1
481         addccc  @acc[2],@acc[2],$t2
482         addccc  @acc[3],@acc[3],$t3
483         addccc  @acc[4],@acc[4],$t4
484         addccc  @acc[5],@acc[5],$t5
485         addccc  @acc[6],@acc[6],$t6
486         addccc  @acc[7],@acc[7],$t7
487         subc    %g0,%g0,$carry          ! broadcast carry bit
488
489         subcc   $t0,$carry,$t0          ! .Lreduce_by_sub but without stores
490         neg     $carry,$bi
491         subccc  $t1,$carry,$t1
492         subccc  $t2,$carry,$t2
493         subccc  $t3,0,$t3
494         subccc  $t4,0,$t4
495         subccc  $t5,0,$t5
496         subccc  $t6,$bi,$t6
497         subc    $t7,$carry,$t7
498
499         addcc   $t0,@acc[0],@acc[0]     ! 2*a+a=3*a
500         addccc  $t1,@acc[1],@acc[1]
501         addccc  $t2,@acc[2],@acc[2]
502         addccc  $t3,@acc[3],@acc[3]
503         addccc  $t4,@acc[4],@acc[4]
504         addccc  $t5,@acc[5],@acc[5]
505         addccc  $t6,@acc[6],@acc[6]
506         addccc  $t7,@acc[7],@acc[7]
507         b       .Lreduce_by_sub
508         subc    %g0,%g0,$carry          ! broadcast carry bit
509 .size   __ecp_nistz256_mul_by_3,.-__ecp_nistz256_mul_by_3
510
511 ! void  ecp_nistz256_sub(BN_ULONG %i0[8],const BN_ULONG %i1[8],
512 !                                        const BN_ULONG %i2[8]);
513 .globl  ecp_nistz256_sub
514 .align  32
515 ecp_nistz256_sub:
516         save    %sp,-STACK_FRAME,%sp
517         ld      [$ap],@acc[0]
518         ld      [$ap+4],@acc[1]
519         ld      [$ap+8],@acc[2]
520         ld      [$ap+12],@acc[3]
521         ld      [$ap+16],@acc[4]
522         ld      [$ap+20],@acc[5]
523         ld      [$ap+24],@acc[6]
524         call    __ecp_nistz256_sub_from
525         ld      [$ap+28],@acc[7]
526         ret
527         restore
528 .size   ecp_nistz256_sub,.-ecp_nistz256_sub
529
530 ! void  ecp_nistz256_neg(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
531 .globl  ecp_nistz256_neg
532 .align  32
533 ecp_nistz256_neg:
534         save    %sp,-STACK_FRAME,%sp
535         mov     $ap,$bp
536         mov     0,@acc[0]
537         mov     0,@acc[1]
538         mov     0,@acc[2]
539         mov     0,@acc[3]
540         mov     0,@acc[4]
541         mov     0,@acc[5]
542         mov     0,@acc[6]
543         call    __ecp_nistz256_sub_from
544         mov     0,@acc[7]
545         ret
546         restore
547 .size   ecp_nistz256_neg,.-ecp_nistz256_neg
548
549 .align  32
550 __ecp_nistz256_sub_from:
551         ld      [$bp+0],$t0             ! b[0]
552         ld      [$bp+4],$t1
553         ld      [$bp+8],$t2
554         ld      [$bp+12],$t3
555         subcc   @acc[0],$t0,@acc[0]
556         ld      [$bp+16],$t4
557         ld      [$bp+20],$t5
558         subccc  @acc[1],$t1,@acc[1]
559         subccc  @acc[2],$t2,@acc[2]
560         ld      [$bp+24],$t6
561         ld      [$bp+28],$t7
562         subccc  @acc[3],$t3,@acc[3]
563         subccc  @acc[4],$t4,@acc[4]
564         subccc  @acc[5],$t5,@acc[5]
565         subccc  @acc[6],$t6,@acc[6]
566         subccc  @acc[7],$t7,@acc[7]
567         subc    %g0,%g0,$carry          ! broadcast borrow bit
568
569 .Lreduce_by_add:
570
571         ! if a-b borrows, add modulus.
572         !
573         ! Note that because mod has special form, i.e. consists of
574         ! 0xffffffff, 1 and 0s, we can conditionally synthesize it by
575         ! using value of broadcasted borrow and the borrow bit itself.
576         ! To minimize dependency chain we first broadcast and then
577         ! extract the bit by negating (follow $bi).
578
579         addcc   @acc[0],$carry,@acc[0]  ! add synthesized modulus
580         addccc  @acc[1],$carry,@acc[1]
581         neg     $carry,$bi
582         st      @acc[0],[$rp]
583         addccc  @acc[2],$carry,@acc[2]
584         st      @acc[1],[$rp+4]
585         addccc  @acc[3],0,@acc[3]
586         st      @acc[2],[$rp+8]
587         addccc  @acc[4],0,@acc[4]
588         st      @acc[3],[$rp+12]
589         addccc  @acc[5],0,@acc[5]
590         st      @acc[4],[$rp+16]
591         addccc  @acc[6],$bi,@acc[6]
592         st      @acc[5],[$rp+20]
593         addc    @acc[7],$carry,@acc[7]
594         st      @acc[6],[$rp+24]
595         retl
596         st      @acc[7],[$rp+28]
597 .size   __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from
598
599 .align  32
600 __ecp_nistz256_sub_morf:
601         ld      [$bp+0],$t0             ! b[0]
602         ld      [$bp+4],$t1
603         ld      [$bp+8],$t2
604         ld      [$bp+12],$t3
605         subcc   $t0,@acc[0],@acc[0]
606         ld      [$bp+16],$t4
607         ld      [$bp+20],$t5
608         subccc  $t1,@acc[1],@acc[1]
609         subccc  $t2,@acc[2],@acc[2]
610         ld      [$bp+24],$t6
611         ld      [$bp+28],$t7
612         subccc  $t3,@acc[3],@acc[3]
613         subccc  $t4,@acc[4],@acc[4]
614         subccc  $t5,@acc[5],@acc[5]
615         subccc  $t6,@acc[6],@acc[6]
616         subccc  $t7,@acc[7],@acc[7]
617         b       .Lreduce_by_add
618         subc    %g0,%g0,$carry          ! broadcast borrow bit
619 .size   __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf
620
621 ! void  ecp_nistz256_div_by_2(BN_ULONG %i0[8],const BN_ULONG %i1[8]);
622 .globl  ecp_nistz256_div_by_2
623 .align  32
624 ecp_nistz256_div_by_2:
625         save    %sp,-STACK_FRAME,%sp
626         ld      [$ap],@acc[0]
627         ld      [$ap+4],@acc[1]
628         ld      [$ap+8],@acc[2]
629         ld      [$ap+12],@acc[3]
630         ld      [$ap+16],@acc[4]
631         ld      [$ap+20],@acc[5]
632         ld      [$ap+24],@acc[6]
633         call    __ecp_nistz256_div_by_2
634         ld      [$ap+28],@acc[7]
635         ret
636         restore
637 .size   ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2
638
639 .align  32
640 __ecp_nistz256_div_by_2:
641         ! ret = (a is odd ? a+mod : a) >> 1
642
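	! Since the modulus is odd, a+mod is even whenever a is odd, and
	! (a+mod)/2 is congruent to a/2 modulo the prime. Tiny example with
	! modulus 7: a=3 gives (3+7)/2 = 5, and indeed 2*5 = 10 = 3 (mod 7).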
643         and     @acc[0],1,$bi
644         neg     $bi,$carry
645         addcc   @acc[0],$carry,@acc[0]
646         addccc  @acc[1],$carry,@acc[1]
647         addccc  @acc[2],$carry,@acc[2]
648         addccc  @acc[3],0,@acc[3]
649         addccc  @acc[4],0,@acc[4]
650         addccc  @acc[5],0,@acc[5]
651         addccc  @acc[6],$bi,@acc[6]
652         addccc  @acc[7],$carry,@acc[7]
653         addc    %g0,%g0,$carry
654
655         ! ret >>= 1
656
657         srl     @acc[0],1,@acc[0]
658         sll     @acc[1],31,$t0
659         srl     @acc[1],1,@acc[1]
660         or      @acc[0],$t0,@acc[0]
661         sll     @acc[2],31,$t1
662         srl     @acc[2],1,@acc[2]
663         or      @acc[1],$t1,@acc[1]
664         sll     @acc[3],31,$t2
665         st      @acc[0],[$rp]
666         srl     @acc[3],1,@acc[3]
667         or      @acc[2],$t2,@acc[2]
668         sll     @acc[4],31,$t3
669         st      @acc[1],[$rp+4]
670         srl     @acc[4],1,@acc[4]
671         or      @acc[3],$t3,@acc[3]
672         sll     @acc[5],31,$t4
673         st      @acc[2],[$rp+8]
674         srl     @acc[5],1,@acc[5]
675         or      @acc[4],$t4,@acc[4]
676         sll     @acc[6],31,$t5
677         st      @acc[3],[$rp+12]
678         srl     @acc[6],1,@acc[6]
679         or      @acc[5],$t5,@acc[5]
680         sll     @acc[7],31,$t6
681         st      @acc[4],[$rp+16]
682         srl     @acc[7],1,@acc[7]
683         or      @acc[6],$t6,@acc[6]
684         sll     $carry,31,$t7
685         st      @acc[5],[$rp+20]
686         or      @acc[7],$t7,@acc[7]
687         st      @acc[6],[$rp+24]
688         retl
689         st      @acc[7],[$rp+28]
690 .size   __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2
691 ___
692
693 ########################################################################
694 # the following subroutines are "literal" implementations of those found in
695 # ecp_nistz256.c
696 #
697 ########################################################################
698 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
699 #
700 {
701 my ($S,$M,$Zsqr,$tmp0)=map(32*$_,(0..3));
702 # above map() describes stack layout with 4 temporary
703 # 256-bit vectors on top.
704
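# As a condensed, C-like view of the call sequence below (pseudocode that
# mirrors the inline comments, not generated code):
#
#	S = 2*in_y;		Zsqr = in_z^2;		M = Zsqr + in_x;
#	S = S^2;		Zsqr = in_x - Zsqr;	tmp0 = in_z*in_y;
#	res_z = 2*tmp0;		M = M*Zsqr;		M = 3*M;
#	tmp0 = S^2;		res_y = tmp0/2;		S = S*in_x;
#	tmp0 = 2*S;		res_x = M^2 - tmp0;	S = S - res_x;
#	S = S*M;		res_y = S - res_y;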
705 $code.=<<___;
706 #ifdef __PIC__
707 SPARC_PIC_THUNK(%g1)
708 #endif
709
710 .globl  ecp_nistz256_point_double
711 .align  32
712 ecp_nistz256_point_double:
713         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
714         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
715         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
716         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
717         be      ecp_nistz256_point_double_vis3
718         nop
719
720         save    %sp,-STACK_FRAME-32*4,%sp
721
722         mov     $rp,$rp_real
723         mov     $ap,$ap_real
724
725         ld      [$ap+32],@acc[0]
726         ld      [$ap+32+4],@acc[1]
727         ld      [$ap+32+8],@acc[2]
728         ld      [$ap+32+12],@acc[3]
729         ld      [$ap+32+16],@acc[4]
730         ld      [$ap+32+20],@acc[5]
731         ld      [$ap+32+24],@acc[6]
732         ld      [$ap+32+28],@acc[7]
733         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(S, in_y);
734         add     %sp,LOCALS+$S,$rp
735
736         add     $ap_real,64,$bp
737         add     $ap_real,64,$ap
738         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Zsqr, in_z);
739         add     %sp,LOCALS+$Zsqr,$rp
740
741         add     $ap_real,0,$bp
742         call    __ecp_nistz256_add      ! p256_add(M, Zsqr, in_x);
743         add     %sp,LOCALS+$M,$rp
744
745         add     %sp,LOCALS+$S,$bp
746         add     %sp,LOCALS+$S,$ap
747         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(S, S);
748         add     %sp,LOCALS+$S,$rp
749
750         ld      [$ap_real],@acc[0]
751         add     %sp,LOCALS+$Zsqr,$bp
752         ld      [$ap_real+4],@acc[1]
753         ld      [$ap_real+8],@acc[2]
754         ld      [$ap_real+12],@acc[3]
755         ld      [$ap_real+16],@acc[4]
756         ld      [$ap_real+20],@acc[5]
757         ld      [$ap_real+24],@acc[6]
758         ld      [$ap_real+28],@acc[7]
759         call    __ecp_nistz256_sub_from ! p256_sub(Zsqr, in_x, Zsqr);
760         add     %sp,LOCALS+$Zsqr,$rp
761
762         add     $ap_real,32,$bp
763         add     $ap_real,64,$ap
764         call    __ecp_nistz256_mul_mont ! p256_mul_mont(tmp0, in_z, in_y);
765         add     %sp,LOCALS+$tmp0,$rp
766
767         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(res_z, tmp0);
768         add     $rp_real,64,$rp
769
770         add     %sp,LOCALS+$Zsqr,$bp
771         add     %sp,LOCALS+$M,$ap
772         call    __ecp_nistz256_mul_mont ! p256_mul_mont(M, M, Zsqr);
773         add     %sp,LOCALS+$M,$rp
774
775         call    __ecp_nistz256_mul_by_3 ! p256_mul_by_3(M, M);
776         add     %sp,LOCALS+$M,$rp
777
778         add     %sp,LOCALS+$S,$bp
779         add     %sp,LOCALS+$S,$ap
780         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(tmp0, S);
781         add     %sp,LOCALS+$tmp0,$rp
782
783         call    __ecp_nistz256_div_by_2 ! p256_div_by_2(res_y, tmp0);
784         add     $rp_real,32,$rp
785
786         add     $ap_real,0,$bp
787         add     %sp,LOCALS+$S,$ap
788         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, in_x);
789         add     %sp,LOCALS+$S,$rp
790
791         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(tmp0, S);
792         add     %sp,LOCALS+$tmp0,$rp
793
794         add     %sp,LOCALS+$M,$bp
795         add     %sp,LOCALS+$M,$ap
796         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(res_x, M);
797         add     $rp_real,0,$rp
798
799         add     %sp,LOCALS+$tmp0,$bp
800         call    __ecp_nistz256_sub_from ! p256_sub(res_x, res_x, tmp0);
801         add     $rp_real,0,$rp
802
803         add     %sp,LOCALS+$S,$bp
804         call    __ecp_nistz256_sub_morf ! p256_sub(S, S, res_x);
805         add     %sp,LOCALS+$S,$rp
806
807         add     %sp,LOCALS+$M,$bp
808         add     %sp,LOCALS+$S,$ap
809         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S, S, M);
810         add     %sp,LOCALS+$S,$rp
811
812         add     $rp_real,32,$bp
813         call    __ecp_nistz256_sub_from ! p256_sub(res_y, S, res_y);
814         add     $rp_real,32,$rp
815
816         ret
817         restore
818 .size   ecp_nistz256_point_double,.-ecp_nistz256_point_double
819 ___
820 }
821
822 ########################################################################
823 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
824 #                             const P256_POINT *in2);
825 {
826 my ($res_x,$res_y,$res_z,
827     $H,$Hsqr,$R,$Rsqr,$Hcub,
828     $U1,$U2,$S1,$S2)=map(32*$_,(0..11));
829 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
830
831 # above map() describes stack layout with 12 temporary
832 # 256-bit vectors on top. Then we reserve some space for
833 # !in1infty, !in2infty, result of check for zero and return pointer.
834
835 my $bp_real=$rp_real;
836
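# Infinity inputs are detected up front by OR-ing together all x and y words
# and latching 0 or -1 into the !in2infty/!in1infty slots; the conditional
# move loop at the end then effectively performs (sketch, not generated code):
#
#	res = computed;
#	if (in1 == infinity) res = in2;
#	if (in2 == infinity) res = in1;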
837 $code.=<<___;
838 .globl  ecp_nistz256_point_add
839 .align  32
840 ecp_nistz256_point_add:
841         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
842         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
843         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
844         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
845         be      ecp_nistz256_point_add_vis3
846         nop
847
848         save    %sp,-STACK_FRAME-32*12-32,%sp
849
850         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
851         mov     $ap,$ap_real
852         mov     $bp,$bp_real
853
854         ld      [$bp],@acc[0]           ! in2_x
855         ld      [$bp+4],@acc[1]
856         ld      [$bp+8],@acc[2]
857         ld      [$bp+12],@acc[3]
858         ld      [$bp+16],@acc[4]
859         ld      [$bp+20],@acc[5]
860         ld      [$bp+24],@acc[6]
861         ld      [$bp+28],@acc[7]
862         ld      [$bp+32],$t0            ! in2_y
863         ld      [$bp+32+4],$t1
864         ld      [$bp+32+8],$t2
865         ld      [$bp+32+12],$t3
866         ld      [$bp+32+16],$t4
867         ld      [$bp+32+20],$t5
868         ld      [$bp+32+24],$t6
869         ld      [$bp+32+28],$t7
870         or      @acc[1],@acc[0],@acc[0]
871         or      @acc[3],@acc[2],@acc[2]
872         or      @acc[5],@acc[4],@acc[4]
873         or      @acc[7],@acc[6],@acc[6]
874         or      @acc[2],@acc[0],@acc[0]
875         or      @acc[6],@acc[4],@acc[4]
876         or      @acc[4],@acc[0],@acc[0]
877         or      $t1,$t0,$t0
878         or      $t3,$t2,$t2
879         or      $t5,$t4,$t4
880         or      $t7,$t6,$t6
881         or      $t2,$t0,$t0
882         or      $t6,$t4,$t4
883         or      $t4,$t0,$t0
884         or      @acc[0],$t0,$t0         ! !in2infty
885         movrnz  $t0,-1,$t0
886         st      $t0,[%fp+STACK_BIAS-12]
887
888         ld      [$ap],@acc[0]           ! in1_x
889         ld      [$ap+4],@acc[1]
890         ld      [$ap+8],@acc[2]
891         ld      [$ap+12],@acc[3]
892         ld      [$ap+16],@acc[4]
893         ld      [$ap+20],@acc[5]
894         ld      [$ap+24],@acc[6]
895         ld      [$ap+28],@acc[7]
896         ld      [$ap+32],$t0            ! in1_y
897         ld      [$ap+32+4],$t1
898         ld      [$ap+32+8],$t2
899         ld      [$ap+32+12],$t3
900         ld      [$ap+32+16],$t4
901         ld      [$ap+32+20],$t5
902         ld      [$ap+32+24],$t6
903         ld      [$ap+32+28],$t7
904         or      @acc[1],@acc[0],@acc[0]
905         or      @acc[3],@acc[2],@acc[2]
906         or      @acc[5],@acc[4],@acc[4]
907         or      @acc[7],@acc[6],@acc[6]
908         or      @acc[2],@acc[0],@acc[0]
909         or      @acc[6],@acc[4],@acc[4]
910         or      @acc[4],@acc[0],@acc[0]
911         or      $t1,$t0,$t0
912         or      $t3,$t2,$t2
913         or      $t5,$t4,$t4
914         or      $t7,$t6,$t6
915         or      $t2,$t0,$t0
916         or      $t6,$t4,$t4
917         or      $t4,$t0,$t0
918         or      @acc[0],$t0,$t0         ! !in1infty
919         movrnz  $t0,-1,$t0
920         st      $t0,[%fp+STACK_BIAS-16]
921
922         add     $bp_real,64,$bp
923         add     $bp_real,64,$ap
924         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z2sqr, in2_z);
925         add     %sp,LOCALS+$Z2sqr,$rp
926
927         add     $ap_real,64,$bp
928         add     $ap_real,64,$ap
929         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
930         add     %sp,LOCALS+$Z1sqr,$rp
931
932         add     $bp_real,64,$bp
933         add     %sp,LOCALS+$Z2sqr,$ap
934         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, Z2sqr, in2_z);
935         add     %sp,LOCALS+$S1,$rp
936
937         add     $ap_real,64,$bp
938         add     %sp,LOCALS+$Z1sqr,$ap
939         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
940         add     %sp,LOCALS+$S2,$rp
941
942         add     $ap_real,32,$bp
943         add     %sp,LOCALS+$S1,$ap
944         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S1, S1, in1_y);
945         add     %sp,LOCALS+$S1,$rp
946
947         add     $bp_real,32,$bp
948         add     %sp,LOCALS+$S2,$ap
949         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
950         add     %sp,LOCALS+$S2,$rp
951
952         add     %sp,LOCALS+$S1,$bp
953         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, S1);
954         add     %sp,LOCALS+$R,$rp
955
956         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
957         or      @acc[3],@acc[2],@acc[2]
958         or      @acc[5],@acc[4],@acc[4]
959         or      @acc[7],@acc[6],@acc[6]
960         or      @acc[2],@acc[0],@acc[0]
961         or      @acc[6],@acc[4],@acc[4]
962         or      @acc[4],@acc[0],@acc[0]
963         st      @acc[0],[%fp+STACK_BIAS-20]
964
965         add     $ap_real,0,$bp
966         add     %sp,LOCALS+$Z2sqr,$ap
967         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U1, in1_x, Z2sqr);
968         add     %sp,LOCALS+$U1,$rp
969
970         add     $bp_real,0,$bp
971         add     %sp,LOCALS+$Z1sqr,$ap
972         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in2_x, Z1sqr);
973         add     %sp,LOCALS+$U2,$rp
974
975         add     %sp,LOCALS+$U1,$bp
976         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, U1);
977         add     %sp,LOCALS+$H,$rp
978
979         or      @acc[1],@acc[0],@acc[0] ! see if result is zero
980         or      @acc[3],@acc[2],@acc[2]
981         or      @acc[5],@acc[4],@acc[4]
982         or      @acc[7],@acc[6],@acc[6]
983         or      @acc[2],@acc[0],@acc[0]
984         or      @acc[6],@acc[4],@acc[4]
985         orcc    @acc[4],@acc[0],@acc[0]
986
987         bne,pt  %icc,.Ladd_proceed      ! is_equal(U1,U2)?
988         nop
989
990         ld      [%fp+STACK_BIAS-12],$t0
991         ld      [%fp+STACK_BIAS-16],$t1
992         ld      [%fp+STACK_BIAS-20],$t2
993         andcc   $t0,$t1,%g0
994         be,pt   %icc,.Ladd_proceed      ! (in1infty || in2infty)?
995         nop
996         andcc   $t2,$t2,%g0
997         be,pt   %icc,.Ladd_proceed      ! is_equal(S1,S2)?
998         nop
999
1000         ldx     [%fp+STACK_BIAS-8],$rp
1001         st      %g0,[$rp]
1002         st      %g0,[$rp+4]
1003         st      %g0,[$rp+8]
1004         st      %g0,[$rp+12]
1005         st      %g0,[$rp+16]
1006         st      %g0,[$rp+20]
1007         st      %g0,[$rp+24]
1008         st      %g0,[$rp+28]
1009         st      %g0,[$rp+32]
1010         st      %g0,[$rp+32+4]
1011         st      %g0,[$rp+32+8]
1012         st      %g0,[$rp+32+12]
1013         st      %g0,[$rp+32+16]
1014         st      %g0,[$rp+32+20]
1015         st      %g0,[$rp+32+24]
1016         st      %g0,[$rp+32+28]
1017         st      %g0,[$rp+64]
1018         st      %g0,[$rp+64+4]
1019         st      %g0,[$rp+64+8]
1020         st      %g0,[$rp+64+12]
1021         st      %g0,[$rp+64+16]
1022         st      %g0,[$rp+64+20]
1023         st      %g0,[$rp+64+24]
1024         st      %g0,[$rp+64+28]
1025         b       .Ladd_done
1026         nop
1027
1028 .align  16
1029 .Ladd_proceed:
1030         add     %sp,LOCALS+$R,$bp
1031         add     %sp,LOCALS+$R,$ap
1032         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1033         add     %sp,LOCALS+$Rsqr,$rp
1034
1035         add     $ap_real,64,$bp
1036         add     %sp,LOCALS+$H,$ap
1037         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1038         add     %sp,LOCALS+$res_z,$rp
1039
1040         add     %sp,LOCALS+$H,$bp
1041         add     %sp,LOCALS+$H,$ap
1042         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1043         add     %sp,LOCALS+$Hsqr,$rp
1044
1045         add     $bp_real,64,$bp
1046         add     %sp,LOCALS+$res_z,$ap
1047         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, res_z, in2_z);
1048         add     %sp,LOCALS+$res_z,$rp
1049
1050         add     %sp,LOCALS+$H,$bp
1051         add     %sp,LOCALS+$Hsqr,$ap
1052         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1053         add     %sp,LOCALS+$Hcub,$rp
1054
1055         add     %sp,LOCALS+$U1,$bp
1056         add     %sp,LOCALS+$Hsqr,$ap
1057         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, U1, Hsqr);
1058         add     %sp,LOCALS+$U2,$rp
1059
1060         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1061         add     %sp,LOCALS+$Hsqr,$rp
1062
1063         add     %sp,LOCALS+$Rsqr,$bp
1064         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1065         add     %sp,LOCALS+$res_x,$rp
1066
1067         add     %sp,LOCALS+$Hcub,$bp
1068         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1069         add     %sp,LOCALS+$res_x,$rp
1070
1071         add     %sp,LOCALS+$U2,$bp
1072         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1073         add     %sp,LOCALS+$res_y,$rp
1074
1075         add     %sp,LOCALS+$Hcub,$bp
1076         add     %sp,LOCALS+$S1,$ap
1077         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S1, Hcub);
1078         add     %sp,LOCALS+$S2,$rp
1079
1080         add     %sp,LOCALS+$R,$bp
1081         add     %sp,LOCALS+$res_y,$ap
1082         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1083         add     %sp,LOCALS+$res_y,$rp
1084
1085         add     %sp,LOCALS+$S2,$bp
1086         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1087         add     %sp,LOCALS+$res_y,$rp
1088
1089         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1090         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1091         ldx     [%fp+STACK_BIAS-8],$rp
1092 ___
1093 for($i=0;$i<96;$i+=8) {                 # conditional moves
1094 $code.=<<___;
1095         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1096         ld      [%sp+LOCALS+$i+4],@acc[1]
1097         ld      [$bp_real+$i],@acc[2]           ! in2
1098         ld      [$bp_real+$i+4],@acc[3]
1099         ld      [$ap_real+$i],@acc[4]           ! in1
1100         ld      [$ap_real+$i+4],@acc[5]
1101         movrz   $t1,@acc[2],@acc[0]
1102         movrz   $t1,@acc[3],@acc[1]
1103         movrz   $t2,@acc[4],@acc[0]
1104         movrz   $t2,@acc[5],@acc[1]
1105         st      @acc[0],[$rp+$i]
1106         st      @acc[1],[$rp+$i+4]
1107 ___
1108 }
1109 $code.=<<___;
1110 .Ladd_done:
1111         ret
1112         restore
1113 .size   ecp_nistz256_point_add,.-ecp_nistz256_point_add
1114 ___
1115 }
1116
1117 ########################################################################
1118 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
1119 #                                    const P256_POINT_AFFINE *in2);
1120 {
1121 my ($res_x,$res_y,$res_z,
1122     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..9));
1123 my $Z1sqr = $S2;
1124 # above map() describes stack layout with 10 temporary
1125 # 256-bit vectors on top. Then we reserve some space for
1126 # !in1infty, !in2infty, result of check for zero and return pointer.
1127
1128 my @ONE_mont=(1,0,0,-1,-1,-1,-2,0);
1129 my $bp_real=$rp_real;
1130
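# @ONE_mont above is 1 in Montgomery representation, i.e. 2^256 mod P =
# 2^224 - 2^192 - 2^96 + 1 written as signed 32-bit words; it is substituted
# for the Z coordinate of the result when in1 is infinity, since the affine
# in2 carries an implicit Z of 1.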
1131 $code.=<<___;
1132 .globl  ecp_nistz256_point_add_affine
1133 .align  32
1134 ecp_nistz256_point_add_affine:
1135         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
1136         ld      [%g1],%g1               ! OPENSSL_sparcv9cap_P[0]
1137         and     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK),%g1
1138         cmp     %g1,(SPARCV9_VIS3|SPARCV9_64BIT_STACK)
1139         be      ecp_nistz256_point_add_affine_vis3
1140         nop
1141
1142         save    %sp,-STACK_FRAME-32*10-32,%sp
1143
1144         stx     $rp,[%fp+STACK_BIAS-8]  ! off-load $rp
1145         mov     $ap,$ap_real
1146         mov     $bp,$bp_real
1147
1148         ld      [$ap],@acc[0]           ! in1_x
1149         ld      [$ap+4],@acc[1]
1150         ld      [$ap+8],@acc[2]
1151         ld      [$ap+12],@acc[3]
1152         ld      [$ap+16],@acc[4]
1153         ld      [$ap+20],@acc[5]
1154         ld      [$ap+24],@acc[6]
1155         ld      [$ap+28],@acc[7]
1156         ld      [$ap+32],$t0            ! in1_y
1157         ld      [$ap+32+4],$t1
1158         ld      [$ap+32+8],$t2
1159         ld      [$ap+32+12],$t3
1160         ld      [$ap+32+16],$t4
1161         ld      [$ap+32+20],$t5
1162         ld      [$ap+32+24],$t6
1163         ld      [$ap+32+28],$t7
1164         or      @acc[1],@acc[0],@acc[0]
1165         or      @acc[3],@acc[2],@acc[2]
1166         or      @acc[5],@acc[4],@acc[4]
1167         or      @acc[7],@acc[6],@acc[6]
1168         or      @acc[2],@acc[0],@acc[0]
1169         or      @acc[6],@acc[4],@acc[4]
1170         or      @acc[4],@acc[0],@acc[0]
1171         or      $t1,$t0,$t0
1172         or      $t3,$t2,$t2
1173         or      $t5,$t4,$t4
1174         or      $t7,$t6,$t6
1175         or      $t2,$t0,$t0
1176         or      $t6,$t4,$t4
1177         or      $t4,$t0,$t0
1178         or      @acc[0],$t0,$t0         ! !in1infty
1179         movrnz  $t0,-1,$t0
1180         st      $t0,[%fp+STACK_BIAS-16]
1181
1182         ld      [$bp],@acc[0]           ! in2_x
1183         ld      [$bp+4],@acc[1]
1184         ld      [$bp+8],@acc[2]
1185         ld      [$bp+12],@acc[3]
1186         ld      [$bp+16],@acc[4]
1187         ld      [$bp+20],@acc[5]
1188         ld      [$bp+24],@acc[6]
1189         ld      [$bp+28],@acc[7]
1190         ld      [$bp+32],$t0            ! in2_y
1191         ld      [$bp+32+4],$t1
1192         ld      [$bp+32+8],$t2
1193         ld      [$bp+32+12],$t3
1194         ld      [$bp+32+16],$t4
1195         ld      [$bp+32+20],$t5
1196         ld      [$bp+32+24],$t6
1197         ld      [$bp+32+28],$t7
1198         or      @acc[1],@acc[0],@acc[0]
1199         or      @acc[3],@acc[2],@acc[2]
1200         or      @acc[5],@acc[4],@acc[4]
1201         or      @acc[7],@acc[6],@acc[6]
1202         or      @acc[2],@acc[0],@acc[0]
1203         or      @acc[6],@acc[4],@acc[4]
1204         or      @acc[4],@acc[0],@acc[0]
1205         or      $t1,$t0,$t0
1206         or      $t3,$t2,$t2
1207         or      $t5,$t4,$t4
1208         or      $t7,$t6,$t6
1209         or      $t2,$t0,$t0
1210         or      $t6,$t4,$t4
1211         or      $t4,$t0,$t0
1212         or      @acc[0],$t0,$t0         ! !in2infty
1213         movrnz  $t0,-1,$t0
1214         st      $t0,[%fp+STACK_BIAS-12]
1215
1216         add     $ap_real,64,$bp
1217         add     $ap_real,64,$ap
1218         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Z1sqr, in1_z);
1219         add     %sp,LOCALS+$Z1sqr,$rp
1220
1221         add     $bp_real,0,$bp
1222         add     %sp,LOCALS+$Z1sqr,$ap
1223         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, Z1sqr, in2_x);
1224         add     %sp,LOCALS+$U2,$rp
1225
1226         add     $ap_real,0,$bp
1227         call    __ecp_nistz256_sub_from ! p256_sub(H, U2, in1_x);
1228         add     %sp,LOCALS+$H,$rp
1229
1230         add     $ap_real,64,$bp
1231         add     %sp,LOCALS+$Z1sqr,$ap
1232         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, Z1sqr, in1_z);
1233         add     %sp,LOCALS+$S2,$rp
1234
1235         add     $ap_real,64,$bp
1236         add     %sp,LOCALS+$H,$ap
1237         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_z, H, in1_z);
1238         add     %sp,LOCALS+$res_z,$rp
1239
1240         add     $bp_real,32,$bp
1241         add     %sp,LOCALS+$S2,$ap
1242         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, S2, in2_y);
1243         add     %sp,LOCALS+$S2,$rp
1244
1245         add     $ap_real,32,$bp
1246         call    __ecp_nistz256_sub_from ! p256_sub(R, S2, in1_y);
1247         add     %sp,LOCALS+$R,$rp
1248
1249         add     %sp,LOCALS+$H,$bp
1250         add     %sp,LOCALS+$H,$ap
1251         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Hsqr, H);
1252         add     %sp,LOCALS+$Hsqr,$rp
1253
1254         add     %sp,LOCALS+$R,$bp
1255         add     %sp,LOCALS+$R,$ap
1256         call    __ecp_nistz256_mul_mont ! p256_sqr_mont(Rsqr, R);
1257         add     %sp,LOCALS+$Rsqr,$rp
1258
1259         add     %sp,LOCALS+$H,$bp
1260         add     %sp,LOCALS+$Hsqr,$ap
1261         call    __ecp_nistz256_mul_mont ! p256_mul_mont(Hcub, Hsqr, H);
1262         add     %sp,LOCALS+$Hcub,$rp
1263
1264         add     $ap_real,0,$bp
1265         add     %sp,LOCALS+$Hsqr,$ap
1266         call    __ecp_nistz256_mul_mont ! p256_mul_mont(U2, in1_x, Hsqr);
1267         add     %sp,LOCALS+$U2,$rp
1268
1269         call    __ecp_nistz256_mul_by_2 ! p256_mul_by_2(Hsqr, U2);
1270         add     %sp,LOCALS+$Hsqr,$rp
1271
1272         add     %sp,LOCALS+$Rsqr,$bp
1273         call    __ecp_nistz256_sub_morf ! p256_sub(res_x, Rsqr, Hsqr);
1274         add     %sp,LOCALS+$res_x,$rp
1275
1276         add     %sp,LOCALS+$Hcub,$bp
1277         call    __ecp_nistz256_sub_from !  p256_sub(res_x, res_x, Hcub);
1278         add     %sp,LOCALS+$res_x,$rp
1279
1280         add     %sp,LOCALS+$U2,$bp
1281         call    __ecp_nistz256_sub_morf ! p256_sub(res_y, U2, res_x);
1282         add     %sp,LOCALS+$res_y,$rp
1283
1284         add     $ap_real,32,$bp
1285         add     %sp,LOCALS+$Hcub,$ap
1286         call    __ecp_nistz256_mul_mont ! p256_mul_mont(S2, in1_y, Hcub);
1287         add     %sp,LOCALS+$S2,$rp
1288
1289         add     %sp,LOCALS+$R,$bp
1290         add     %sp,LOCALS+$res_y,$ap
1291         call    __ecp_nistz256_mul_mont ! p256_mul_mont(res_y, res_y, R);
1292         add     %sp,LOCALS+$res_y,$rp
1293
1294         add     %sp,LOCALS+$S2,$bp
1295         call    __ecp_nistz256_sub_from ! p256_sub(res_y, res_y, S2);
1296         add     %sp,LOCALS+$res_y,$rp
1297
1298         ld      [%fp+STACK_BIAS-16],$t1 ! !in1infty
1299         ld      [%fp+STACK_BIAS-12],$t2 ! !in2infty
1300         ldx     [%fp+STACK_BIAS-8],$rp
1301 ___
1302 for($i=0;$i<64;$i+=8) {                 # conditional moves
1303 $code.=<<___;
1304         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1305         ld      [%sp+LOCALS+$i+4],@acc[1]
1306         ld      [$bp_real+$i],@acc[2]           ! in2
1307         ld      [$bp_real+$i+4],@acc[3]
1308         ld      [$ap_real+$i],@acc[4]           ! in1
1309         ld      [$ap_real+$i+4],@acc[5]
1310         movrz   $t1,@acc[2],@acc[0]
1311         movrz   $t1,@acc[3],@acc[1]
1312         movrz   $t2,@acc[4],@acc[0]
1313         movrz   $t2,@acc[5],@acc[1]
1314         st      @acc[0],[$rp+$i]
1315         st      @acc[1],[$rp+$i+4]
1316 ___
1317 }
1318 for(;$i<96;$i+=8) {
1319 my $j=($i-64)/4;
1320 $code.=<<___;
1321         ld      [%sp+LOCALS+$i],@acc[0]         ! res
1322         ld      [%sp+LOCALS+$i+4],@acc[1]
1323         ld      [$ap_real+$i],@acc[4]           ! in1
1324         ld      [$ap_real+$i+4],@acc[5]
1325         movrz   $t1,@ONE_mont[$j],@acc[0]
1326         movrz   $t1,@ONE_mont[$j+1],@acc[1]
1327         movrz   $t2,@acc[4],@acc[0]
1328         movrz   $t2,@acc[5],@acc[1]
1329         st      @acc[0],[$rp+$i]
1330         st      @acc[1],[$rp+$i+4]
1331 ___
1332 }
1333 $code.=<<___;
1334         ret
1335         restore
1336 .size   ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine
1337 ___
1338 }                                                               }}}
1339 {{{
1340 my ($out,$inp,$index)=map("%i$_",(0..2));
1341 my $mask="%o0";
1342
1343 $code.=<<___;
1344 ! void  ecp_nistz256_scatter_w5(void *%i0,const P256_POINT *%i1,
1345 !                                         int %i2);
1346 .globl  ecp_nistz256_scatter_w5
1347 .align  32
1348 ecp_nistz256_scatter_w5:
1349         save    %sp,-STACK_FRAME,%sp
1350
1351         sll     $index,2,$index
1352         add     $out,$index,$out
1353
1354         ld      [$inp],%l0              ! X
1355         ld      [$inp+4],%l1
1356         ld      [$inp+8],%l2
1357         ld      [$inp+12],%l3
1358         ld      [$inp+16],%l4
1359         ld      [$inp+20],%l5
1360         ld      [$inp+24],%l6
1361         ld      [$inp+28],%l7
1362         add     $inp,32,$inp
1363         st      %l0,[$out+64*0-4]
1364         st      %l1,[$out+64*1-4]
1365         st      %l2,[$out+64*2-4]
1366         st      %l3,[$out+64*3-4]
1367         st      %l4,[$out+64*4-4]
1368         st      %l5,[$out+64*5-4]
1369         st      %l6,[$out+64*6-4]
1370         st      %l7,[$out+64*7-4]
1371         add     $out,64*8,$out
1372
1373         ld      [$inp],%l0              ! Y
1374         ld      [$inp+4],%l1
1375         ld      [$inp+8],%l2
1376         ld      [$inp+12],%l3
1377         ld      [$inp+16],%l4
1378         ld      [$inp+20],%l5
1379         ld      [$inp+24],%l6
1380         ld      [$inp+28],%l7
1381         add     $inp,32,$inp
1382         st      %l0,[$out+64*0-4]
1383         st      %l1,[$out+64*1-4]
1384         st      %l2,[$out+64*2-4]
1385         st      %l3,[$out+64*3-4]
1386         st      %l4,[$out+64*4-4]
1387         st      %l5,[$out+64*5-4]
1388         st      %l6,[$out+64*6-4]
1389         st      %l7,[$out+64*7-4]
1390         add     $out,64*8,$out
1391
1392         ld      [$inp],%l0              ! Z
1393         ld      [$inp+4],%l1
1394         ld      [$inp+8],%l2
1395         ld      [$inp+12],%l3
1396         ld      [$inp+16],%l4
1397         ld      [$inp+20],%l5
1398         ld      [$inp+24],%l6
1399         ld      [$inp+28],%l7
1400         st      %l0,[$out+64*0-4]
1401         st      %l1,[$out+64*1-4]
1402         st      %l2,[$out+64*2-4]
1403         st      %l3,[$out+64*3-4]
1404         st      %l4,[$out+64*4-4]
1405         st      %l5,[$out+64*5-4]
1406         st      %l6,[$out+64*6-4]
1407         st      %l7,[$out+64*7-4]
1408
1409         ret
1410         restore
1411 .size   ecp_nistz256_scatter_w5,.-ecp_nistz256_scatter_w5
1412
1413 ! void  ecp_nistz256_gather_w5(P256_POINT *%i0,const void *%i1,
1414 !                                              int %i2);
1415 .globl  ecp_nistz256_gather_w5
1416 .align  32
1417 ecp_nistz256_gather_w5:
1418         save    %sp,-STACK_FRAME,%sp
1419
1420         neg     $index,$mask
1421         srax    $mask,63,$mask
1422
1423         add     $index,$mask,$index
1424         sll     $index,2,$index
1425         add     $inp,$index,$inp
1426
1427         ld      [$inp+64*0],%l0
1428         ld      [$inp+64*1],%l1
1429         ld      [$inp+64*2],%l2
1430         ld      [$inp+64*3],%l3
1431         ld      [$inp+64*4],%l4
1432         ld      [$inp+64*5],%l5
1433         ld      [$inp+64*6],%l6
1434         ld      [$inp+64*7],%l7
1435         add     $inp,64*8,$inp
1436         and     %l0,$mask,%l0
1437         and     %l1,$mask,%l1
1438         st      %l0,[$out]              ! X
1439         and     %l2,$mask,%l2
1440         st      %l1,[$out+4]
1441         and     %l3,$mask,%l3
1442         st      %l2,[$out+8]
1443         and     %l4,$mask,%l4
1444         st      %l3,[$out+12]
1445         and     %l5,$mask,%l5
1446         st      %l4,[$out+16]
1447         and     %l6,$mask,%l6
1448         st      %l5,[$out+20]
1449         and     %l7,$mask,%l7
1450         st      %l6,[$out+24]
1451         st      %l7,[$out+28]
1452         add     $out,32,$out
1453
1454         ld      [$inp+64*0],%l0
1455         ld      [$inp+64*1],%l1
1456         ld      [$inp+64*2],%l2
1457         ld      [$inp+64*3],%l3
1458         ld      [$inp+64*4],%l4
1459         ld      [$inp+64*5],%l5
1460         ld      [$inp+64*6],%l6
1461         ld      [$inp+64*7],%l7
1462         add     $inp,64*8,$inp
1463         and     %l0,$mask,%l0
1464         and     %l1,$mask,%l1
1465         st      %l0,[$out]              ! Y
1466         and     %l2,$mask,%l2
1467         st      %l1,[$out+4]
1468         and     %l3,$mask,%l3
1469         st      %l2,[$out+8]
1470         and     %l4,$mask,%l4
1471         st      %l3,[$out+12]
1472         and     %l5,$mask,%l5
1473         st      %l4,[$out+16]
1474         and     %l6,$mask,%l6
1475         st      %l5,[$out+20]
1476         and     %l7,$mask,%l7
1477         st      %l6,[$out+24]
1478         st      %l7,[$out+28]
1479         add     $out,32,$out
1480
1481         ld      [$inp+64*0],%l0
1482         ld      [$inp+64*1],%l1
1483         ld      [$inp+64*2],%l2
1484         ld      [$inp+64*3],%l3
1485         ld      [$inp+64*4],%l4
1486         ld      [$inp+64*5],%l5
1487         ld      [$inp+64*6],%l6
1488         ld      [$inp+64*7],%l7
1489         and     %l0,$mask,%l0
1490         and     %l1,$mask,%l1
1491         st      %l0,[$out]              ! Z
1492         and     %l2,$mask,%l2
1493         st      %l1,[$out+4]
1494         and     %l3,$mask,%l3
1495         st      %l2,[$out+8]
1496         and     %l4,$mask,%l4
1497         st      %l3,[$out+12]
1498         and     %l5,$mask,%l5
1499         st      %l4,[$out+16]
1500         and     %l6,$mask,%l6
1501         st      %l5,[$out+20]
1502         and     %l7,$mask,%l7
1503         st      %l6,[$out+24]
1504         st      %l7,[$out+28]
1505
1506         ret
1507         restore
1508 .size   ecp_nistz256_gather_w5,.-ecp_nistz256_gather_w5
1509
1510 ! void  ecp_nistz256_scatter_w7(void *%i0,const P256_POINT_AFFINE *%i1,
1511 !                                         int %i2);
1512 .globl  ecp_nistz256_scatter_w7
1513 .align  32
1514 ecp_nistz256_scatter_w7:
1515         save    %sp,-STACK_FRAME,%sp
1516         nop
1517         add     $out,$index,$out
1518         mov     64/4,$index
1519 .Loop_scatter_w7:
1520         ld      [$inp],%l0
1521         add     $inp,4,$inp
1522         subcc   $index,1,$index
1523         stb     %l0,[$out+64*0-1]
1524         srl     %l0,8,%l1
1525         stb     %l1,[$out+64*1-1]
1526         srl     %l0,16,%l2
1527         stb     %l2,[$out+64*2-1]
1528         srl     %l0,24,%l3
1529         stb     %l3,[$out+64*3-1]
1530         bne     .Loop_scatter_w7
1531         add     $out,64*4,$out
1532
1533         ret
1534         restore
1535 .size   ecp_nistz256_scatter_w7,.-ecp_nistz256_scatter_w7
1536
1537 ! void  ecp_nistz256_gather_w7(P256_POINT_AFFINE *%i0,const void *%i1,
1538 !                                                     int %i2);
1539 .globl  ecp_nistz256_gather_w7
1540 .align  32
1541 ecp_nistz256_gather_w7:
1542         save    %sp,-STACK_FRAME,%sp
1543
1544         neg     $index,$mask
1545         srax    $mask,63,$mask
1546
1547         add     $index,$mask,$index
1548         add     $inp,$index,$inp
1549         mov     64/4,$index
1550
1551 .Loop_gather_w7:
1552         ldub    [$inp+64*0],%l0
1553         prefetch [$inp+3840+64*0],1
1554         subcc   $index,1,$index
1555         ldub    [$inp+64*1],%l1
1556         prefetch [$inp+3840+64*1],1
1557         ldub    [$inp+64*2],%l2
1558         prefetch [$inp+3840+64*2],1
1559         ldub    [$inp+64*3],%l3
1560         prefetch [$inp+3840+64*3],1
1561         add     $inp,64*4,$inp
1562         sll     %l1,8,%l1
1563         sll     %l2,16,%l2
1564         or      %l0,%l1,%l0
1565         sll     %l3,24,%l3
1566         or      %l0,%l2,%l0
1567         or      %l0,%l3,%l0
1568         and     %l0,$mask,%l0
1569         st      %l0,[$out]
1570         bne     .Loop_gather_w7
1571         add     $out,4,$out
1572
1573         ret
1574         restore
1575 .size   ecp_nistz256_gather_w7,.-ecp_nistz256_gather_w7
1576 ___
1577 }}}
1578 {{{
1579 ########################################################################
1580 # The following subroutines are VIS3 counterparts of those above, which in
1581 # turn implement the ones found in ecp_nistz256.c. The key difference is
1582 # that they use 128-bit multiplication and addition with 64-bit carry, and
1583 # in order to do that they convert from uint32_t[8] to uint64_t[4] upon
1584 # entry and back on return.
1585 #
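# As an illustration only (comment, not generated code), the conversion
# performed on entry is equivalent to
#
#	for (i = 0; i < 4; i++)
#		a[i] = (uint64_t)in[2*i] | ((uint64_t)in[2*i+1] << 32);
#
# with the inverse split back into 32-bit words on the way out.
#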
1586 my ($rp,$ap,$bp)=map("%i$_",(0..2));
1587 my ($t0,$t1,$t2,$t3,$a0,$a1,$a2,$a3)=map("%l$_",(0..7));
1588 my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5)=map("%o$_",(0..5));
1589 my ($bi,$poly1,$poly3,$minus1)=(map("%i$_",(3..5)),"%g1");
1590 my ($rp_real,$ap_real)=("%g2","%g3");
1591 my ($acc6,$acc7)=($bp,$bi);     # used in squaring
1592
1593 $code.=<<___;
1594 .align  32
1595 __ecp_nistz256_mul_by_2_vis3:
1596         addcc   $acc0,$acc0,$acc0
1597         addxccc $acc1,$acc1,$acc1
1598         addxccc $acc2,$acc2,$acc2
1599         addxccc $acc3,$acc3,$acc3
1600         b       .Lreduce_by_sub_vis3
1601         addxc   %g0,%g0,$acc4           ! did it carry?
1602 .size   __ecp_nistz256_mul_by_2_vis3,.-__ecp_nistz256_mul_by_2_vis3
1603
1604 .align  32
1605 __ecp_nistz256_add_vis3:
1606         ldx     [$bp+0],$t0
1607         ldx     [$bp+8],$t1
1608         ldx     [$bp+16],$t2
1609         ldx     [$bp+24],$t3
1610
1611 __ecp_nistz256_add_noload_vis3:
1612
1613         addcc   $t0,$acc0,$acc0
1614         addxccc $t1,$acc1,$acc1
1615         addxccc $t2,$acc2,$acc2
1616         addxccc $t3,$acc3,$acc3
1617         addxc   %g0,%g0,$acc4           ! did it carry?
1618
1619 .Lreduce_by_sub_vis3:
1620
1621         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1622         addxccc $acc1,$poly1,$t1
1623         addxccc $acc2,$minus1,$t2
1624         addxc   $acc3,$poly3,$t3
1625
1626         movrnz  $acc4,$t0,$acc0         ! if a+b carried, ret = ret-mod
1627         movrnz  $acc4,$t1,$acc1
1628         stx     $acc0,[$rp]
1629         movrnz  $acc4,$t2,$acc2
1630         stx     $acc1,[$rp+8]
1631         movrnz  $acc4,$t3,$acc3
1632         stx     $acc2,[$rp+16]
1633         retl
1634         stx     $acc3,[$rp+24]
1635 .size   __ecp_nistz256_add_vis3,.-__ecp_nistz256_add_vis3
1636
1637 ! Trouble with subtraction is that there is no subtraction with 64-bit
1638 ! borrow, only with 32-bit one. For this reason we "decompose" 64-bit
1639 ! $acc0-$acc3 to 32-bit values and pick b[4] in 32-bit pieces. But
1640 ! recall that SPARC is big-endian, which is why you'll observe that
1641 ! b[4] is accessed as 4-0-12-8-20-16-28-24. And prior to reduction we
1642 ! "collect" result back to 64-bit $acc0-$acc3.
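! Schematically: srlx splits each 64-bit limb into its two 32-bit halves,
! subcc/subccc chain the 32-bit borrow through all eight halves, and the
! halves are masked and glued back together with and/sllx/or before the
! conditional reduction.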
1643 .align  32
1644 __ecp_nistz256_sub_from_vis3:
1645         ld      [$bp+4],$t0
1646         ld      [$bp+0],$t1
1647         ld      [$bp+12],$t2
1648         ld      [$bp+8],$t3
1649
1650         srlx    $acc0,32,$acc4
1651         not     $poly1,$poly1
1652         srlx    $acc1,32,$acc5
1653         subcc   $acc0,$t0,$acc0
1654         ld      [$bp+20],$t0
1655         subccc  $acc4,$t1,$acc4
1656         ld      [$bp+16],$t1
1657         subccc  $acc1,$t2,$acc1
1658         ld      [$bp+28],$t2
1659         and     $acc0,$poly1,$acc0
1660         subccc  $acc5,$t3,$acc5
1661         ld      [$bp+24],$t3
1662         sllx    $acc4,32,$acc4
1663         and     $acc1,$poly1,$acc1
1664         sllx    $acc5,32,$acc5
1665         or      $acc0,$acc4,$acc0
1666         srlx    $acc2,32,$acc4
1667         or      $acc1,$acc5,$acc1
1668         srlx    $acc3,32,$acc5
1669         subccc  $acc2,$t0,$acc2
1670         subccc  $acc4,$t1,$acc4
1671         subccc  $acc3,$t2,$acc3
1672         and     $acc2,$poly1,$acc2
1673         subccc  $acc5,$t3,$acc5
1674         sllx    $acc4,32,$acc4
1675         and     $acc3,$poly1,$acc3
1676         sllx    $acc5,32,$acc5
1677         or      $acc2,$acc4,$acc2
1678         subc    %g0,%g0,$acc4           ! did it borrow?
1679         b       .Lreduce_by_add_vis3
1680         or      $acc3,$acc5,$acc3
1681 .size   __ecp_nistz256_sub_from_vis3,.-__ecp_nistz256_sub_from_vis3
1682
1683 .align  32
1684 __ecp_nistz256_sub_morf_vis3:
1685         ld      [$bp+4],$t0
1686         ld      [$bp+0],$t1
1687         ld      [$bp+12],$t2
1688         ld      [$bp+8],$t3
1689
1690         srlx    $acc0,32,$acc4
1691         not     $poly1,$poly1
1692         srlx    $acc1,32,$acc5
1693         subcc   $t0,$acc0,$acc0
1694         ld      [$bp+20],$t0
1695         subccc  $t1,$acc4,$acc4
1696         ld      [$bp+16],$t1
1697         subccc  $t2,$acc1,$acc1
1698         ld      [$bp+28],$t2
1699         and     $acc0,$poly1,$acc0
1700         subccc  $t3,$acc5,$acc5
1701         ld      [$bp+24],$t3
1702         sllx    $acc4,32,$acc4
1703         and     $acc1,$poly1,$acc1
1704         sllx    $acc5,32,$acc5
1705         or      $acc0,$acc4,$acc0
1706         srlx    $acc2,32,$acc4
1707         or      $acc1,$acc5,$acc1
1708         srlx    $acc3,32,$acc5
1709         subccc  $t0,$acc2,$acc2
1710         subccc  $t1,$acc4,$acc4
1711         subccc  $t2,$acc3,$acc3
1712         and     $acc2,$poly1,$acc2
1713         subccc  $t3,$acc5,$acc5
1714         sllx    $acc4,32,$acc4
1715         and     $acc3,$poly1,$acc3
1716         sllx    $acc5,32,$acc5
1717         or      $acc2,$acc4,$acc2
1718         subc    %g0,%g0,$acc4           ! did it borrow?
1719         or      $acc3,$acc5,$acc3
1720
1721 .Lreduce_by_add_vis3:
1722
1723         addcc   $acc0,-1,$t0            ! add modulus
1724         not     $poly3,$t3
1725         addxccc $acc1,$poly1,$t1
1726         not     $poly1,$poly1           ! restore $poly1
1727         addxccc $acc2,%g0,$t2
1728         addxc   $acc3,$t3,$t3
1729
1730         movrnz  $acc4,$t0,$acc0         ! if a-b borrowed, ret = ret+mod
1731         movrnz  $acc4,$t1,$acc1
1732         stx     $acc0,[$rp]
1733         movrnz  $acc4,$t2,$acc2
1734         stx     $acc1,[$rp+8]
1735         movrnz  $acc4,$t3,$acc3
1736         stx     $acc2,[$rp+16]
1737         retl
1738         stx     $acc3,[$rp+24]
1739 .size   __ecp_nistz256_sub_morf_vis3,.-__ecp_nistz256_sub_morf_vis3
1740
1741 .align  32
1742 __ecp_nistz256_div_by_2_vis3:
1743         ! ret = (a is odd ? a+mod : a) >> 1
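        ! (the modulus is odd, so adding it to an odd value gives an even
        !  value congruent to it mod P, and the shift is then an exact
        !  halving; the saved carry bit is shifted back in at the top)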
1744
1745         not     $poly1,$t1
1746         not     $poly3,$t3
1747         and     $acc0,1,$acc5
1748         addcc   $acc0,-1,$t0            ! add modulus
1749         addxccc $acc1,$t1,$t1
1750         addxccc $acc2,%g0,$t2
1751         addxccc $acc3,$t3,$t3
1752         addxc   %g0,%g0,$acc4           ! carry bit
1753
1754         movrnz  $acc5,$t0,$acc0
1755         movrnz  $acc5,$t1,$acc1
1756         movrnz  $acc5,$t2,$acc2
1757         movrnz  $acc5,$t3,$acc3
1758         movrz   $acc5,%g0,$acc4
1759
1760         ! ret >>= 1
1761
1762         srlx    $acc0,1,$acc0
1763         sllx    $acc1,63,$t0
1764         srlx    $acc1,1,$acc1
1765         or      $acc0,$t0,$acc0
1766         sllx    $acc2,63,$t1
1767         srlx    $acc2,1,$acc2
1768         or      $acc1,$t1,$acc1
1769         sllx    $acc3,63,$t2
1770         stx     $acc0,[$rp]
1771         srlx    $acc3,1,$acc3
1772         or      $acc2,$t2,$acc2
1773         sllx    $acc4,63,$t3            ! don't forget carry bit
1774         stx     $acc1,[$rp+8]
1775         or      $acc3,$t3,$acc3
1776         stx     $acc2,[$rp+16]
1777         retl
1778         stx     $acc3,[$rp+24]
1779 .size   __ecp_nistz256_div_by_2_vis3,.-__ecp_nistz256_div_by_2_vis3
1780
1781 ! compared to __ecp_nistz256_mul_mont it's almost 4x smaller and
1782 ! 4x faster [on T4]...
1783 .align  32
1784 __ecp_nistz256_mul_mont_vis3:
1785         mulx    $a0,$bi,$acc0
1786         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1787         umulxhi $a0,$bi,$t0
1788         mulx    $a1,$bi,$acc1
1789         umulxhi $a1,$bi,$t1
1790         mulx    $a2,$bi,$acc2
1791         umulxhi $a2,$bi,$t2
1792         mulx    $a3,$bi,$acc3
1793         umulxhi $a3,$bi,$t3
1794         ldx     [$bp+8],$bi             ! b[1]
1795
1796         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication
1797          sllx   $acc0,32,$t0
1798         addxccc $acc2,$t1,$acc2
1799          srlx   $acc0,32,$t1
1800         addxccc $acc3,$t2,$acc3
1801         addxc   %g0,$t3,$acc4
1802         mov     0,$acc5
1803 ___
1804 for($i=1;$i<4;$i++) {
1805         # Reduction iteration is normally performed by accumulating
1806         # result of multiplication of modulus by "magic" digit [and
1807         # omitting least significant word, which is guaranteed to
1808         # be 0], but thanks to special form of modulus and "magic"
1809         # digit being equal to least significant word, it can be
1810         # performed with additions and subtractions alone. Indeed:
1811         #
1812         #            ffff0001.00000000.0000ffff.ffffffff
1813         # *                                     abcdefgh
1814         # + xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1815         #
1816         # Now observing that ff..ff*x = (2^n-1)*x = 2^n*x-x, we
1817         # rewrite above as:
1818         #
1819         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.abcdefgh
1820         # + abcdefgh.abcdefgh.0000abcd.efgh0000.00000000
1821         # - 0000abcd.efgh0000.00000000.00000000.abcdefgh
1822         #
1823         # or marking redundant operations:
1824         #
1825         #   xxxxxxxx.xxxxxxxx.xxxxxxxx.xxxxxxxx.--------
1826         # + abcdefgh.abcdefgh.0000abcd.efgh0000.--------
1827         # - 0000abcd.efgh0000.--------.--------.--------
1828         #   ^^^^^^^^ but this word is calculated with umulxhi, because
1829         #            there is no subtract with 64-bit borrow:-(
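        # As a sanity check, the two extra rows add up to
        # abcdefgh*(2^256-2^224+2^192+2^96-1), i.e. exactly abcdefgh*modulus,
        # so the value is unchanged mod P while its least significant word
        # becomes zero and can be dropped.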
1830
1831 $code.=<<___;
1832         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1833         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1834         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1835         mulx    $a0,$bi,$t0
1836         addxccc $acc2,$t1,$acc1
1837         mulx    $a1,$bi,$t1
1838         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1839         mulx    $a2,$bi,$t2
1840         addxccc $acc4,$t3,$acc3
1841         mulx    $a3,$bi,$t3
1842         addxc   $acc5,%g0,$acc4
1843
1844         addcc   $acc0,$t0,$acc0         ! accumulate low parts of multiplication
1845         umulxhi $a0,$bi,$t0
1846         addxccc $acc1,$t1,$acc1
1847         umulxhi $a1,$bi,$t1
1848         addxccc $acc2,$t2,$acc2
1849         umulxhi $a2,$bi,$t2
1850         addxccc $acc3,$t3,$acc3
1851         umulxhi $a3,$bi,$t3
1852         addxc   $acc4,%g0,$acc4
1853 ___
1854 $code.=<<___    if ($i<3);
1855         ldx     [$bp+8*($i+1)],$bi      ! bp[$i+1]
1856 ___
1857 $code.=<<___;
1858         addcc   $acc1,$t0,$acc1         ! accumulate high parts of multiplication 
1859          sllx   $acc0,32,$t0
1860         addxccc $acc2,$t1,$acc2
1861          srlx   $acc0,32,$t1
1862         addxccc $acc3,$t2,$acc3
1863         addxccc $acc4,$t3,$acc4
1864         addxc   %g0,%g0,$acc5
1865 ___
1866 }
1867 $code.=<<___;
1868         sub     $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1869         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1870         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1871         addxccc $acc2,$t1,$acc1
1872         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1873         addxccc $acc4,$t3,$acc3
1874         b       .Lmul_final_vis3        ! see below
1875         addxc   $acc5,%g0,$acc4
1876 .size   __ecp_nistz256_mul_mont_vis3,.-__ecp_nistz256_mul_mont_vis3
1877
1878 ! compared to above __ecp_nistz256_mul_mont_vis3 it's 21% less
1879 ! instructions, but only 14% faster [on T4]...
1880 .align  32
1881 __ecp_nistz256_sqr_mont_vis3:
1882         !  |  |  |  |  |  |a1*a0|  |
1883         !  |  |  |  |  |a2*a0|  |  |
1884         !  |  |a3*a2|a3*a0|  |  |  |
1885         !  |  |  |  |a2*a1|  |  |  |
1886         !  |  |  |a3*a1|  |  |  |  |
1887         ! *|  |  |  |  |  |  |  | 2|
1888         ! +|a3*a3|a2*a2|a1*a1|a0*a0|
1889         !  |--+--+--+--+--+--+--+--|
1890         !  |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is \$accx, i.e. follow \$accx
1891         !
1892         !  the "can't overflow" notes below mark carrying into high part of
1893         !  multiplication result, which can't overflow, because it
1894         !  can never be all ones.
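        !  In other words: the off-diagonal products a[i]*a[j], i<j, are
        !  summed once, doubled by the "acc[1-6]*=2" step, and the diagonal
        !  squares a[i]*a[i] are then added on top.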
1895
1896         mulx    $a1,$a0,$acc1           ! a[1]*a[0]
1897         umulxhi $a1,$a0,$t1
1898         mulx    $a2,$a0,$acc2           ! a[2]*a[0]
1899         umulxhi $a2,$a0,$t2
1900         mulx    $a3,$a0,$acc3           ! a[3]*a[0]
1901         umulxhi $a3,$a0,$acc4
1902
1903         addcc   $acc2,$t1,$acc2         ! accumulate high parts of multiplication
1904         mulx    $a2,$a1,$t0             ! a[2]*a[1]
1905         umulxhi $a2,$a1,$t1
1906         addxccc $acc3,$t2,$acc3
1907         mulx    $a3,$a1,$t2             ! a[3]*a[1]
1908         umulxhi $a3,$a1,$t3
1909         addxc   $acc4,%g0,$acc4         ! can't overflow
1910
1911         mulx    $a3,$a2,$acc5           ! a[3]*a[2]
1912         not     $poly3,$poly3           ! 0xFFFFFFFF00000001
1913         umulxhi $a3,$a2,$acc6
1914
1915         addcc   $t2,$t1,$t1             ! accumulate high parts of multiplication
1916         mulx    $a0,$a0,$acc0           ! a[0]*a[0]
1917         addxc   $t3,%g0,$t2             ! can't overflow
1918
1919         addcc   $acc3,$t0,$acc3         ! accumulate low parts of multiplication
1920         umulxhi $a0,$a0,$a0
1921         addxccc $acc4,$t1,$acc4
1922         mulx    $a1,$a1,$t1             ! a[1]*a[1]
1923         addxccc $acc5,$t2,$acc5
1924         umulxhi $a1,$a1,$a1
1925         addxc   $acc6,%g0,$acc6         ! can't overflow
1926
1927         addcc   $acc1,$acc1,$acc1       ! acc[1-6]*=2
1928         mulx    $a2,$a2,$t2             ! a[2]*a[2]
1929         addxccc $acc2,$acc2,$acc2
1930         umulxhi $a2,$a2,$a2
1931         addxccc $acc3,$acc3,$acc3
1932         mulx    $a3,$a3,$t3             ! a[3]*a[3]
1933         addxccc $acc4,$acc4,$acc4
1934         umulxhi $a3,$a3,$a3
1935         addxccc $acc5,$acc5,$acc5
1936         addxccc $acc6,$acc6,$acc6
1937         addxc   %g0,%g0,$acc7
1938
1939         addcc   $acc1,$a0,$acc1         ! +a[i]*a[i]
1940         addxccc $acc2,$t1,$acc2
1941         addxccc $acc3,$a1,$acc3
1942         addxccc $acc4,$t2,$acc4
1943          sllx   $acc0,32,$t0
1944         addxccc $acc5,$a2,$acc5
1945          srlx   $acc0,32,$t1
1946         addxccc $acc6,$t3,$acc6
1947          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1948         addxc   $acc7,$a3,$acc7
1949 ___
1950 for($i=0;$i<3;$i++) {                   # reductions, see commentary
1951                                         # in multiplication for details
1952 $code.=<<___;
1953         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1954         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1955          sllx   $acc0,32,$t0
1956         addxccc $acc2,$t1,$acc1
1957          srlx   $acc0,32,$t1
1958         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1959          sub    $acc0,$t0,$t2           ! acc0*0xFFFFFFFF00000001, low part
1960         addxc   %g0,$t3,$acc3           ! can't overflow
1961 ___
1962 }
1963 $code.=<<___;
1964         umulxhi $acc0,$poly3,$t3        ! acc0*0xFFFFFFFF00000001, high part
1965         addcc   $acc1,$t0,$acc0         ! +=acc[0]<<96 and omit acc[0]
1966         addxccc $acc2,$t1,$acc1
1967         addxccc $acc3,$t2,$acc2         ! +=acc[0]*0xFFFFFFFF00000001
1968         addxc   %g0,$t3,$acc3           ! can't overflow
1969
1970         addcc   $acc0,$acc4,$acc0       ! accumulate upper half
1971         addxccc $acc1,$acc5,$acc1
1972         addxccc $acc2,$acc6,$acc2
1973         addxccc $acc3,$acc7,$acc3
1974         addxc   %g0,%g0,$acc4
1975
1976 .Lmul_final_vis3:
1977
1978         ! Final step is "if result >= mod, subtract mod", but as comparison
1979         ! means subtraction, we do the subtraction and then copy outcome
1980         ! if it didn't borrow. But note that as we [have to] replace
1981         ! subtraction with addition with negative, carry/borrow logic is
1982         ! inverse.
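        ! (the constant added is 2^256-modulus extended through the spare
        !  top word, so the final carry is set exactly when the result is
        !  >= modulus, which is what movcs tests)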
1983
1984         addcc   $acc0,1,$t0             ! add -modulus, i.e. subtract
1985         not     $poly3,$poly3           ! restore 0x00000000FFFFFFFE
1986         addxccc $acc1,$poly1,$t1
1987         addxccc $acc2,$minus1,$t2
1988         addxccc $acc3,$poly3,$t3
1989         addxccc $acc4,$minus1,%g0       ! did it carry?
1990
1991         movcs   %xcc,$t0,$acc0
1992         movcs   %xcc,$t1,$acc1
1993         stx     $acc0,[$rp]
1994         movcs   %xcc,$t2,$acc2
1995         stx     $acc1,[$rp+8]
1996         movcs   %xcc,$t3,$acc3
1997         stx     $acc2,[$rp+16]
1998         retl
1999         stx     $acc3,[$rp+24]
2000 .size   __ecp_nistz256_sqr_mont_vis3,.-__ecp_nistz256_sqr_mont_vis3
2001 ___
2002
2003 ########################################################################
2004 # void ecp_nistz256_point_double(P256_POINT *out,const P256_POINT *inp);
2005 #
2006 {
2007 my ($res_x,$res_y,$res_z,
2008     $in_x,$in_y,$in_z,
2009     $S,$M,$Zsqr,$tmp0)=map(32*$_,(0..9));
2010 # above map() describes stack layout with 10 temporary
2011 # 256-bit vectors on top.
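# For reference the offsets come out as res_x=0, res_y=32, res_z=64,
# in_x=96, in_y=128, in_z=160, S=192, M=224, Zsqr=256 and tmp0=288,
# which is why the save below grows the frame by exactly 32*10 bytes.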
2012
2013 $code.=<<___;
2014 .align  32
2015 ecp_nistz256_point_double_vis3:
2016         save    %sp,-STACK64_FRAME-32*10,%sp
2017
2018         mov     $rp,$rp_real
2019         mov     -1,$minus1
2020         mov     -2,$poly3
2021         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2022         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2023
2024         ! convert input to uint64_t[4]
2025         ld      [$ap],$a0                       ! in_x
2026         ld      [$ap+4],$t0
2027         ld      [$ap+8],$a1
2028         ld      [$ap+12],$t1
2029         ld      [$ap+16],$a2
2030         ld      [$ap+20],$t2
2031         ld      [$ap+24],$a3
2032         ld      [$ap+28],$t3
2033         sllx    $t0,32,$t0
2034         sllx    $t1,32,$t1
2035         ld      [$ap+32],$acc0                  ! in_y
2036         or      $a0,$t0,$a0
2037         ld      [$ap+32+4],$t0
2038         sllx    $t2,32,$t2
2039         ld      [$ap+32+8],$acc1
2040         or      $a1,$t1,$a1
2041         ld      [$ap+32+12],$t1
2042         sllx    $t3,32,$t3
2043         ld      [$ap+32+16],$acc2
2044         or      $a2,$t2,$a2
2045         ld      [$ap+32+20],$t2
2046         or      $a3,$t3,$a3
2047         ld      [$ap+32+24],$acc3
2048         sllx    $t0,32,$t0
2049         ld      [$ap+32+28],$t3
2050         sllx    $t1,32,$t1
2051         stx     $a0,[%sp+LOCALS64+$in_x]
2052         sllx    $t2,32,$t2
2053         stx     $a1,[%sp+LOCALS64+$in_x+8]
2054         sllx    $t3,32,$t3
2055         stx     $a2,[%sp+LOCALS64+$in_x+16]
2056         or      $acc0,$t0,$acc0
2057         stx     $a3,[%sp+LOCALS64+$in_x+24]
2058         or      $acc1,$t1,$acc1
2059         stx     $acc0,[%sp+LOCALS64+$in_y]
2060         or      $acc2,$t2,$acc2
2061         stx     $acc1,[%sp+LOCALS64+$in_y+8]
2062         or      $acc3,$t3,$acc3
2063         stx     $acc2,[%sp+LOCALS64+$in_y+16]
2064         stx     $acc3,[%sp+LOCALS64+$in_y+24]
2065
2066         ld      [$ap+64],$a0                    ! in_z
2067         ld      [$ap+64+4],$t0
2068         ld      [$ap+64+8],$a1
2069         ld      [$ap+64+12],$t1
2070         ld      [$ap+64+16],$a2
2071         ld      [$ap+64+20],$t2
2072         ld      [$ap+64+24],$a3
2073         ld      [$ap+64+28],$t3
2074         sllx    $t0,32,$t0
2075         sllx    $t1,32,$t1
2076         or      $a0,$t0,$a0
2077         sllx    $t2,32,$t2
2078         or      $a1,$t1,$a1
2079         sllx    $t3,32,$t3
2080         or      $a2,$t2,$a2
2081         or      $a3,$t3,$a3
2082         sllx    $t0,32,$t0
2083         sllx    $t1,32,$t1
2084         stx     $a0,[%sp+LOCALS64+$in_z]
2085         sllx    $t2,32,$t2
2086         stx     $a1,[%sp+LOCALS64+$in_z+8]
2087         sllx    $t3,32,$t3
2088         stx     $a2,[%sp+LOCALS64+$in_z+16]
2089         stx     $a3,[%sp+LOCALS64+$in_z+24]
2090
2091         ! in_y is still in $acc0-$acc3
2092         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(S, in_y);
2093         add     %sp,LOCALS64+$S,$rp
2094
2095         ! in_z is still in $a0-$a3
2096         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Zsqr, in_z);
2097         add     %sp,LOCALS64+$Zsqr,$rp
2098
2099         mov     $acc0,$a0                       ! put Zsqr aside
2100         mov     $acc1,$a1
2101         mov     $acc2,$a2
2102         mov     $acc3,$a3
2103
2104         add     %sp,LOCALS64+$in_x,$bp
2105         call    __ecp_nistz256_add_vis3         ! p256_add(M, Zsqr, in_x);
2106         add     %sp,LOCALS64+$M,$rp
2107
2108         mov     $a0,$acc0                       ! restore Zsqr
2109         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2110         mov     $a1,$acc1
2111         ldx     [%sp+LOCALS64+$S+8],$a1
2112         mov     $a2,$acc2
2113         ldx     [%sp+LOCALS64+$S+16],$a2
2114         mov     $a3,$acc3
2115         ldx     [%sp+LOCALS64+$S+24],$a3
2116
2117         add     %sp,LOCALS64+$in_x,$bp
2118         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(Zsqr, in_x, Zsqr);
2119         add     %sp,LOCALS64+$Zsqr,$rp
2120
2121         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(S, S);
2122         add     %sp,LOCALS64+$S,$rp
2123
2124         ldx     [%sp+LOCALS64+$in_z],$bi
2125         ldx     [%sp+LOCALS64+$in_y],$a0
2126         ldx     [%sp+LOCALS64+$in_y+8],$a1
2127         ldx     [%sp+LOCALS64+$in_y+16],$a2
2128         ldx     [%sp+LOCALS64+$in_y+24],$a3
2129         add     %sp,LOCALS64+$in_z,$bp
2130         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(tmp0, in_z, in_y);
2131         add     %sp,LOCALS64+$tmp0,$rp
2132
2133         ldx     [%sp+LOCALS64+$M],$bi           ! forward load
2134         ldx     [%sp+LOCALS64+$Zsqr],$a0
2135         ldx     [%sp+LOCALS64+$Zsqr+8],$a1
2136         ldx     [%sp+LOCALS64+$Zsqr+16],$a2
2137         ldx     [%sp+LOCALS64+$Zsqr+24],$a3
2138
2139         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(res_z, tmp0);
2140         add     %sp,LOCALS64+$res_z,$rp
2141
2142         add     %sp,LOCALS64+$M,$bp
2143         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(M, M, Zsqr);
2144         add     %sp,LOCALS64+$M,$rp
2145
2146         mov     $acc0,$a0                       ! put aside M
2147         mov     $acc1,$a1
2148         mov     $acc2,$a2
2149         mov     $acc3,$a3
2150         call    __ecp_nistz256_mul_by_2_vis3
2151         add     %sp,LOCALS64+$M,$rp
2152         mov     $a0,$t0                         ! copy M
2153         ldx     [%sp+LOCALS64+$S],$a0           ! forward load
2154         mov     $a1,$t1
2155         ldx     [%sp+LOCALS64+$S+8],$a1
2156         mov     $a2,$t2
2157         ldx     [%sp+LOCALS64+$S+16],$a2
2158         mov     $a3,$t3
2159         ldx     [%sp+LOCALS64+$S+24],$a3
2160         call    __ecp_nistz256_add_noload_vis3  ! p256_mul_by_3(M, M);
2161         add     %sp,LOCALS64+$M,$rp
2162
2163         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(tmp0, S);
2164         add     %sp,LOCALS64+$tmp0,$rp
2165
2166         ldx     [%sp+LOCALS64+$S],$bi           ! forward load
2167         ldx     [%sp+LOCALS64+$in_x],$a0
2168         ldx     [%sp+LOCALS64+$in_x+8],$a1
2169         ldx     [%sp+LOCALS64+$in_x+16],$a2
2170         ldx     [%sp+LOCALS64+$in_x+24],$a3
2171
2172         call    __ecp_nistz256_div_by_2_vis3    ! p256_div_by_2(res_y, tmp0);
2173         add     %sp,LOCALS64+$res_y,$rp
2174
2175         add     %sp,LOCALS64+$S,$bp
2176         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, in_x);
2177         add     %sp,LOCALS64+$S,$rp
2178
2179         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2180         ldx     [%sp+LOCALS64+$M+8],$a1
2181         ldx     [%sp+LOCALS64+$M+16],$a2
2182         ldx     [%sp+LOCALS64+$M+24],$a3
2183
2184         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(tmp0, S);
2185         add     %sp,LOCALS64+$tmp0,$rp
2186
2187         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(res_x, M);
2188         add     %sp,LOCALS64+$res_x,$rp
2189
2190         add     %sp,LOCALS64+$tmp0,$bp
2191         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_x, res_x, tmp0);
2192         add     %sp,LOCALS64+$res_x,$rp
2193
2194         ldx     [%sp+LOCALS64+$M],$a0           ! forward load
2195         ldx     [%sp+LOCALS64+$M+8],$a1
2196         ldx     [%sp+LOCALS64+$M+16],$a2
2197         ldx     [%sp+LOCALS64+$M+24],$a3
2198
2199         add     %sp,LOCALS64+$S,$bp
2200         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(S, S, res_x);
2201         add     %sp,LOCALS64+$S,$rp
2202
2203         mov     $acc0,$bi
2204         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S, S, M);
2205         add     %sp,LOCALS64+$S,$rp
2206
2207         ldx     [%sp+LOCALS64+$res_x],$a0       ! forward load
2208         ldx     [%sp+LOCALS64+$res_x+8],$a1
2209         ldx     [%sp+LOCALS64+$res_x+16],$a2
2210         ldx     [%sp+LOCALS64+$res_x+24],$a3
2211
2212         add     %sp,LOCALS64+$res_y,$bp
2213         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, S, res_y);
2214         add     %sp,LOCALS64+$res_y,$rp
2215
2216         ! convert output to uint32_t[8]
2217         srlx    $a0,32,$t0
2218         srlx    $a1,32,$t1
2219         st      $a0,[$rp_real]                  ! res_x
2220         srlx    $a2,32,$t2
2221         st      $t0,[$rp_real+4]
2222         srlx    $a3,32,$t3
2223         st      $a1,[$rp_real+8]
2224         st      $t1,[$rp_real+12]
2225         st      $a2,[$rp_real+16]
2226         st      $t2,[$rp_real+20]
2227         st      $a3,[$rp_real+24]
2228         st      $t3,[$rp_real+28]
2229
2230         ldx     [%sp+LOCALS64+$res_z],$a0       ! forward load
2231         srlx    $acc0,32,$t0
2232         ldx     [%sp+LOCALS64+$res_z+8],$a1
2233         srlx    $acc1,32,$t1
2234         ldx     [%sp+LOCALS64+$res_z+16],$a2
2235         srlx    $acc2,32,$t2
2236         ldx     [%sp+LOCALS64+$res_z+24],$a3
2237         srlx    $acc3,32,$t3
2238         st      $acc0,[$rp_real+32]             ! res_y
2239         st      $t0,  [$rp_real+32+4]
2240         st      $acc1,[$rp_real+32+8]
2241         st      $t1,  [$rp_real+32+12]
2242         st      $acc2,[$rp_real+32+16]
2243         st      $t2,  [$rp_real+32+20]
2244         st      $acc3,[$rp_real+32+24]
2245         st      $t3,  [$rp_real+32+28]
2246
2247         srlx    $a0,32,$t0
2248         srlx    $a1,32,$t1
2249         st      $a0,[$rp_real+64]               ! res_z
2250         srlx    $a2,32,$t2
2251         st      $t0,[$rp_real+64+4]
2252         srlx    $a3,32,$t3
2253         st      $a1,[$rp_real+64+8]
2254         st      $t1,[$rp_real+64+12]
2255         st      $a2,[$rp_real+64+16]
2256         st      $t2,[$rp_real+64+20]
2257         st      $a3,[$rp_real+64+24]
2258         st      $t3,[$rp_real+64+28]
2259
2260         ret
2261         restore
2262 .size   ecp_nistz256_point_double_vis3,.-ecp_nistz256_point_double_vis3
2263 ___
2264 }
2265 ########################################################################
2266 # void ecp_nistz256_point_add(P256_POINT *out,const P256_POINT *in1,
2267 #                             const P256_POINT *in2);
2268 {
2269 my ($res_x,$res_y,$res_z,
2270     $in1_x,$in1_y,$in1_z,
2271     $in2_x,$in2_y,$in2_z,
2272     $H,$Hsqr,$R,$Rsqr,$Hcub,
2273     $U1,$U2,$S1,$S2)=map(32*$_,(0..17));
2274 my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
2275
2276 # above map() describes stack layout with 18 temporary
2277 # 256-bit vectors on top. Then we reserve some space for
2278 # !in1infty, !in2infty and result of check for zero.
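# Those extra 32 bytes hold !in2infty at %fp+STACK_BIAS-8, !in1infty at
# %fp+STACK_BIAS-16 and the is-zero check at %fp+STACK_BIAS-24.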
2279
2280 $code.=<<___;
2281 .globl  ecp_nistz256_point_add_vis3
2282 .align  32
2283 ecp_nistz256_point_add_vis3:
2284         save    %sp,-STACK64_FRAME-32*18-32,%sp
2285
2286         mov     $rp,$rp_real
2287         mov     -1,$minus1
2288         mov     -2,$poly3
2289         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2290         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2291
2292         ! convert input to uint64_t[4]
2293         ld      [$bp],$a0                       ! in2_x
2294         ld      [$bp+4],$t0
2295         ld      [$bp+8],$a1
2296         ld      [$bp+12],$t1
2297         ld      [$bp+16],$a2
2298         ld      [$bp+20],$t2
2299         ld      [$bp+24],$a3
2300         ld      [$bp+28],$t3
2301         sllx    $t0,32,$t0
2302         sllx    $t1,32,$t1
2303         ld      [$bp+32],$acc0                  ! in2_y
2304         or      $a0,$t0,$a0
2305         ld      [$bp+32+4],$t0
2306         sllx    $t2,32,$t2
2307         ld      [$bp+32+8],$acc1
2308         or      $a1,$t1,$a1
2309         ld      [$bp+32+12],$t1
2310         sllx    $t3,32,$t3
2311         ld      [$bp+32+16],$acc2
2312         or      $a2,$t2,$a2
2313         ld      [$bp+32+20],$t2
2314         or      $a3,$t3,$a3
2315         ld      [$bp+32+24],$acc3
2316         sllx    $t0,32,$t0
2317         ld      [$bp+32+28],$t3
2318         sllx    $t1,32,$t1
2319         stx     $a0,[%sp+LOCALS64+$in2_x]
2320         sllx    $t2,32,$t2
2321         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2322         sllx    $t3,32,$t3
2323         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2324         or      $acc0,$t0,$acc0
2325         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2326         or      $acc1,$t1,$acc1
2327         stx     $acc0,[%sp+LOCALS64+$in2_y]
2328         or      $acc2,$t2,$acc2
2329         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2330         or      $acc3,$t3,$acc3
2331         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2332         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2333
2334         or      $a1,$a0,$a0
2335         or      $a3,$a2,$a2
2336         or      $acc1,$acc0,$acc0
2337         or      $acc3,$acc2,$acc2
2338         or      $a2,$a0,$a0
2339         or      $acc2,$acc0,$acc0
2340         or      $acc0,$a0,$a0
2341         movrnz  $a0,-1,$a0                      ! !in2infty
2342         stx     $a0,[%fp+STACK_BIAS-8]
2343
2344         ld      [$bp+64],$acc0                  ! in2_z
2345         ld      [$bp+64+4],$t0
2346         ld      [$bp+64+8],$acc1
2347         ld      [$bp+64+12],$t1
2348         ld      [$bp+64+16],$acc2
2349         ld      [$bp+64+20],$t2
2350         ld      [$bp+64+24],$acc3
2351         ld      [$bp+64+28],$t3
2352         sllx    $t0,32,$t0
2353         sllx    $t1,32,$t1
2354         ld      [$ap],$a0                       ! in1_x
2355         or      $acc0,$t0,$acc0
2356         ld      [$ap+4],$t0
2357         sllx    $t2,32,$t2
2358         ld      [$ap+8],$a1
2359         or      $acc1,$t1,$acc1
2360         ld      [$ap+12],$t1
2361         sllx    $t3,32,$t3
2362         ld      [$ap+16],$a2
2363         or      $acc2,$t2,$acc2
2364         ld      [$ap+20],$t2
2365         or      $acc3,$t3,$acc3
2366         ld      [$ap+24],$a3
2367         sllx    $t0,32,$t0
2368         ld      [$ap+28],$t3
2369         sllx    $t1,32,$t1
2370         stx     $acc0,[%sp+LOCALS64+$in2_z]
2371         sllx    $t2,32,$t2
2372         stx     $acc1,[%sp+LOCALS64+$in2_z+8]
2373         sllx    $t3,32,$t3
2374         stx     $acc2,[%sp+LOCALS64+$in2_z+16]
2375         stx     $acc3,[%sp+LOCALS64+$in2_z+24]
2376
2377         or      $a0,$t0,$a0
2378         ld      [$ap+32],$acc0                  ! in1_y
2379         or      $a1,$t1,$a1
2380         ld      [$ap+32+4],$t0
2381         or      $a2,$t2,$a2
2382         ld      [$ap+32+8],$acc1
2383         or      $a3,$t3,$a3
2384         ld      [$ap+32+12],$t1
2385         ld      [$ap+32+16],$acc2
2386         ld      [$ap+32+20],$t2
2387         ld      [$ap+32+24],$acc3
2388         sllx    $t0,32,$t0
2389         ld      [$ap+32+28],$t3
2390         sllx    $t1,32,$t1
2391         stx     $a0,[%sp+LOCALS64+$in1_x]
2392         sllx    $t2,32,$t2
2393         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2394         sllx    $t3,32,$t3
2395         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2396         or      $acc0,$t0,$acc0
2397         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2398         or      $acc1,$t1,$acc1
2399         stx     $acc0,[%sp+LOCALS64+$in1_y]
2400         or      $acc2,$t2,$acc2
2401         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2402         or      $acc3,$t3,$acc3
2403         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2404         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2405
2406         or      $a1,$a0,$a0
2407         or      $a3,$a2,$a2
2408         or      $acc1,$acc0,$acc0
2409         or      $acc3,$acc2,$acc2
2410         or      $a2,$a0,$a0
2411         or      $acc2,$acc0,$acc0
2412         or      $acc0,$a0,$a0
2413         movrnz  $a0,-1,$a0                      ! !in1infty
2414         stx     $a0,[%fp+STACK_BIAS-16]
2415
2416         ldx     [%sp+LOCALS64+$in2_z],$a0       ! forward load
2417         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2418         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2419         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2420
2421         ld      [$ap+64],$acc0                  ! in1_z
2422         ld      [$ap+64+4],$t0
2423         ld      [$ap+64+8],$acc1
2424         ld      [$ap+64+12],$t1
2425         ld      [$ap+64+16],$acc2
2426         ld      [$ap+64+20],$t2
2427         ld      [$ap+64+24],$acc3
2428         ld      [$ap+64+28],$t3
2429         sllx    $t0,32,$t0
2430         sllx    $t1,32,$t1
2431         or      $acc0,$t0,$acc0
2432         sllx    $t2,32,$t2
2433         or      $acc1,$t1,$acc1
2434         sllx    $t3,32,$t3
2435         stx     $acc0,[%sp+LOCALS64+$in1_z]
2436         or      $acc2,$t2,$acc2
2437         stx     $acc1,[%sp+LOCALS64+$in1_z+8]
2438         or      $acc3,$t3,$acc3
2439         stx     $acc2,[%sp+LOCALS64+$in1_z+16]
2440         stx     $acc3,[%sp+LOCALS64+$in1_z+24]
2441
2442         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z2sqr, in2_z);
2443         add     %sp,LOCALS64+$Z2sqr,$rp
2444
2445         ldx     [%sp+LOCALS64+$in1_z],$a0
2446         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2447         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2448         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2449         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2450         add     %sp,LOCALS64+$Z1sqr,$rp
2451
2452         ldx     [%sp+LOCALS64+$Z2sqr],$bi
2453         ldx     [%sp+LOCALS64+$in2_z],$a0
2454         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2455         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2456         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2457         add     %sp,LOCALS64+$Z2sqr,$bp
2458         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, Z2sqr, in2_z);
2459         add     %sp,LOCALS64+$S1,$rp
2460
2461         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2462         ldx     [%sp+LOCALS64+$in1_z],$a0
2463         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2464         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2465         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2466         add     %sp,LOCALS64+$Z1sqr,$bp
2467         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2468         add     %sp,LOCALS64+$S2,$rp
2469
2470         ldx     [%sp+LOCALS64+$S1],$bi
2471         ldx     [%sp+LOCALS64+$in1_y],$a0
2472         ldx     [%sp+LOCALS64+$in1_y+8],$a1
2473         ldx     [%sp+LOCALS64+$in1_y+16],$a2
2474         ldx     [%sp+LOCALS64+$in1_y+24],$a3
2475         add     %sp,LOCALS64+$S1,$bp
2476         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S1, S1, in1_y);
2477         add     %sp,LOCALS64+$S1,$rp
2478
2479         ldx     [%sp+LOCALS64+$S2],$bi
2480         ldx     [%sp+LOCALS64+$in2_y],$a0
2481         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2482         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2483         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2484         add     %sp,LOCALS64+$S2,$bp
2485         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2486         add     %sp,LOCALS64+$S2,$rp
2487
2488         ldx     [%sp+LOCALS64+$Z2sqr],$bi       ! forward load
2489         ldx     [%sp+LOCALS64+$in1_x],$a0
2490         ldx     [%sp+LOCALS64+$in1_x+8],$a1
2491         ldx     [%sp+LOCALS64+$in1_x+16],$a2
2492         ldx     [%sp+LOCALS64+$in1_x+24],$a3
2493
2494         add     %sp,LOCALS64+$S1,$bp
2495         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, S1);
2496         add     %sp,LOCALS64+$R,$rp
2497
2498         or      $acc1,$acc0,$acc0               ! see if result is zero
2499         or      $acc3,$acc2,$acc2
2500         or      $acc2,$acc0,$acc0
2501         stx     $acc0,[%fp+STACK_BIAS-24]
2502
2503         add     %sp,LOCALS64+$Z2sqr,$bp
2504         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U1, in1_x, Z2sqr);
2505         add     %sp,LOCALS64+$U1,$rp
2506
2507         ldx     [%sp+LOCALS64+$Z1sqr],$bi
2508         ldx     [%sp+LOCALS64+$in2_x],$a0
2509         ldx     [%sp+LOCALS64+$in2_x+8],$a1
2510         ldx     [%sp+LOCALS64+$in2_x+16],$a2
2511         ldx     [%sp+LOCALS64+$in2_x+24],$a3
2512         add     %sp,LOCALS64+$Z1sqr,$bp
2513         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in2_x, Z1sqr);
2514         add     %sp,LOCALS64+$U2,$rp
2515
2516         ldx     [%sp+LOCALS64+$R],$a0           ! forward load
2517         ldx     [%sp+LOCALS64+$R+8],$a1
2518         ldx     [%sp+LOCALS64+$R+16],$a2
2519         ldx     [%sp+LOCALS64+$R+24],$a3
2520
2521         add     %sp,LOCALS64+$U1,$bp
2522         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, U1);
2523         add     %sp,LOCALS64+$H,$rp
2524
2525         or      $acc1,$acc0,$acc0               ! see if result is zero
2526         or      $acc3,$acc2,$acc2
2527         orcc    $acc2,$acc0,$acc0
2528
2529         bne,pt  %xcc,.Ladd_proceed_vis3         ! is_equal(U1,U2)?
2530         nop
2531
2532         ldx     [%fp+STACK_BIAS-8],$t0
2533         ldx     [%fp+STACK_BIAS-16],$t1
2534         ldx     [%fp+STACK_BIAS-24],$t2
2535         andcc   $t0,$t1,%g0
2536         be,pt   %xcc,.Ladd_proceed_vis3         ! (in1infty || in2infty)?
2537         nop
2538         andcc   $t2,$t2,%g0
2539         be,pt   %xcc,.Ladd_proceed_vis3         ! is_equal(S1,S2)?
2540         nop
2541
2542         st      %g0,[$rp_real]
2543         st      %g0,[$rp_real+4]
2544         st      %g0,[$rp_real+8]
2545         st      %g0,[$rp_real+12]
2546         st      %g0,[$rp_real+16]
2547         st      %g0,[$rp_real+20]
2548         st      %g0,[$rp_real+24]
2549         st      %g0,[$rp_real+28]
2550         st      %g0,[$rp_real+32]
2551         st      %g0,[$rp_real+32+4]
2552         st      %g0,[$rp_real+32+8]
2553         st      %g0,[$rp_real+32+12]
2554         st      %g0,[$rp_real+32+16]
2555         st      %g0,[$rp_real+32+20]
2556         st      %g0,[$rp_real+32+24]
2557         st      %g0,[$rp_real+32+28]
2558         st      %g0,[$rp_real+64]
2559         st      %g0,[$rp_real+64+4]
2560         st      %g0,[$rp_real+64+8]
2561         st      %g0,[$rp_real+64+12]
2562         st      %g0,[$rp_real+64+16]
2563         st      %g0,[$rp_real+64+20]
2564         st      %g0,[$rp_real+64+24]
2565         st      %g0,[$rp_real+64+28]
2566         b       .Ladd_done_vis3
2567         nop
2568
2569 .align  16
2570 .Ladd_proceed_vis3:
2571         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
2572         add     %sp,LOCALS64+$Rsqr,$rp
2573
2574         ldx     [%sp+LOCALS64+$H],$bi
2575         ldx     [%sp+LOCALS64+$in1_z],$a0
2576         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2577         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2578         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2579         add     %sp,LOCALS64+$H,$bp
2580         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2581         add     %sp,LOCALS64+$res_z,$rp
2582
2583         ldx     [%sp+LOCALS64+$H],$a0
2584         ldx     [%sp+LOCALS64+$H+8],$a1
2585         ldx     [%sp+LOCALS64+$H+16],$a2
2586         ldx     [%sp+LOCALS64+$H+24],$a3
2587         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
2588         add     %sp,LOCALS64+$Hsqr,$rp
2589
2590         ldx     [%sp+LOCALS64+$res_z],$bi
2591         ldx     [%sp+LOCALS64+$in2_z],$a0
2592         ldx     [%sp+LOCALS64+$in2_z+8],$a1
2593         ldx     [%sp+LOCALS64+$in2_z+16],$a2
2594         ldx     [%sp+LOCALS64+$in2_z+24],$a3
2595         add     %sp,LOCALS64+$res_z,$bp
2596         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, res_z, in2_z);
2597         add     %sp,LOCALS64+$res_z,$rp
2598
2599         ldx     [%sp+LOCALS64+$H],$bi
2600         ldx     [%sp+LOCALS64+$Hsqr],$a0
2601         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2602         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2603         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2604         add     %sp,LOCALS64+$H,$bp
2605         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
2606         add     %sp,LOCALS64+$Hcub,$rp
2607
2608         ldx     [%sp+LOCALS64+$U1],$bi
2609         ldx     [%sp+LOCALS64+$Hsqr],$a0
2610         ldx     [%sp+LOCALS64+$Hsqr+8],$a1
2611         ldx     [%sp+LOCALS64+$Hsqr+16],$a2
2612         ldx     [%sp+LOCALS64+$Hsqr+24],$a3
2613         add     %sp,LOCALS64+$U1,$bp
2614         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, U1, Hsqr);
2615         add     %sp,LOCALS64+$U2,$rp
2616
2617         call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
2618         add     %sp,LOCALS64+$Hsqr,$rp
2619
2620         add     %sp,LOCALS64+$Rsqr,$bp
2621         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
2622         add     %sp,LOCALS64+$res_x,$rp
2623
2624         add     %sp,LOCALS64+$Hcub,$bp
2625         call    __ecp_nistz256_sub_from_vis3    !  p256_sub(res_x, res_x, Hcub);
2626         add     %sp,LOCALS64+$res_x,$rp
2627
2628         ldx     [%sp+LOCALS64+$S1],$bi          ! forward load
2629         ldx     [%sp+LOCALS64+$Hcub],$a0
2630         ldx     [%sp+LOCALS64+$Hcub+8],$a1
2631         ldx     [%sp+LOCALS64+$Hcub+16],$a2
2632         ldx     [%sp+LOCALS64+$Hcub+24],$a3
2633
2634         add     %sp,LOCALS64+$U2,$bp
2635         call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
2636         add     %sp,LOCALS64+$res_y,$rp
2637
2638         add     %sp,LOCALS64+$S1,$bp
2639         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S1, Hcub);
2640         add     %sp,LOCALS64+$S2,$rp
2641
2642         ldx     [%sp+LOCALS64+$R],$bi
2643         ldx     [%sp+LOCALS64+$res_y],$a0
2644         ldx     [%sp+LOCALS64+$res_y+8],$a1
2645         ldx     [%sp+LOCALS64+$res_y+16],$a2
2646         ldx     [%sp+LOCALS64+$res_y+24],$a3
2647         add     %sp,LOCALS64+$R,$bp
2648         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
2649         add     %sp,LOCALS64+$res_y,$rp
2650
2651         add     %sp,LOCALS64+$S2,$bp
2652         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
2653         add     %sp,LOCALS64+$res_y,$rp
2654
2655         ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
2656         ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
2657 ___
2658 for($i=0;$i<96;$i+=16) {                        # conditional moves
2659 $code.=<<___;
2660         ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
2661         ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
2662         ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
2663         ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
2664         ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
2665         ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
2666         movrz   $t1,$acc2,$acc0
2667         movrz   $t1,$acc3,$acc1
2668         movrz   $t2,$acc4,$acc0
2669         movrz   $t2,$acc5,$acc1
2670         srlx    $acc0,32,$acc2
2671         srlx    $acc1,32,$acc3
2672         st      $acc0,[$rp_real+$i]
2673         st      $acc2,[$rp_real+$i+4]
2674         st      $acc1,[$rp_real+$i+8]
2675         st      $acc3,[$rp_real+$i+12]
2676 ___
2677 }
2678 $code.=<<___;
2679 .Ladd_done_vis3:
2680         ret
2681         restore
2682 .size   ecp_nistz256_point_add_vis3,.-ecp_nistz256_point_add_vis3
2683 ___
2684 }
2685 ########################################################################
2686 # void ecp_nistz256_point_add_affine(P256_POINT *out,const P256_POINT *in1,
2687 #                                    const P256_POINT_AFFINE *in2);
2688 {
2689 my ($res_x,$res_y,$res_z,
2690     $in1_x,$in1_y,$in1_z,
2691     $in2_x,$in2_y,
2692     $U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr)=map(32*$_,(0..14));
2693 my $Z1sqr = $S2;
2694 # above map() describes stack layout with 15 temporary
2695 # 256-bit vectors on top. Then we reserve some space for
2696 # !in1infty and !in2infty.
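# As above, !in2infty is kept at %fp+STACK_BIAS-8 and !in1infty at
# %fp+STACK_BIAS-16.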
2697
2698 $code.=<<___;
2699 .align  32
2700 ecp_nistz256_point_add_affine_vis3:
2701         save    %sp,-STACK64_FRAME-32*15-32,%sp
2702
2703         mov     $rp,$rp_real
2704         mov     -1,$minus1
2705         mov     -2,$poly3
2706         sllx    $minus1,32,$poly1               ! 0xFFFFFFFF00000000
2707         srl     $poly3,0,$poly3                 ! 0x00000000FFFFFFFE
2708
2709         ! convert input to uint64_t[4]
2710         ld      [$bp],$a0                       ! in2_x
2711         ld      [$bp+4],$t0
2712         ld      [$bp+8],$a1
2713         ld      [$bp+12],$t1
2714         ld      [$bp+16],$a2
2715         ld      [$bp+20],$t2
2716         ld      [$bp+24],$a3
2717         ld      [$bp+28],$t3
2718         sllx    $t0,32,$t0
2719         sllx    $t1,32,$t1
2720         ld      [$bp+32],$acc0                  ! in2_y
2721         or      $a0,$t0,$a0
2722         ld      [$bp+32+4],$t0
2723         sllx    $t2,32,$t2
2724         ld      [$bp+32+8],$acc1
2725         or      $a1,$t1,$a1
2726         ld      [$bp+32+12],$t1
2727         sllx    $t3,32,$t3
2728         ld      [$bp+32+16],$acc2
2729         or      $a2,$t2,$a2
2730         ld      [$bp+32+20],$t2
2731         or      $a3,$t3,$a3
2732         ld      [$bp+32+24],$acc3
2733         sllx    $t0,32,$t0
2734         ld      [$bp+32+28],$t3
2735         sllx    $t1,32,$t1
2736         stx     $a0,[%sp+LOCALS64+$in2_x]
2737         sllx    $t2,32,$t2
2738         stx     $a1,[%sp+LOCALS64+$in2_x+8]
2739         sllx    $t3,32,$t3
2740         stx     $a2,[%sp+LOCALS64+$in2_x+16]
2741         or      $acc0,$t0,$acc0
2742         stx     $a3,[%sp+LOCALS64+$in2_x+24]
2743         or      $acc1,$t1,$acc1
2744         stx     $acc0,[%sp+LOCALS64+$in2_y]
2745         or      $acc2,$t2,$acc2
2746         stx     $acc1,[%sp+LOCALS64+$in2_y+8]
2747         or      $acc3,$t3,$acc3
2748         stx     $acc2,[%sp+LOCALS64+$in2_y+16]
2749         stx     $acc3,[%sp+LOCALS64+$in2_y+24]
2750
2751         or      $a1,$a0,$a0
2752         or      $a3,$a2,$a2
2753         or      $acc1,$acc0,$acc0
2754         or      $acc3,$acc2,$acc2
2755         or      $a2,$a0,$a0
2756         or      $acc2,$acc0,$acc0
2757         or      $acc0,$a0,$a0
2758         movrnz  $a0,-1,$a0                      ! !in2infty
2759         stx     $a0,[%fp+STACK_BIAS-8]
2760
2761         ld      [$ap],$a0                       ! in1_x
2762         ld      [$ap+4],$t0
2763         ld      [$ap+8],$a1
2764         ld      [$ap+12],$t1
2765         ld      [$ap+16],$a2
2766         ld      [$ap+20],$t2
2767         ld      [$ap+24],$a3
2768         ld      [$ap+28],$t3
2769         sllx    $t0,32,$t0
2770         sllx    $t1,32,$t1
2771         ld      [$ap+32],$acc0                  ! in1_y
2772         or      $a0,$t0,$a0
2773         ld      [$ap+32+4],$t0
2774         sllx    $t2,32,$t2
2775         ld      [$ap+32+8],$acc1
2776         or      $a1,$t1,$a1
2777         ld      [$ap+32+12],$t1
2778         sllx    $t3,32,$t3
2779         ld      [$ap+32+16],$acc2
2780         or      $a2,$t2,$a2
2781         ld      [$ap+32+20],$t2
2782         or      $a3,$t3,$a3
2783         ld      [$ap+32+24],$acc3
2784         sllx    $t0,32,$t0
2785         ld      [$ap+32+28],$t3
2786         sllx    $t1,32,$t1
2787         stx     $a0,[%sp+LOCALS64+$in1_x]
2788         sllx    $t2,32,$t2
2789         stx     $a1,[%sp+LOCALS64+$in1_x+8]
2790         sllx    $t3,32,$t3
2791         stx     $a2,[%sp+LOCALS64+$in1_x+16]
2792         or      $acc0,$t0,$acc0
2793         stx     $a3,[%sp+LOCALS64+$in1_x+24]
2794         or      $acc1,$t1,$acc1
2795         stx     $acc0,[%sp+LOCALS64+$in1_y]
2796         or      $acc2,$t2,$acc2
2797         stx     $acc1,[%sp+LOCALS64+$in1_y+8]
2798         or      $acc3,$t3,$acc3
2799         stx     $acc2,[%sp+LOCALS64+$in1_y+16]
2800         stx     $acc3,[%sp+LOCALS64+$in1_y+24]
2801
2802         or      $a1,$a0,$a0
2803         or      $a3,$a2,$a2
2804         or      $acc1,$acc0,$acc0
2805         or      $acc3,$acc2,$acc2
2806         or      $a2,$a0,$a0
2807         or      $acc2,$acc0,$acc0
2808         or      $acc0,$a0,$a0
2809         movrnz  $a0,-1,$a0                      ! !in1infty
2810         stx     $a0,[%fp+STACK_BIAS-16]
2811
2812         ld      [$ap+64],$a0                    ! in1_z
2813         ld      [$ap+64+4],$t0
2814         ld      [$ap+64+8],$a1
2815         ld      [$ap+64+12],$t1
2816         ld      [$ap+64+16],$a2
2817         ld      [$ap+64+20],$t2
2818         ld      [$ap+64+24],$a3
2819         ld      [$ap+64+28],$t3
2820         sllx    $t0,32,$t0
2821         sllx    $t1,32,$t1
2822         or      $a0,$t0,$a0
2823         sllx    $t2,32,$t2
2824         or      $a1,$t1,$a1
2825         sllx    $t3,32,$t3
2826         stx     $a0,[%sp+LOCALS64+$in1_z]
2827         or      $a2,$t2,$a2
2828         stx     $a1,[%sp+LOCALS64+$in1_z+8]
2829         or      $a3,$t3,$a3
2830         stx     $a2,[%sp+LOCALS64+$in1_z+16]
2831         stx     $a3,[%sp+LOCALS64+$in1_z+24]
2832
2833         call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Z1sqr, in1_z);
2834         add     %sp,LOCALS64+$Z1sqr,$rp
2835
2836         ldx     [%sp+LOCALS64+$in2_x],$bi
2837         mov     $acc0,$a0
2838         mov     $acc1,$a1
2839         mov     $acc2,$a2
2840         mov     $acc3,$a3
2841         add     %sp,LOCALS64+$in2_x,$bp
2842         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, Z1sqr, in2_x);
2843         add     %sp,LOCALS64+$U2,$rp
2844
2845         ldx     [%sp+LOCALS64+$Z1sqr],$bi       ! forward load
2846         ldx     [%sp+LOCALS64+$in1_z],$a0
2847         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2848         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2849         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2850
2851         add     %sp,LOCALS64+$in1_x,$bp
2852         call    __ecp_nistz256_sub_from_vis3    ! p256_sub(H, U2, in1_x);
2853         add     %sp,LOCALS64+$H,$rp
2854
2855         add     %sp,LOCALS64+$Z1sqr,$bp
2856         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, Z1sqr, in1_z);
2857         add     %sp,LOCALS64+$S2,$rp
2858
2859         ldx     [%sp+LOCALS64+$H],$bi
2860         ldx     [%sp+LOCALS64+$in1_z],$a0
2861         ldx     [%sp+LOCALS64+$in1_z+8],$a1
2862         ldx     [%sp+LOCALS64+$in1_z+16],$a2
2863         ldx     [%sp+LOCALS64+$in1_z+24],$a3
2864         add     %sp,LOCALS64+$H,$bp
2865         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_z, H, in1_z);
2866         add     %sp,LOCALS64+$res_z,$rp
2867
2868         ldx     [%sp+LOCALS64+$S2],$bi
2869         ldx     [%sp+LOCALS64+$in2_y],$a0
2870         ldx     [%sp+LOCALS64+$in2_y+8],$a1
2871         ldx     [%sp+LOCALS64+$in2_y+16],$a2
2872         ldx     [%sp+LOCALS64+$in2_y+24],$a3
2873         add     %sp,LOCALS64+$S2,$bp
2874         call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, S2, in2_y);
2875         add     %sp,LOCALS64+$S2,$rp
2876
2877         ldx     [%sp+LOCALS64+$H],$a0           ! forward load
2878         ldx     [%sp+LOCALS64+$H+8],$a1
2879         ldx     [%sp+LOCALS64+$H+16],$a2
2880         ldx     [%sp+LOCALS64+$H+24],$a3
2881
        add     %sp,LOCALS64+$in1_y,$bp
        call    __ecp_nistz256_sub_from_vis3    ! p256_sub(R, S2, in1_y);
        add     %sp,LOCALS64+$R,$rp

        call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Hsqr, H);
        add     %sp,LOCALS64+$Hsqr,$rp

        ldx     [%sp+LOCALS64+$R],$a0
        ldx     [%sp+LOCALS64+$R+8],$a1
        ldx     [%sp+LOCALS64+$R+16],$a2
        ldx     [%sp+LOCALS64+$R+24],$a3
        call    __ecp_nistz256_sqr_mont_vis3    ! p256_sqr_mont(Rsqr, R);
        add     %sp,LOCALS64+$Rsqr,$rp

        ldx     [%sp+LOCALS64+$H],$bi
        ldx     [%sp+LOCALS64+$Hsqr],$a0
        ldx     [%sp+LOCALS64+$Hsqr+8],$a1
        ldx     [%sp+LOCALS64+$Hsqr+16],$a2
        ldx     [%sp+LOCALS64+$Hsqr+24],$a3
        add     %sp,LOCALS64+$H,$bp
        call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(Hcub, Hsqr, H);
        add     %sp,LOCALS64+$Hcub,$rp

        ldx     [%sp+LOCALS64+$Hsqr],$bi
        ldx     [%sp+LOCALS64+$in1_x],$a0
        ldx     [%sp+LOCALS64+$in1_x+8],$a1
        ldx     [%sp+LOCALS64+$in1_x+16],$a2
        ldx     [%sp+LOCALS64+$in1_x+24],$a3
        add     %sp,LOCALS64+$Hsqr,$bp
        call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(U2, in1_x, Hsqr);
        add     %sp,LOCALS64+$U2,$rp

        call    __ecp_nistz256_mul_by_2_vis3    ! p256_mul_by_2(Hsqr, U2);
        add     %sp,LOCALS64+$Hsqr,$rp

        add     %sp,LOCALS64+$Rsqr,$bp
        call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_x, Rsqr, Hsqr);
        add     %sp,LOCALS64+$res_x,$rp

        add     %sp,LOCALS64+$Hcub,$bp
        call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_x, res_x, Hcub);
        add     %sp,LOCALS64+$res_x,$rp

        ldx     [%sp+LOCALS64+$Hcub],$bi        ! forward load
        ldx     [%sp+LOCALS64+$in1_y],$a0
        ldx     [%sp+LOCALS64+$in1_y+8],$a1
        ldx     [%sp+LOCALS64+$in1_y+16],$a2
        ldx     [%sp+LOCALS64+$in1_y+24],$a3

        add     %sp,LOCALS64+$U2,$bp
        call    __ecp_nistz256_sub_morf_vis3    ! p256_sub(res_y, U2, res_x);
        add     %sp,LOCALS64+$res_y,$rp

        add     %sp,LOCALS64+$Hcub,$bp
        call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(S2, in1_y, Hcub);
        add     %sp,LOCALS64+$S2,$rp

        ldx     [%sp+LOCALS64+$R],$bi
        ldx     [%sp+LOCALS64+$res_y],$a0
        ldx     [%sp+LOCALS64+$res_y+8],$a1
        ldx     [%sp+LOCALS64+$res_y+16],$a2
        ldx     [%sp+LOCALS64+$res_y+24],$a3
        add     %sp,LOCALS64+$R,$bp
        call    __ecp_nistz256_mul_mont_vis3    ! p256_mul_mont(res_y, res_y, R);
        add     %sp,LOCALS64+$res_y,$rp

        add     %sp,LOCALS64+$S2,$bp
        call    __ecp_nistz256_sub_from_vis3    ! p256_sub(res_y, res_y, S2);
        add     %sp,LOCALS64+$res_y,$rp

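        ! At this point, per the annotations above,
        !   res_x = R^2 - H^3 - 2*U2
        !   res_y = R*(U2 - res_x) - in1_y*H^3
        ! i.e. the standard Jacobian-plus-affine point addition formulas.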
        ldx     [%fp+STACK_BIAS-16],$t1         ! !in1infty
        ldx     [%fp+STACK_BIAS-8],$t2          ! !in2infty
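        ! Materialize the address of .Lone_mont_vis3 PC-relatively:
        ! "call .+8" deposits the address of the call itself in %o7,
        ! and the add in its delay slot turns that into a pointer to
        ! the constant, keeping the code position-independent.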
1:      call    .+8
        add     %o7,.Lone_mont_vis3-1b,$bp
___
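# The store loops below select the final result branchlessly with
# movrz (move if register is zero): if in1 was the point at infinity
# ($t1 == 0) the result is taken from in2, and if in2 was the point at
# infinity ($t2 == 0) it is taken from in1.  For the z coordinate (the
# second loop) the "in2" value is read from .Lone_mont_vis3, i.e. 1 in
# Montgomery representation, because the affine input has an implicit
# z == 1.  Each 64-bit limb is split with srlx and stored as two
# 32-bit words.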
for($i=0;$i<64;$i+=16) {                        # conditional moves
$code.=<<___;
        ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
        ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
        ldx     [%sp+LOCALS64+$in2_x+$i],$acc2  ! in2
        ldx     [%sp+LOCALS64+$in2_x+$i+8],$acc3
        ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
        ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
        movrz   $t1,$acc2,$acc0
        movrz   $t1,$acc3,$acc1
        movrz   $t2,$acc4,$acc0
        movrz   $t2,$acc5,$acc1
        srlx    $acc0,32,$acc2
        srlx    $acc1,32,$acc3
        st      $acc0,[$rp_real+$i]
        st      $acc2,[$rp_real+$i+4]
        st      $acc1,[$rp_real+$i+8]
        st      $acc3,[$rp_real+$i+12]
___
}
for(;$i<96;$i+=16) {
$code.=<<___;
        ldx     [%sp+LOCALS64+$res_x+$i],$acc0  ! res
        ldx     [%sp+LOCALS64+$res_x+$i+8],$acc1
        ldx     [$bp+$i-64],$acc2               ! "in2"
        ldx     [$bp+$i-64+8],$acc3
        ldx     [%sp+LOCALS64+$in1_x+$i],$acc4  ! in1
        ldx     [%sp+LOCALS64+$in1_x+$i+8],$acc5
        movrz   $t1,$acc2,$acc0
        movrz   $t1,$acc3,$acc1
        movrz   $t2,$acc4,$acc0
        movrz   $t2,$acc5,$acc1
        srlx    $acc0,32,$acc2
        srlx    $acc1,32,$acc3
        st      $acc0,[$rp_real+$i]
        st      $acc2,[$rp_real+$i+4]
        st      $acc1,[$rp_real+$i+8]
        st      $acc3,[$rp_real+$i+12]
___
}
$code.=<<___;
        ret
        restore
.size   ecp_nistz256_point_add_affine_vis3,.-ecp_nistz256_point_add_affine_vis3
.align  64
.Lone_mont_vis3:
.long   0x00000000,0x00000001, 0xffffffff,0x00000000
.long   0xffffffff,0xffffffff, 0x00000000,0xfffffffe
.align  64
___
}                                                               }}}

# The purpose of this subroutine is to explicitly encode VIS instructions,
# so that the module can be compiled without specifying VIS extensions on
# the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a. The idea is to
# keep the option of producing a "universal" binary and let the programmer
# detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (  "addxc"         => 0x011,
                "addxccc"       => 0x013,
                "umulxhi"       => 0x016        );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return  sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}
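
# For illustration only (this example is not emitted by the module):
# applying the encoding formula above to "addxc %o1,%o2,%o3" gives
#   rs1 = 8+1 = 9, rs2 = 8+2 = 10, rd = 8+3 = 11, opf = 0x011
#   0x81b00000 | 11<<25 | 9<<14 | 0x011<<5 | 10 = 0x97b2422a
# so the line would be rewritten to something like
#   ".word 0x97b2422a !addxc %o1,%o2,%o3".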
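# Post-process the accumulated $code: evaluate backtick-quoted expressions
# and rewrite the VIS3 mnemonics handled by unvis3() into raw .word
# directives before printing the final assembly.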
foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
         /ge;

        print $_,"\n";
}

close STDOUT;