1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements the Poly1305 hash for SPARCv9, in vanilla code
11 # as well as with the VIS3 and FMA extensions.
12 #
13 # May, August 2015
14 #
15 # Numbers are cycles per processed byte with poly1305_blocks alone.
16 #
17 #                       IALU(*)         FMA
18 #
19 # UltraSPARC III        11.9(**)
20 # SPARC T3              7.85
21 # SPARC T4              1.67(***)       6.55
22 # SPARC64 X             5.54            3.64
23 #
24 # (*)   Comparison to compiler-generated code is problematic, because
25 #       the latter's performance varies too much depending on too many
26 #       variables. For example, one can measure anywhere from 5x to 15x
27 #       improvement on T4 with gcc-4.6. The T4 comparison is somewhat
28 #       unfair, because the compiler doesn't use VIS3, but even under
29 #       identical initial conditions the coefficient varies from 3x to 9x.
30 # (**)  Pre-III performance should be even worse. Floating-point
31 #       performance for UltraSPARC I-IV, on the other hand, is reported
32 #       to be 4.25 cycles per byte for hand-coded assembly, but those
33 #       processors are too old to care about.
34 # (***) Multi-process benchmark saturates at ~12.5x the single-process
35 #       result on an 8-core processor, or ~21GBps per 2.85GHz socket.
36
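# For reference, a minimal (and slow) model of what the routines below
# compute, written with Math::BigInt. It is not used by the generator and
# the sub name is ad hoc: poly1305_init clamps the 16-byte "r" half of the
# key, poly1305_blocks folds 16-byte blocks (plus the 2^128 pad bit used
# for full blocks) into the accumulator modulo 2^130-5, and poly1305_emit
# adds the 16-byte "s" half (the "nonce") modulo 2^128.
sub poly1305_reference_model {
    use Math::BigInt;
    my ($r_key, $msg, $nonce) = @_;     # 16-byte r, 16*n-byte message, 16-byte s
    my $le2int = sub {                  # little-endian bytes -> integer
        my $be = scalar reverse shift;
        Math::BigInt->from_hex(unpack("H*", $be));
    };
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
    my $r = $le2int->($r_key)->band(
                Math::BigInt->from_hex("0ffffffc0ffffffc0ffffffc0fffffff"));
    my $h = Math::BigInt->bzero();
    for (my $i = 0; $i < length($msg); $i += 16) {
        my $m = $le2int->(substr($msg, $i, 16))
                       ->badd(Math::BigInt->new(2)->bpow(128));   # pad bit
        $h->badd($m)->bmul($r)->bmod($p);
    }
    $h->badd($le2int->($nonce))->bmod(Math::BigInt->new(2)->bpow(128));
    my $hex = substr($h->as_hex(), 2);                  # strip "0x"
    return scalar reverse pack("H*", "0" x (32 - length($hex)) . $hex);
}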
37 my $output = pop;
38 open STDOUT,">$output" or die "can't open $output: $!";
39
40 my ($ctx,$inp,$len,$padbit,$shl,$shr)   = map("%i$_",(0..5));
41 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)    = map("%l$_",(0..7));
42 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)       = map("%o$_",(0..5,7));
43 my ($d0,$d1,$d2,$d3)                    = map("%g$_",(1..4));
44
45 $code.=<<___;
46 #include "sparc_arch.h"
47
48 #ifdef  __arch64__
49 .register       %g2,#scratch
50 .register       %g3,#scratch
51 # define        STPTR   stx
52 # define        SIZE_T  8
53 #else
54 # define        STPTR   st
55 # define        SIZE_T  4
56 #endif
57 #define LOCALS  (STACK_BIAS+STACK_FRAME)
58
59 .section        ".text",#alloc,#execinstr
60
61 #ifdef __PIC__
62 SPARC_PIC_THUNK(%g1)
63 #endif
64
65 .globl  poly1305_init
66 .align  32
67 poly1305_init:
68         save    %sp,-STACK_FRAME-16,%sp
69         nop
70
71         SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
72         ld      [%g1],%g1
73
74         and     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU|SPARCV9_VIS3,%g1
75         cmp     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU
76         be      .Lpoly1305_init_fma
77         nop
78
79         stx     %g0,[$ctx+0]
80         stx     %g0,[$ctx+8]            ! zero hash value
81         brz,pn  $inp,.Lno_key
82         stx     %g0,[$ctx+16]
83
84         and     $inp,7,$shr             ! alignment factor
85         andn    $inp,7,$inp
86         sll     $shr,3,$shr             ! *8
87         neg     $shr,$shl
88
89         sethi   %hi(0x0ffffffc),$t0
90         set     8,$h1
91         or      $t0,%lo(0x0ffffffc),$t0
92         set     16,$h2
93         sllx    $t0,32,$t1
94         or      $t0,$t1,$t1             ! 0x0ffffffc0ffffffc
95         or      $t1,3,$t0               ! 0x0ffffffc0fffffff
96
97         ldxa    [$inp+%g0]0x88,$h0      ! load little-endian key
98         brz,pt  $shr,.Lkey_aligned
99         ldxa    [$inp+$h1]0x88,$h1
100
101         ldxa    [$inp+$h2]0x88,$h2
102         srlx    $h0,$shr,$h0
103         sllx    $h1,$shl,$t2
104         srlx    $h1,$shr,$h1
105         or      $t2,$h0,$h0
106         sllx    $h2,$shl,$h2
107         or      $h2,$h1,$h1
108
109 .Lkey_aligned:
110         and     $t0,$h0,$h0
111         and     $t1,$h1,$h1
112         stx     $h0,[$ctx+32+0]         ! store key
113         stx     $h1,[$ctx+32+8]
114
115         andcc   %g1,SPARCV9_VIS3,%g0
116         be      .Lno_key
117         nop
118
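        ! on VIS3-capable CPUs, return PC-relative addresses of the VIS3
        ! block and emit routines through the two-pointer table passed as
        ! the third argument (%i2)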
119 1:      call    .+8
120         add     %o7,poly1305_blocks_vis3-1b,%o7
121
122         add     %o7,poly1305_emit-poly1305_blocks_vis3,%o5
123         STPTR   %o7,[%i2]
124         STPTR   %o5,[%i2+SIZE_T]
125
126         ret
127         restore %g0,1,%o0               ! return 1
128
129 .Lno_key:
130         ret
131         restore %g0,%g0,%o0             ! return 0
132 .size   poly1305_init,.-poly1305_init
133
134 .globl  poly1305_blocks
135 .align  32
136 poly1305_blocks:
137         save    %sp,-STACK_FRAME,%sp
138         andn    $len,15,$len
139
140         brz,pn  $len,.Lno_data
141         nop
142
143         ld      [$ctx+32+0],$r1         ! load key
144         ld      [$ctx+32+4],$r0
145         ld      [$ctx+32+8],$r3
146         ld      [$ctx+32+12],$r2
147
148         ld      [$ctx+0],$h1            ! load hash value
149         ld      [$ctx+4],$h0
150         ld      [$ctx+8],$h3
151         ld      [$ctx+12],$h2
152         ld      [$ctx+16],$h4
153
154         and     $inp,7,$shr             ! alignment factor
155         andn    $inp,7,$inp
156         set     8,$d1
157         sll     $shr,3,$shr             ! *8
158         set     16,$d2
159         neg     $shr,$shl
160
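        ! s1..s3 = r1..r3 + (r1..r3>>2) = 5*(r1..r3/4): the clamped key
        ! limbs have their low two bits zero, so these pre-scaled values
        ! fold products overflowing 2^130 back via 2^130 = 5 (mod 2^130-5)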
161         srl     $r1,2,$s1
162         srl     $r2,2,$s2
163         add     $r1,$s1,$s1
164         srl     $r3,2,$s3
165         add     $r2,$s2,$s2
166         add     $r3,$s3,$s3
167
168 .Loop:
169         ldxa    [$inp+%g0]0x88,$d0      ! load little-endian input
170         brz,pt  $shr,.Linp_aligned
171         ldxa    [$inp+$d1]0x88,$d1
172
173         ldxa    [$inp+$d2]0x88,$d2
174         srlx    $d0,$shr,$d0
175         sllx    $d1,$shl,$t1
176         srlx    $d1,$shr,$d1
177         or      $t1,$d0,$d0
178         sllx    $d2,$shl,$d2
179         or      $d2,$d1,$d1
180
181 .Linp_aligned:
182         srlx    $d0,32,$t0
183         addcc   $d0,$h0,$h0             ! accumulate input
184         srlx    $d1,32,$t1
185         addccc  $t0,$h1,$h1
186         addccc  $d1,$h2,$h2
187         addccc  $t1,$h3,$h3
188         addc    $padbit,$h4,$h4
189
190         umul    $r0,$h0,$d0
191         umul    $r1,$h0,$d1
192         umul    $r2,$h0,$d2
193         umul    $r3,$h0,$d3
194          sub    $len,16,$len
195          add    $inp,16,$inp
196
197         umul    $s3,$h1,$t0
198         umul    $r0,$h1,$t1
199         umul    $r1,$h1,$t2
200         add     $t0,$d0,$d0
201         add     $t1,$d1,$d1
202         umul    $r2,$h1,$t0
203         add     $t2,$d2,$d2
204         add     $t0,$d3,$d3
205
206         umul    $s2,$h2,$t1
207         umul    $s3,$h2,$t2
208         umul    $r0,$h2,$t0
209         add     $t1,$d0,$d0
210         add     $t2,$d1,$d1
211         umul    $r1,$h2,$t1
212         add     $t0,$d2,$d2
213         add     $t1,$d3,$d3
214
215         umul    $s1,$h3,$t2
216         umul    $s2,$h3,$t0
217         umul    $s3,$h3,$t1
218         add     $t2,$d0,$d0
219         add     $t0,$d1,$d1
220         umul    $r0,$h3,$t2
221         add     $t1,$d2,$d2
222         add     $t2,$d3,$d3
223
224         umul    $s1,$h4,$t0
225         umul    $s2,$h4,$t1
226         umul    $s3,$h4,$t2
227         umul    $r0,$h4,$h4
228         add     $t0,$d1,$d1
229         add     $t1,$d2,$d2
230         srlx    $d0,32,$h1
231         add     $t2,$d3,$d3
232         srlx    $d1,32,$h2
233
234         addcc   $d1,$h1,$h1
235         srlx    $d2,32,$h3
236          set    8,$d1
237         addccc  $d2,$h2,$h2
238         srlx    $d3,32,$t0
239          set    16,$d2
240         addccc  $d3,$h3,$h3
241         addc    $t0,$h4,$h4
242
243         srl     $h4,2,$t0               ! final reduction step
244         andn    $h4,3,$t1
245         and     $h4,3,$h4
246         add     $t1,$t0,$t0
247
248         addcc   $t0,$d0,$h0
249         addccc  %g0,$h1,$h1
250         addccc  %g0,$h2,$h2
251         brnz,pt $len,.Loop
252         addc    %g0,$h3,$h3
253
254         st      $h1,[$ctx+0]            ! store hash value
255         st      $h0,[$ctx+4]
256         st      $h3,[$ctx+8]
257         st      $h2,[$ctx+12]
258         st      $h4,[$ctx+16]
259
260 .Lno_data:
261         ret
262         restore
263 .size   poly1305_blocks,.-poly1305_blocks
264 ___
265 ########################################################################
266 # VIS3 has umulxhi and addxc...
267 {
268 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
269 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
270
271 $code.=<<___;
272 .align  32
273 poly1305_blocks_vis3:
274         save    %sp,-STACK_FRAME,%sp
275         andn    $len,15,$len
276
277         brz,pn  $len,.Lno_data
278         nop
279
280         ldx     [$ctx+32+0],$R0         ! load key
281         ldx     [$ctx+32+8],$R1
282
283         ldx     [$ctx+0],$H0            ! load hash value
284         ldx     [$ctx+8],$H1
285         ld      [$ctx+16],$H2
286
287         and     $inp,7,$shr             ! alignment factor
288         andn    $inp,7,$inp
289         set     8,$r1
290         sll     $shr,3,$shr             ! *8
291         set     16,$r2
292         neg     $shr,$shl
293
294         srlx    $R1,2,$S1
295         add     $R1,$S1,$S1
296
297 .Loop_vis3:
298         ldxa    [$inp+%g0]0x88,$D0      ! load little-endian input
299         brz,pt  $shr,.Linp_aligned_vis3
300         ldxa    [$inp+$r1]0x88,$D1
301
302         ldxa    [$inp+$r2]0x88,$D2
303         srlx    $D0,$shr,$D0
304         sllx    $D1,$shl,$T1
305         srlx    $D1,$shr,$D1
306         or      $T1,$D0,$D0
307         sllx    $D2,$shl,$D2
308         or      $D2,$D1,$D1
309
310 .Linp_aligned_vis3:
311         addcc   $D0,$H0,$H0             ! accumulate input
312          sub    $len,16,$len
313         addxccc $D1,$H1,$H1
314          add    $inp,16,$inp
315
316         mulx    $R0,$H0,$D0             ! r0*h0
317         addxc   $padbit,$H2,$H2
318         umulxhi $R0,$H0,$D1
319         mulx    $S1,$H1,$T0             ! s1*h1
320         umulxhi $S1,$H1,$T1
321         addcc   $T0,$D0,$D0
322         mulx    $R1,$H0,$T0             ! r1*h0
323         addxc   $T1,$D1,$D1
324         umulxhi $R1,$H0,$D2
325         addcc   $T0,$D1,$D1
326         mulx    $R0,$H1,$T0             ! r0*h1
327         addxc   %g0,$D2,$D2
328         umulxhi $R0,$H1,$T1
329         addcc   $T0,$D1,$D1
330         mulx    $S1,$H2,$T0             ! s1*h2
331         addxc   $T1,$D2,$D2
332         mulx    $R0,$H2,$T1             ! r0*h2
333         addcc   $T0,$D1,$D1
334         addxc   $T1,$D2,$D2
335
336         srlx    $D2,2,$T0               ! final reduction step
337         andn    $D2,3,$T1
338         and     $D2,3,$H2
339         add     $T1,$T0,$T0
340
341         addcc   $T0,$D0,$H0
342         brnz,pt $len,.Loop_vis3
343         addxc   %g0,$D1,$H1
344
345         stx     $H0,[$ctx+0]            ! store hash value
346         stx     $H1,[$ctx+8]
347         st      $H2,[$ctx+16]
348
349         ret
350         restore
351 .size   poly1305_blocks_vis3,.-poly1305_blocks_vis3
352 ___
353 }
354 my ($mac,$nonce) = ($inp,$len);
355
356 $code.=<<___;
357 .globl  poly1305_emit
358 .align  32
359 poly1305_emit:
360         save    %sp,-STACK_FRAME,%sp
361
362         ld      [$ctx+0],$h1            ! load hash value
363         ld      [$ctx+4],$h0
364         ld      [$ctx+8],$h3
365         ld      [$ctx+12],$h2
366         ld      [$ctx+16],$h4
367
368         addcc   $h0,5,$r0               ! compare to modulus
369         addccc  $h1,0,$r1
370         addccc  $h2,0,$r2
371         addccc  $h3,0,$r3
372         addc    $h4,0,$h4
373         andcc   $h4,4,%g0               ! did it carry/borrow?
374
375         movnz   %icc,$r0,$h0
376         ld      [$nonce+0],$r0          ! load nonce
377         movnz   %icc,$r1,$h1
378         ld      [$nonce+4],$r1
379         movnz   %icc,$r2,$h2
380         ld      [$nonce+8],$r2
381         movnz   %icc,$r3,$h3
382         ld      [$nonce+12],$r3
383
384         addcc   $r0,$h0,$h0             ! accumulate nonce
385         addccc  $r1,$h1,$h1
386         addccc  $r2,$h2,$h2
387         addc    $r3,$h3,$h3
388
389         srl     $h0,8,$r0
390         stb     $h0,[$mac+0]            ! store little-endian result
391         srl     $h0,16,$r1
392         stb     $r0,[$mac+1]
393         srl     $h0,24,$r2
394         stb     $r1,[$mac+2]
395         stb     $r2,[$mac+3]
396
397         srl     $h1,8,$r0
398         stb     $h1,[$mac+4]
399         srl     $h1,16,$r1
400         stb     $r0,[$mac+5]
401         srl     $h1,24,$r2
402         stb     $r1,[$mac+6]
403         stb     $r2,[$mac+7]
404
405         srl     $h2,8,$r0
406         stb     $h2,[$mac+8]
407         srl     $h2,16,$r1
408         stb     $r0,[$mac+9]
409         srl     $h2,24,$r2
410         stb     $r1,[$mac+10]
411         stb     $r2,[$mac+11]
412
413         srl     $h3,8,$r0
414         stb     $h3,[$mac+12]
415         srl     $h3,16,$r1
416         stb     $r0,[$mac+13]
417         srl     $h3,24,$r2
418         stb     $r1,[$mac+14]
419         stb     $r2,[$mac+15]
420
421         ret
422         restore
423 .size   poly1305_emit,.-poly1305_emit
424 ___
425
426 {
427 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
428 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
429 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
430 my $i2=$step;
431
432 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
433     $two0,$two32,$two64,$two96,$two130,$five_two130,
434     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
435     $s2lo,$s2hi,$s3lo,$s3hi,
436     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
437 # borrowings
438 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
439 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
440 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
441
442 $code.=<<___;
443 .align  32
444 poly1305_init_fma:
445         save    %sp,-STACK_FRAME-16,%sp
446         nop
447
448 .Lpoly1305_init_fma:
449 1:      call    .+8
450         add     %o7,.Lconsts_fma-1b,%o7
451
452         ldd     [%o7+8*0],$two0                 ! load constants
453         ldd     [%o7+8*1],$two32
454         ldd     [%o7+8*2],$two64
455         ldd     [%o7+8*3],$two96
456         ldd     [%o7+8*5],$five_two130
457
458         std     $two0,[$ctx+8*0]                ! initial hash value, biased 0
459         std     $two32,[$ctx+8*1]
460         std     $two64,[$ctx+8*2]
461         std     $two96,[$ctx+8*3]
462
463         brz,pn  $inp,.Lno_key_fma
464         nop
465
466         stx     %fsr,[%sp+LOCALS]               ! save original %fsr
467         ldx     [%o7+8*6],%fsr                  ! load new %fsr
468
469         std     $two0,[$ctx+8*4]                ! key "template"
470         std     $two32,[$ctx+8*5]
471         std     $two64,[$ctx+8*6]
472         std     $two96,[$ctx+8*7]
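        ! "template" trick: the stores above set each double to 2^(52+32*i);
        ! the 32-bit key words are then written into the low mantissa halves,
        ! so the later ldd+fsubd pairs yield each word pre-scaled by its limb
        ! weight (2^0, 2^32, 2^64, 2^96) as an exact double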
473
474         and     $inp,7,$shr
475         andn    $inp,7,$inp                     ! align pointer
476         mov     8,$i1
477         sll     $shr,3,$shr
478         mov     16,$i2
479         neg     $shr,$shl
480
481         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian key
482         ldxa    [$inp+$i1]0x88,$in2
483
484         brz     $shr,.Lkey_aligned_fma
485         sethi   %hi(0xf0000000),$i1             !   0xf0000000
486
487         ldxa    [$inp+$i2]0x88,$in4
488
489         srlx    $in0,$shr,$in0                  ! align data
490         sllx    $in2,$shl,$in1
491         srlx    $in2,$shr,$in2
492         or      $in1,$in0,$in0
493         sllx    $in4,$shl,$in3
494         or      $in3,$in2,$in2
495
496 .Lkey_aligned_fma:
497         or      $i1,3,$i2                       !   0xf0000003
498         srlx    $in0,32,$in1
499         andn    $in0,$i1,$in0                   ! &=0x0fffffff
500         andn    $in1,$i2,$in1                   ! &=0x0ffffffc
501         srlx    $in2,32,$in3
502         andn    $in2,$i2,$in2
503         andn    $in3,$i2,$in3
504
505         st      $in0,[$ctx+`8*4+4`]             ! fill "template"
506         st      $in1,[$ctx+`8*5+4`]
507         st      $in2,[$ctx+`8*6+4`]
508         st      $in3,[$ctx+`8*7+4`]
509
510         ldd     [$ctx+8*4],$h0lo                ! load [biased] key
511         ldd     [$ctx+8*5],$h1lo
512         ldd     [$ctx+8*6],$h2lo
513         ldd     [$ctx+8*7],$h3lo
514
515         fsubd   $h0lo,$two0, $h0lo              ! r0
516          ldd    [%o7+8*7],$two0                 ! more constants
517         fsubd   $h1lo,$two32,$h1lo              ! r1
518          ldd    [%o7+8*8],$two32
519         fsubd   $h2lo,$two64,$h2lo              ! r2
520          ldd    [%o7+8*9],$two64
521         fsubd   $h3lo,$two96,$h3lo              ! r3
522          ldd    [%o7+8*10],$two96
523
524         fmuld   $five_two130,$h1lo,$s1lo        ! s1
525         fmuld   $five_two130,$h2lo,$s2lo        ! s2
526         fmuld   $five_two130,$h3lo,$s3lo        ! s3
527
528         faddd   $h0lo,$two0, $h0hi
529         faddd   $h1lo,$two32,$h1hi
530         faddd   $h2lo,$two64,$h2hi
531         faddd   $h3lo,$two96,$h3hi
532
533         fsubd   $h0hi,$two0, $h0hi
534          ldd    [%o7+8*11],$two0                ! more constants
535         fsubd   $h1hi,$two32,$h1hi
536          ldd    [%o7+8*12],$two32
537         fsubd   $h2hi,$two64,$h2hi
538          ldd    [%o7+8*13],$two64
539         fsubd   $h3hi,$two96,$h3hi
540
541         fsubd   $h0lo,$h0hi,$h0lo
542          std    $h0hi,[$ctx+8*5]                ! r0hi
543         fsubd   $h1lo,$h1hi,$h1lo
544          std    $h1hi,[$ctx+8*7]                ! r1hi
545         fsubd   $h2lo,$h2hi,$h2lo
546          std    $h2hi,[$ctx+8*9]                ! r2hi
547         fsubd   $h3lo,$h3hi,$h3lo
548          std    $h3hi,[$ctx+8*11]               ! r3hi
549
550         faddd   $s1lo,$two0, $s1hi
551         faddd   $s2lo,$two32,$s2hi
552         faddd   $s3lo,$two64,$s3hi
553
554         fsubd   $s1hi,$two0, $s1hi
555         fsubd   $s2hi,$two32,$s2hi
556         fsubd   $s3hi,$two64,$s3hi
557
558         fsubd   $s1lo,$s1hi,$s1lo
559         fsubd   $s2lo,$s2hi,$s2lo
560         fsubd   $s3lo,$s3hi,$s3lo
561
562         ldx     [%sp+LOCALS],%fsr               ! restore %fsr
563
564         std     $h0lo,[$ctx+8*4]                ! r0lo
565         std     $h1lo,[$ctx+8*6]                ! r1lo
566         std     $h2lo,[$ctx+8*8]                ! r2lo
567         std     $h3lo,[$ctx+8*10]               ! r3lo
568
569         std     $s1hi,[$ctx+8*13]
570         std     $s2hi,[$ctx+8*15]
571         std     $s3hi,[$ctx+8*17]
572
573         std     $s1lo,[$ctx+8*12]
574         std     $s2lo,[$ctx+8*14]
575         std     $s3lo,[$ctx+8*16]
576
577         add     %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
578         add     %o7,poly1305_emit_fma-.Lconsts_fma,%o1
579         STPTR   %o0,[%i2]
580         STPTR   %o1,[%i2+SIZE_T]
581
582         ret
583         restore %g0,1,%o0                       ! return 1
584
585 .Lno_key_fma:
586         ret
587         restore %g0,%g0,%o0                     ! return 0
588 .size   poly1305_init_fma,.-poly1305_init_fma
589
590 .align  32
591 poly1305_blocks_fma:
592         save    %sp,-STACK_FRAME-48,%sp
593         srlx    $len,4,$len
594
595         brz,pn  $len,.Labort
596         sub     $len,1,$len
597
598 1:      call    .+8
599         add     %o7,.Lconsts_fma-1b,%o7
600
601         ldd     [%o7+8*0],$two0                 ! load constants
602         ldd     [%o7+8*1],$two32
603         ldd     [%o7+8*2],$two64
604         ldd     [%o7+8*3],$two96
605         ldd     [%o7+8*4],$two130
606         ldd     [%o7+8*5],$five_two130
607
608         ldd     [$ctx+8*0],$h0lo                ! load [biased] hash value
609         ldd     [$ctx+8*1],$h1lo
610         ldd     [$ctx+8*2],$h2lo
611         ldd     [$ctx+8*3],$h3lo
612
613         std     $two0,[%sp+LOCALS+8*0]          ! input "template"
614         sethi   %hi((1023+52+96)<<20),$in3
615         std     $two32,[%sp+LOCALS+8*1]
616         or      $padbit,$in3,$in3
617         std     $two64,[%sp+LOCALS+8*2]
618         st      $in3,[%sp+LOCALS+8*3]
619
620         and     $inp,7,$shr
621         andn    $inp,7,$inp                     ! align pointer
622         mov     8,$i1
623         sll     $shr,3,$shr
624         mov     16,$step
625         neg     $shr,$shl
626
627         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian input
628         brz     $shr,.Linp_aligned_fma
629         ldxa    [$inp+$i1]0x88,$in2
630
631         ldxa    [$inp+$step]0x88,$in4
632         add     $inp,8,$inp
633
634         srlx    $in0,$shr,$in0                  ! align data
635         sllx    $in2,$shl,$in1
636         srlx    $in2,$shr,$in2
637         or      $in1,$in0,$in0
638         sllx    $in4,$shl,$in3
639         srlx    $in4,$shr,$in4                  ! pre-shift
640         or      $in3,$in2,$in2
641
642 .Linp_aligned_fma:
643         srlx    $in0,32,$in1
644         movrz   $len,0,$step
645         srlx    $in2,32,$in3
646         add     $step,$inp,$inp                 ! conditional advance
647
648         st      $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
649         st      $in1,[%sp+LOCALS+8*1+4]
650         st      $in2,[%sp+LOCALS+8*2+4]
651         st      $in3,[%sp+LOCALS+8*3+4]
652
653         ldd     [$ctx+8*4],$r0lo                ! load key
654         ldd     [$ctx+8*5],$r0hi
655         ldd     [$ctx+8*6],$r1lo
656         ldd     [$ctx+8*7],$r1hi
657         ldd     [$ctx+8*8],$r2lo
658         ldd     [$ctx+8*9],$r2hi
659         ldd     [$ctx+8*10],$r3lo
660         ldd     [$ctx+8*11],$r3hi
661         ldd     [$ctx+8*12],$s1lo
662         ldd     [$ctx+8*13],$s1hi
663         ldd     [$ctx+8*14],$s2lo
664         ldd     [$ctx+8*15],$s2hi
665         ldd     [$ctx+8*16],$s3lo
666         ldd     [$ctx+8*17],$s3hi
667
668         stx     %fsr,[%sp+LOCALS+8*4]           ! save original %fsr
669         ldx     [%o7+8*6],%fsr                  ! load new %fsr
670
671         subcc   $len,1,$len
672         movrz   $len,0,$step
673
674         ldd     [%sp+LOCALS+8*0],$x0            ! load biased input
675         ldd     [%sp+LOCALS+8*1],$x1
676         ldd     [%sp+LOCALS+8*2],$x2
677         ldd     [%sp+LOCALS+8*3],$x3
678
679         fsubd   $h0lo,$two0, $h0lo              ! de-bias hash value
680         fsubd   $h1lo,$two32,$h1lo
681          ldxa   [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
682         fsubd   $h2lo,$two64,$h2lo
683         fsubd   $h3lo,$two96,$h3lo
684          ldxa   [$inp+$i1]0x88,$in2
685
686         fsubd   $x0,$two0, $x0                  ! de-bias input
687         fsubd   $x1,$two32,$x1
688         fsubd   $x2,$two64,$x2
689         fsubd   $x3,$two96,$x3
690
691         brz     $shr,.Linp_aligned_fma2
692         add     $step,$inp,$inp                 ! conditional advance
693
694         sllx    $in0,$shl,$in1                  ! align data
695         srlx    $in0,$shr,$in3
696         or      $in1,$in4,$in0
697         sllx    $in2,$shl,$in1
698         srlx    $in2,$shr,$in4                  ! pre-shift
699         or      $in3,$in1,$in2
700 .Linp_aligned_fma2:
701         srlx    $in0,32,$in1
702         srlx    $in2,32,$in3
703
704         faddd   $h0lo,$x0,$x0                   ! accumulate input
705          stw    $in0,[%sp+LOCALS+8*0+4]
706         faddd   $h1lo,$x1,$x1
707          stw    $in1,[%sp+LOCALS+8*1+4]
708         faddd   $h2lo,$x2,$x2
709          stw    $in2,[%sp+LOCALS+8*2+4]
710         faddd   $h3lo,$x3,$x3
711          stw    $in3,[%sp+LOCALS+8*3+4]
712
713         b       .Lentry_fma
714         nop
715
716 .align  16
717 .Loop_fma:
718         ldxa    [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
719         ldxa    [$inp+$i1]0x88,$in2
720         movrz   $len,0,$step
721
722         faddd   $y0,$h0lo,$h0lo                 ! accumulate input
723         faddd   $y1,$h0hi,$h0hi
724         faddd   $y2,$h2lo,$h2lo
725         faddd   $y3,$h2hi,$h2hi
726
727         brz,pn  $shr,.Linp_aligned_fma3
728         add     $step,$inp,$inp                 ! conditional advance
729
730         sllx    $in0,$shl,$in1                  ! align data
731         srlx    $in0,$shr,$in3
732         or      $in1,$in4,$in0
733         sllx    $in2,$shl,$in1
734         srlx    $in2,$shr,$in4                  ! pre-shift
735         or      $in3,$in1,$in2
736
737 .Linp_aligned_fma3:
738         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
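        ! carry propagation via the magic-constant trick: with round-to-zero
        ! set in %fsr, (x + 2^(52+k)) - 2^(52+k) yields x rounded toward zero
        ! to a multiple of 2^k; that part is moved into the next-higher limb,
        ! and the part at or above 2^130 is folded back through 5/2^130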
739         faddd   $two64,$h1lo,$c1lo
740          srlx   $in0,32,$in1
741         faddd   $two64,$h1hi,$c1hi
742          srlx   $in2,32,$in3
743         faddd   $two130,$h3lo,$c3lo
744          st     $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
745         faddd   $two130,$h3hi,$c3hi
746          st     $in1,[%sp+LOCALS+8*1+4]
747         faddd   $two32,$h0lo,$c0lo
748          st     $in2,[%sp+LOCALS+8*2+4]
749         faddd   $two32,$h0hi,$c0hi
750          st     $in3,[%sp+LOCALS+8*3+4]
751         faddd   $two96,$h2lo,$c2lo
752         faddd   $two96,$h2hi,$c2hi
753
754         fsubd   $c1lo,$two64,$c1lo
755         fsubd   $c1hi,$two64,$c1hi
756         fsubd   $c3lo,$two130,$c3lo
757         fsubd   $c3hi,$two130,$c3hi
758         fsubd   $c0lo,$two32,$c0lo
759         fsubd   $c0hi,$two32,$c0hi
760         fsubd   $c2lo,$two96,$c2lo
761         fsubd   $c2hi,$two96,$c2hi
762
763         fsubd   $h1lo,$c1lo,$h1lo
764         fsubd   $h1hi,$c1hi,$h1hi
765         fsubd   $h3lo,$c3lo,$h3lo
766         fsubd   $h3hi,$c3hi,$h3hi
767         fsubd   $h2lo,$c2lo,$h2lo
768         fsubd   $h2hi,$c2hi,$h2hi
769         fsubd   $h0lo,$c0lo,$h0lo
770         fsubd   $h0hi,$c0hi,$h0hi
771
772         faddd   $h1lo,$c0lo,$h1lo
773         faddd   $h1hi,$c0hi,$h1hi
774         faddd   $h3lo,$c2lo,$h3lo
775         faddd   $h3hi,$c2hi,$h3hi
776         faddd   $h2lo,$c1lo,$h2lo
777         faddd   $h2hi,$c1hi,$h2hi
778         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
779         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
780
781         faddd   $h1lo,$h1hi,$x1
782          ldd    [$ctx+8*12],$s1lo               ! reload constants
783         faddd   $h3lo,$h3hi,$x3
784          ldd    [$ctx+8*13],$s1hi
785         faddd   $h2lo,$h2hi,$x2
786          ldd    [$ctx+8*10],$r3lo
787         faddd   $h0lo,$h0hi,$x0
788          ldd    [$ctx+8*11],$r3hi
789
790 .Lentry_fma:
791         fmuld   $x1,$s3lo,$h0lo
792         fmuld   $x1,$s3hi,$h0hi
793         fmuld   $x1,$r1lo,$h2lo
794         fmuld   $x1,$r1hi,$h2hi
795         fmuld   $x1,$r0lo,$h1lo
796         fmuld   $x1,$r0hi,$h1hi
797         fmuld   $x1,$r2lo,$h3lo
798         fmuld   $x1,$r2hi,$h3hi
799
800         fmaddd  $x3,$s1lo,$h0lo,$h0lo
801         fmaddd  $x3,$s1hi,$h0hi,$h0hi
802         fmaddd  $x3,$s3lo,$h2lo,$h2lo
803         fmaddd  $x3,$s3hi,$h2hi,$h2hi
804         fmaddd  $x3,$s2lo,$h1lo,$h1lo
805         fmaddd  $x3,$s2hi,$h1hi,$h1hi
806         fmaddd  $x3,$r0lo,$h3lo,$h3lo
807         fmaddd  $x3,$r0hi,$h3hi,$h3hi
808
809         fmaddd  $x2,$s2lo,$h0lo,$h0lo
810         fmaddd  $x2,$s2hi,$h0hi,$h0hi
811         fmaddd  $x2,$r0lo,$h2lo,$h2lo
812         fmaddd  $x2,$r0hi,$h2hi,$h2hi
813         fmaddd  $x2,$s3lo,$h1lo,$h1lo
814          ldd    [%sp+LOCALS+8*0],$y0            ! load [biased] input
815         fmaddd  $x2,$s3hi,$h1hi,$h1hi
816          ldd    [%sp+LOCALS+8*1],$y1
817         fmaddd  $x2,$r1lo,$h3lo,$h3lo
818          ldd    [%sp+LOCALS+8*2],$y2
819         fmaddd  $x2,$r1hi,$h3hi,$h3hi
820          ldd    [%sp+LOCALS+8*3],$y3
821
822         fmaddd  $x0,$r0lo,$h0lo,$h0lo
823          fsubd  $y0,$two0, $y0                  ! de-bias input
824         fmaddd  $x0,$r0hi,$h0hi,$h0hi
825          fsubd  $y1,$two32,$y1
826         fmaddd  $x0,$r2lo,$h2lo,$h2lo
827          fsubd  $y2,$two64,$y2
828         fmaddd  $x0,$r2hi,$h2hi,$h2hi
829          fsubd  $y3,$two96,$y3
830         fmaddd  $x0,$r1lo,$h1lo,$h1lo
831         fmaddd  $x0,$r1hi,$h1hi,$h1hi
832         fmaddd  $x0,$r3lo,$h3lo,$h3lo
833         fmaddd  $x0,$r3hi,$h3hi,$h3hi
834
835         bcc     SIZE_T_CC,.Loop_fma
836         subcc   $len,1,$len
837
838         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
839         faddd   $h0lo,$two32,$c0lo
840         faddd   $h0hi,$two32,$c0hi
841         faddd   $h2lo,$two96,$c2lo
842         faddd   $h2hi,$two96,$c2hi
843         faddd   $h1lo,$two64,$c1lo
844         faddd   $h1hi,$two64,$c1hi
845         faddd   $h3lo,$two130,$c3lo
846         faddd   $h3hi,$two130,$c3hi
847
848         fsubd   $c0lo,$two32,$c0lo
849         fsubd   $c0hi,$two32,$c0hi
850         fsubd   $c2lo,$two96,$c2lo
851         fsubd   $c2hi,$two96,$c2hi
852         fsubd   $c1lo,$two64,$c1lo
853         fsubd   $c1hi,$two64,$c1hi
854         fsubd   $c3lo,$two130,$c3lo
855         fsubd   $c3hi,$two130,$c3hi
856
857         fsubd   $h1lo,$c1lo,$h1lo
858         fsubd   $h1hi,$c1hi,$h1hi
859         fsubd   $h3lo,$c3lo,$h3lo
860         fsubd   $h3hi,$c3hi,$h3hi
861         fsubd   $h2lo,$c2lo,$h2lo
862         fsubd   $h2hi,$c2hi,$h2hi
863         fsubd   $h0lo,$c0lo,$h0lo
864         fsubd   $h0hi,$c0hi,$h0hi
865
866         faddd   $h1lo,$c0lo,$h1lo
867         faddd   $h1hi,$c0hi,$h1hi
868         faddd   $h3lo,$c2lo,$h3lo
869         faddd   $h3hi,$c2hi,$h3hi
870         faddd   $h2lo,$c1lo,$h2lo
871         faddd   $h2hi,$c1hi,$h2hi
872         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
873         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
874
875         faddd   $h1lo,$h1hi,$x1
876         faddd   $h3lo,$h3hi,$x3
877         faddd   $h2lo,$h2hi,$x2
878         faddd   $h0lo,$h0hi,$x0
879
880         faddd   $x1,$two32,$x1                  ! bias
881         faddd   $x3,$two96,$x3
882         faddd   $x2,$two64,$x2
883         faddd   $x0,$two0, $x0
884
885         ldx     [%sp+LOCALS+8*4],%fsr           ! restore saved %fsr
886
887         std     $x1,[$ctx+8*1]                  ! store [biased] hash value
888         std     $x3,[$ctx+8*3]
889         std     $x2,[$ctx+8*2]
890         std     $x0,[$ctx+8*0]
891
892 .Labort:
893         ret
894         restore
895 .size   poly1305_blocks_fma,.-poly1305_blocks_fma
896 ___
897 {
898 my ($mac,$nonce)=($inp,$len);
899
900 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
901    ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
902
903 $code.=<<___;
904 .align  32
905 poly1305_emit_fma:
906         save    %sp,-STACK_FRAME,%sp
907
908         ld      [$ctx+8*0+0],$d0                ! load hash
909         ld      [$ctx+8*0+4],$h0
910         ld      [$ctx+8*1+0],$d1
911         ld      [$ctx+8*1+4],$h1
912         ld      [$ctx+8*2+0],$d2
913         ld      [$ctx+8*2+4],$h2
914         ld      [$ctx+8*3+0],$d3
915         ld      [$ctx+8*3+4],$h3
916
917         sethi   %hi(0xfff00000),$mask
918         andn    $d0,$mask,$d0                   ! mask exponent
919         andn    $d1,$mask,$d1
920         andn    $d2,$mask,$d2
921         andn    $d3,$mask,$d3                   ! can be partially reduced...
922         mov     3,$mask
923
924         srl     $d3,2,$padbit                   ! ... so reduce
925         and     $d3,$mask,$h4
926         andn    $d3,$mask,$d3
927         add     $padbit,$d3,$d3
928
929         addcc   $d3,$h0,$h0
930         addccc  $d0,$h1,$h1
931         addccc  $d1,$h2,$h2
932         addccc  $d2,$h3,$h3
933         addc    %g0,$h4,$h4
934
935         addcc   $h0,5,$d0                       ! compare to modulus
936         addccc  $h1,0,$d1
937         addccc  $h2,0,$d2
938         addccc  $h3,0,$d3
939         addc    $h4,0,$mask
940
941         srl     $mask,2,$mask                   ! did it carry/borrow?
942         neg     $mask,$mask
943         sra     $mask,31,$mask                  ! mask
944
945         andn    $h0,$mask,$h0
946         and     $d0,$mask,$d0
947         andn    $h1,$mask,$h1
948         and     $d1,$mask,$d1
949         or      $d0,$h0,$h0
950         ld      [$nonce+0],$d0                  ! load nonce
951         andn    $h2,$mask,$h2
952         and     $d2,$mask,$d2
953         or      $d1,$h1,$h1
954         ld      [$nonce+4],$d1
955         andn    $h3,$mask,$h3
956         and     $d3,$mask,$d3
957         or      $d2,$h2,$h2
958         ld      [$nonce+8],$d2
959         or      $d3,$h3,$h3
960         ld      [$nonce+12],$d3
961
962         addcc   $d0,$h0,$h0                     ! accumulate nonce
963         addccc  $d1,$h1,$h1
964         addccc  $d2,$h2,$h2
965         addc    $d3,$h3,$h3
966
967         stb     $h0,[$mac+0]                    ! write little-endian result
968         srl     $h0,8,$h0
969         stb     $h1,[$mac+4]
970         srl     $h1,8,$h1
971         stb     $h2,[$mac+8]
972         srl     $h2,8,$h2
973         stb     $h3,[$mac+12]
974         srl     $h3,8,$h3
975
976         stb     $h0,[$mac+1]
977         srl     $h0,8,$h0
978         stb     $h1,[$mac+5]
979         srl     $h1,8,$h1
980         stb     $h2,[$mac+9]
981         srl     $h2,8,$h2
982         stb     $h3,[$mac+13]
983         srl     $h3,8,$h3
984
985         stb     $h0,[$mac+2]
986         srl     $h0,8,$h0
987         stb     $h1,[$mac+6]
988         srl     $h1,8,$h1
989         stb     $h2,[$mac+10]
990         srl     $h2,8,$h2
991         stb     $h3,[$mac+14]
992         srl     $h3,8,$h3
993
994         stb     $h0,[$mac+3]
995         stb     $h1,[$mac+7]
996         stb     $h2,[$mac+11]
997         stb     $h3,[$mac+15]
998
999         ret
1000         restore
1001 .size   poly1305_emit_fma,.-poly1305_emit_fma
1002 ___
1003 }
1004
1005 $code.=<<___;
1006 .align  64
1007 .Lconsts_fma:
1008 .word   0x43300000,0x00000000           ! 2^(52+0)
1009 .word   0x45300000,0x00000000           ! 2^(52+32)
1010 .word   0x47300000,0x00000000           ! 2^(52+64)
1011 .word   0x49300000,0x00000000           ! 2^(52+96)
1012 .word   0x4b500000,0x00000000           ! 2^(52+130)
1013
1014 .word   0x37f40000,0x00000000           ! 5/2^130
1015 .word   0,1<<30                         ! fsr: truncate, no exceptions
1016
1017 .word   0x44300000,0x00000000           ! 2^(52+16+0)
1018 .word   0x46300000,0x00000000           ! 2^(52+16+32)
1019 .word   0x48300000,0x00000000           ! 2^(52+16+64)
1020 .word   0x4a300000,0x00000000           ! 2^(52+16+96)
1021 .word   0x3e300000,0x00000000           ! 2^(52+16+0-96)
1022 .word   0x40300000,0x00000000           ! 2^(52+16+32-96)
1023 .word   0x42300000,0x00000000           ! 2^(52+16+64-96)
1024 .asciz  "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1025 .align  4
1026 ___
1027 }
1028 \f
1029 # The purpose of these subroutines is to explicitly encode VIS3 and FMA
1030 # instructions, so that the module can be compiled without specifying
1031 # the extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1032 # The idea is to keep the option of producing a "universal" binary and let
1033 # the programmer detect at run-time whether the current CPU is VIS capable.
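# For instance (illustrative; register numbers follow the %bias table in
# unvis3() below), the VIS3 instruction
#
#       addxc   %o0,%o1,%o2
#
# has rd=10, rs1=8, rs2=9 and opf=0x011, so unvis3() rewrites it as
#
#       .word   0x95b20229      ! addxc %o0,%o1,%o2
#
# which any SPARCv9 assembler accepts, VIS3-aware or not.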
1034 sub unvis3 {
1035 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1036 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1037 my ($ref,$opf);
1038 my %visopf = (  "addxc"         => 0x011,
1039                 "addxccc"       => 0x013,
1040                 "umulxhi"       => 0x016        );
1041
1042     $ref = "$mnemonic\t$rs1,$rs2,$rd";
1043
1044     if ($opf=$visopf{$mnemonic}) {
1045         foreach ($rs1,$rs2,$rd) {
1046             return $ref if (!/%([goli])([0-9])/);
1047             $_=$bias{$1}+$2;
1048         }
1049
1050         return  sprintf ".word\t0x%08x !%s",
1051                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1052                         $ref;
1053     } else {
1054         return $ref;
1055     }
1056 }
1057
1058 sub unfma {
1059 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1060 my ($ref,$opf);
1061 my %fmaopf = (  "fmadds"        => 0x1,
1062                 "fmaddd"        => 0x2,
1063                 "fmsubs"        => 0x5,
1064                 "fmsubd"        => 0x6          );
1065
1066     $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1067
1068     if ($opf=$fmaopf{$mnemonic}) {
1069         foreach ($rs1,$rs2,$rs3,$rd) {
1070             return $ref if (!/%f([0-9]{1,2})/);
1071             $_=$1;
1072             if ($1>=32) {
1073                 return $ref if ($1&1);
1074                 # re-encode for upper double register addressing
1075                 $_=($1|$1>>5)&31;
1076             }
1077         }
1078
1079         return  sprintf ".word\t0x%08x !%s",
1080                         0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
1081                         $ref;
1082     } else {
1083         return $ref;
1084     }
1085 }
1086
1087 foreach (split("\n",$code)) {
1088         s/\`([^\`]*)\`/eval $1/ge;
1089
1090         s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1091                 &unvis3($1,$2,$3,$4)
1092          /ge    or
1093         s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
1094                 &unfma($1,$2,$3,$4,$5)
1095          /ge;
1096
1097         print $_,"\n";
1098 }
1099
1100 close STDOUT;