SPARCv9 assembly pack: fine-tune run-time switch.
[openssl.git] / crypto / poly1305 / asm / poly1305-sparcv9.pl
1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements Poly1305 hash for SPARCv9, vanilla, as well
11 # as VIS3 and FMA extensions.
12 #
13 # May, August 2015
14 #
15 # Numbers are cycles per processed byte with poly1305_blocks alone.
16 #
17 #                       IALU(*)         FMA
18 #
19 # UltraSPARC III        12.3(**)
20 # SPARC T3              7.92
21 # SPARC T4              1.70(***)       6.55
22 # SPARC64 X             5.60            3.64
23 #
24 # (*)   Comparison to compiler-generated code is really problematic,
25 #       because latter's performance varies too much depending on too
26 #       many variables. For example, one can measure from 5x to 15x
27 #       improvement on T4 for gcc-4.6. Well, in T4 case it's a bit
28 #       unfair comparison, because compiler doesn't use VIS3, but
29 #       given same initial conditions coefficient varies from 3x to 9x.
30 # (**)  Pre-III performance should be even worse; floating-point
31 #       performance for UltraSPARC I-IV on the other hand is reported
32 #       to be 4.25 for hand-coded assembly, but they are just too old
33 #       to care about.
34 # (***) Multi-process benchmark saturates at ~12.5x single-process
35 #       result on 8-core processor, or ~21GBps per 2.85GHz socket.
36
# The last command-line argument names the output file. Everything this
# script emits is printed to STDOUT, so STDOUT is redirected there exactly
# once. With no argument the output stays on the inherited STDOUT; a named
# file that cannot be opened is a fatal error instead of being ignored.
# (A second, duplicated "pop"/"open" pair that interpolated the undeclared
# variable $stdout used to follow the register maps; it could never succeed
# and has been removed.)
37 my $output = pop;
38 open(STDOUT,">",$output) or die "can't open $output: $!" if defined $output;
39
# Register allocation for the integer-only code paths:
#   %i0-%i5          arguments and alignment shift counts
#   %l0-%l7          key words r0-r3, s1-s3, and the top hash word h4
#   %o0-%o5,%o7      hash words h0-h3 and temporaries t0-t2 (%o6 is skipped)
#   %g1-%g4          product accumulators d0-d3
40 my ($ctx,$inp,$len,$padbit,$shl,$shr)   = map("%i$_",(0..5));
41 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)    = map("%l$_",(0..7));
42 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)       = map("%o$_",(0..5,7));
43 my ($d0,$d1,$d2,$d3)                    = map("%g$_",(1..4));
47
# Preamble plus the integer-only poly1305_init and poly1305_blocks.
#
# Preamble: STPTR/SIZE_T select stx/8 when __arch64__ is defined and st/4
# otherwise; LOCALS is STACK_BIAS+STACK_FRAME.
#
# poly1305_init:
#   - reads the word at OPENSSL_sparcv9cap_P and branches to
#     .Lpoly1305_init_fma when the SPARCV9_FMADD bit is set and the
#     SPARCV9_VIS3 bit is clear;
#   - otherwise zeroes the 24 bytes of hash value at ctx+0 and returns 0
#     if the key pointer ($inp) is zero;
#   - loads 16 key bytes with little-endian loads from the 8-byte-aligned
#     address, shifting/merging a third doubleword when the pointer is
#     misaligned, masks them with 0x0ffffffc0fffffff and
#     0x0ffffffc0ffffffc, and stores the result at ctx+32;
#   - if the SPARCV9_VIS3 bit is set, stores the addresses of
#     poly1305_blocks_vis3 and poly1305_emit through %i2 and returns 1;
#     otherwise returns 0.
#
# poly1305_blocks:
#   - rounds $len down to a multiple of 16 and returns if nothing is left;
#   - loads key and hash as 32-bit words and precomputes s1-s3 as
#     r + (r >> 2);
#   - for each 16-byte block: little-endian load (with realignment), add
#     the block and $padbit to the hash, multiply by the key with 32x32
#     umul, fold the bits above bit 1 of h4 back into h0, and loop;
#   - stores the five hash words back to ctx+0..ctx+16.
48 $code.=<<___;
49 #include "sparc_arch.h"
50
51 #ifdef  __arch64__
52 .register       %g2,#scratch
53 .register       %g3,#scratch
54 # define        STPTR   stx
55 # define        SIZE_T  8
56 #else
57 # define        STPTR   st
58 # define        SIZE_T  4
59 #endif
60 #define LOCALS  (STACK_BIAS+STACK_FRAME)
61
62 .section        ".text",#alloc,#execinstr
63
64 #ifdef __PIC__
65 SPARC_PIC_THUNK(%g1)
66 #endif
67
68 .globl  poly1305_init
69 .align  32
70 poly1305_init:
71         save    %sp,-STACK_FRAME-16,%sp
72         nop
73
74         SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
75         ld      [%g1],%g1
76
77         and     %g1,SPARCV9_FMADD|SPARCV9_VIS3,%g1
78         cmp     %g1,SPARCV9_FMADD
79         be      .Lpoly1305_init_fma
80         nop
81
82         stx     %g0,[$ctx+0]
83         stx     %g0,[$ctx+8]            ! zero hash value
84         brz,pn  $inp,.Lno_key
85         stx     %g0,[$ctx+16]
86
87         and     $inp,7,$shr             ! alignment factor
88         andn    $inp,7,$inp
89         sll     $shr,3,$shr             ! *8
90         neg     $shr,$shl
91
92         sethi   %hi(0x0ffffffc),$t0
93         set     8,$h1
94         or      $t0,%lo(0x0ffffffc),$t0
95         set     16,$h2
96         sllx    $t0,32,$t1
97         or      $t0,$t1,$t1             ! 0x0ffffffc0ffffffc
98         or      $t1,3,$t0               ! 0x0ffffffc0fffffff
99
100         ldxa    [$inp+%g0]0x88,$h0      ! load little-endian key
101         brz,pt  $shr,.Lkey_aligned
102         ldxa    [$inp+$h1]0x88,$h1
103
104         ldxa    [$inp+$h2]0x88,$h2
105         srlx    $h0,$shr,$h0
106         sllx    $h1,$shl,$t2
107         srlx    $h1,$shr,$h1
108         or      $t2,$h0,$h0
109         sllx    $h2,$shl,$h2
110         or      $h2,$h1,$h1
111
112 .Lkey_aligned:
113         and     $t0,$h0,$h0
114         and     $t1,$h1,$h1
115         stx     $h0,[$ctx+32+0]         ! store key
116         stx     $h1,[$ctx+32+8]
117
118         andcc   %g1,SPARCV9_VIS3,%g0
119         be      .Lno_key
120         nop
121
122 1:      call    .+8
123         add     %o7,poly1305_blocks_vis3-1b,%o7
124
125         add     %o7,poly1305_emit-poly1305_blocks_vis3,%o5
126         STPTR   %o7,[%i2]
127         STPTR   %o5,[%i2+SIZE_T]
128
129         ret
130         restore %g0,1,%o0               ! return 1
131
132 .Lno_key:
133         ret
134         restore %g0,%g0,%o0             ! return 0
135 .size   poly1305_init,.-poly1305_init
136
137 .globl  poly1305_blocks
138 .align  32
139 poly1305_blocks:
140         save    %sp,-STACK_FRAME,%sp
141         andn    $len,15,$len
142
143         brz,pn  $len,.Lno_data
144         nop
145
146         ld      [$ctx+32+0],$r1         ! load key
147         ld      [$ctx+32+4],$r0
148         ld      [$ctx+32+8],$r3
149         ld      [$ctx+32+12],$r2
150
151         ld      [$ctx+0],$h1            ! load hash value
152         ld      [$ctx+4],$h0
153         ld      [$ctx+8],$h3
154         ld      [$ctx+12],$h2
155         ld      [$ctx+16],$h4
156
157         and     $inp,7,$shr             ! alignment factor
158         andn    $inp,7,$inp
159         set     8,$d1
160         sll     $shr,3,$shr             ! *8
161         set     16,$d2
162         neg     $shr,$shl
163
164         srl     $r1,2,$s1
165         srl     $r2,2,$s2
166         add     $r1,$s1,$s1
167         srl     $r3,2,$s3
168         add     $r2,$s2,$s2
169         add     $r3,$s3,$s3
170
171 .Loop:
172         ldxa    [$inp+%g0]0x88,$d0      ! load little-endian input
173         brz,pt  $shr,.Linp_aligned
174         ldxa    [$inp+$d1]0x88,$d1
175
176         ldxa    [$inp+$d2]0x88,$d2
177         srlx    $d0,$shr,$d0
178         sllx    $d1,$shl,$t1
179         srlx    $d1,$shr,$d1
180         or      $t1,$d0,$d0
181         sllx    $d2,$shl,$d2
182         or      $d2,$d1,$d1
183
184 .Linp_aligned:
185         srlx    $d0,32,$t0
186         addcc   $d0,$h0,$h0             ! accumulate input
187         srlx    $d1,32,$t1
188         addccc  $t0,$h1,$h1
189         addccc  $d1,$h2,$h2
190         addccc  $t1,$h3,$h3
191         addc    $padbit,$h4,$h4
192
193         umul    $r0,$h0,$d0
194         umul    $r1,$h0,$d1
195         umul    $r2,$h0,$d2
196         umul    $r3,$h0,$d3
197          sub    $len,16,$len
198          add    $inp,16,$inp
199
200         umul    $s3,$h1,$t0
201         umul    $r0,$h1,$t1
202         umul    $r1,$h1,$t2
203         add     $t0,$d0,$d0
204         add     $t1,$d1,$d1
205         umul    $r2,$h1,$t0
206         add     $t2,$d2,$d2
207         add     $t0,$d3,$d3
208
209         umul    $s2,$h2,$t1
210         umul    $s3,$h2,$t2
211         umul    $r0,$h2,$t0
212         add     $t1,$d0,$d0
213         add     $t2,$d1,$d1
214         umul    $r1,$h2,$t1
215         add     $t0,$d2,$d2
216         add     $t1,$d3,$d3
217
218         umul    $s1,$h3,$t2
219         umul    $s2,$h3,$t0
220         umul    $s3,$h3,$t1
221         add     $t2,$d0,$d0
222         add     $t0,$d1,$d1
223         umul    $r0,$h3,$t2
224         add     $t1,$d2,$d2
225         add     $t2,$d3,$d3
226
227         umul    $s1,$h4,$t0
228         umul    $s2,$h4,$t1
229         umul    $s3,$h4,$t2
230         umul    $r0,$h4,$h4
231         add     $t0,$d1,$d1
232         add     $t1,$d2,$d2
233         srlx    $d0,32,$h1
234         add     $t2,$d3,$d3
235         srlx    $d1,32,$h2
236
237         addcc   $d1,$h1,$h1
238         srlx    $d2,32,$h3
239          set    8,$d1
240         addccc  $d2,$h2,$h2
241         srlx    $d3,32,$t0
242          set    16,$d2
243         addccc  $d3,$h3,$h3
244         addc    $t0,$h4,$h4
245
246         srl     $h4,2,$t0               ! final reduction step
247         andn    $h4,3,$t1
248         and     $h4,3,$h4
249         add     $t1,$t0,$t0
250
251         addcc   $t0,$d0,$h0
252         addccc  %g0,$h1,$h1
253         addccc  %g0,$h2,$h2
254         addccc  %g0,$h3,$h3
255         brnz,pt $len,.Loop
256         addc    %g0,$h4,$h4
257
258         st      $h1,[$ctx+0]            ! store hash value
259         st      $h0,[$ctx+4]
260         st      $h3,[$ctx+8]
261         st      $h2,[$ctx+12]
262         st      $h4,[$ctx+16]
263
264 .Lno_data:
265         ret
266         restore
267 .size   poly1305_blocks,.-poly1305_blocks
268 ___
269 ########################################################################
270 # VIS3 has umulxhi and addxc...
# poly1305_blocks_vis3: the same block function as poly1305_blocks, but
# operating on 64-bit quantities with mulx/umulxhi/addxc/addxccc.
#
# The scope below gives its own names to %o0-%o5,%o7 (H0-H2, R0, R1, S1, T1)
# and %g1-%g4 (D0-D2, T0); it still uses the outer $ctx/$inp/$len/$padbit/
# $shr/$shl and borrows $r1/$r2 (%l1/%l2) as the constant offsets 8 and 16.
#
# Flow: round $len down to a multiple of 16 and leave through .Lno_data
# (defined in poly1305_blocks) if nothing remains; load key (ctx+32) and
# hash (ctx+0, ctx+8, 32-bit word at ctx+16); compute S1 = R1 + (R1 >> 2);
# then per 16-byte block: little-endian load with realignment, accumulate
# input and $padbit, multiply, fold the bits above bit 1 of D2 back into
# H0, and loop. The hash is stored back at the end.
271 {
272 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
273 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
274
275 $code.=<<___;
276 .align  32
277 poly1305_blocks_vis3:
278         save    %sp,-STACK_FRAME,%sp
279         andn    $len,15,$len
280
281         brz,pn  $len,.Lno_data
282         nop
283
284         ldx     [$ctx+32+0],$R0         ! load key
285         ldx     [$ctx+32+8],$R1
286
287         ldx     [$ctx+0],$H0            ! load hash value
288         ldx     [$ctx+8],$H1
289         ld      [$ctx+16],$H2
290
291         and     $inp,7,$shr             ! alignment factor
292         andn    $inp,7,$inp
293         set     8,$r1
294         sll     $shr,3,$shr             ! *8
295         set     16,$r2
296         neg     $shr,$shl
297
298         srlx    $R1,2,$S1
299         b       .Loop_vis3
300         add     $R1,$S1,$S1
301
302 .Loop_vis3:
303         ldxa    [$inp+%g0]0x88,$D0      ! load little-endian input
304         brz,pt  $shr,.Linp_aligned_vis3
305         ldxa    [$inp+$r1]0x88,$D1
306
307         ldxa    [$inp+$r2]0x88,$D2
308         srlx    $D0,$shr,$D0
309         sllx    $D1,$shl,$T1
310         srlx    $D1,$shr,$D1
311         or      $T1,$D0,$D0
312         sllx    $D2,$shl,$D2
313         or      $D2,$D1,$D1
314
315 .Linp_aligned_vis3:
316         addcc   $D0,$H0,$H0             ! accumulate input
317          sub    $len,16,$len
318         addxccc $D1,$H1,$H1
319          add    $inp,16,$inp
320
321         mulx    $R0,$H0,$D0             ! r0*h0
322         addxc   $padbit,$H2,$H2
323         umulxhi $R0,$H0,$D1
324         mulx    $S1,$H1,$T0             ! s1*h1
325         umulxhi $S1,$H1,$T1
326         addcc   $T0,$D0,$D0
327         mulx    $R1,$H0,$T0             ! r1*h0
328         addxc   $T1,$D1,$D1
329         umulxhi $R1,$H0,$D2
330         addcc   $T0,$D1,$D1
331         mulx    $R0,$H1,$T0             ! r0*h1
332         addxc   %g0,$D2,$D2
333         umulxhi $R0,$H1,$T1
334         addcc   $T0,$D1,$D1
335         mulx    $S1,$H2,$T0             ! s1*h2
336         addxc   $T1,$D2,$D2
337         mulx    $R0,$H2,$T1             ! r0*h2
338         addcc   $T0,$D1,$D1
339         addxc   $T1,$D2,$D2
340
341         srlx    $D2,2,$T0               ! final reduction step
342         andn    $D2,3,$T1
343         and     $D2,3,$H2
344         add     $T1,$T0,$T0
345
346         addcc   $T0,$D0,$H0
347         addxccc %g0,$D1,$H1
348         brnz,pt $len,.Loop_vis3
349         addxc   %g0,$H2,$H2
350
351         stx     $H0,[$ctx+0]            ! store hash value
352         stx     $H1,[$ctx+8]
353         st      $H2,[$ctx+16]
354
355         ret
356         restore
357 .size   poly1305_blocks_vis3,.-poly1305_blocks_vis3
358 ___
359 }
# poly1305_emit: final reduction, nonce addition and tag output for the
# integer code paths.
#
# $mac and $nonce are aliases for the second and third argument registers
# ($inp and $len, i.e. %i1 and %i2).
#
# Flow: load the five 32-bit hash words; compute hash+5 with carry
# propagation and test bit 2 of the top word; if it is set, select the
# hash+5 words via movnz, otherwise keep the original words; add the four
# 32-bit nonce words with carry; store the 16 result bytes to $mac one
# byte at a time, least-significant byte first.
360 my ($mac,$nonce) = ($inp,$len);
361
362 $code.=<<___;
363 .globl  poly1305_emit
364 .align  32
365 poly1305_emit:
366         save    %sp,-STACK_FRAME,%sp
367
368         ld      [$ctx+0],$h1            ! load hash value
369         ld      [$ctx+4],$h0
370         ld      [$ctx+8],$h3
371         ld      [$ctx+12],$h2
372         ld      [$ctx+16],$h4
373
374         addcc   $h0,5,$r0               ! compare to modulus
375         addccc  $h1,0,$r1
376         addccc  $h2,0,$r2
377         addccc  $h3,0,$r3
378         addc    $h4,0,$h4
379         andcc   $h4,4,%g0               ! did it carry/borrow?
380
381         movnz   %icc,$r0,$h0
382         ld      [$nonce+0],$r0          ! load nonce
383         movnz   %icc,$r1,$h1
384         ld      [$nonce+4],$r1
385         movnz   %icc,$r2,$h2
386         ld      [$nonce+8],$r2
387         movnz   %icc,$r3,$h3
388         ld      [$nonce+12],$r3
389
390         addcc   $r0,$h0,$h0             ! accumulate nonce
391         addccc  $r1,$h1,$h1
392         addccc  $r2,$h2,$h2
393         addc    $r3,$h3,$h3
394
395         srl     $h0,8,$r0
396         stb     $h0,[$mac+0]            ! store little-endian result
397         srl     $h0,16,$r1
398         stb     $r0,[$mac+1]
399         srl     $h0,24,$r2
400         stb     $r1,[$mac+2]
401         stb     $r2,[$mac+3]
402
403         srl     $h1,8,$r0
404         stb     $h1,[$mac+4]
405         srl     $h1,16,$r1
406         stb     $r0,[$mac+5]
407         srl     $h1,24,$r2
408         stb     $r1,[$mac+6]
409         stb     $r2,[$mac+7]
410
411         srl     $h2,8,$r0
412         stb     $h2,[$mac+8]
413         srl     $h2,16,$r1
414         stb     $r0,[$mac+9]
415         srl     $h2,24,$r2
416         stb     $r1,[$mac+10]
417         stb     $r2,[$mac+11]
418
419         srl     $h3,8,$r0
420         stb     $h3,[$mac+12]
421         srl     $h3,16,$r1
422         stb     $r0,[$mac+13]
423         srl     $h3,24,$r2
424         stb     $r1,[$mac+14]
425         stb     $r2,[$mac+15]
426
427         ret
428         restore
429 .size   poly1305_emit,.-poly1305_emit
430 ___
431
432 {
433 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
434 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
435 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
436 my $i2=$step;
437
438 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
439     $two0,$two32,$two64,$two96,$two130,$five_two130,
440     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
441     $s2lo,$s2hi,$s3lo,$s3hi,
442     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
443 # borrowings
444 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
445 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
446 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
447
448 $code.=<<___;
449 .align  32
450 poly1305_init_fma:
451         save    %sp,-STACK_FRAME-16,%sp
452         nop
453
454 .Lpoly1305_init_fma:
455 1:      call    .+8
456         add     %o7,.Lconsts_fma-1b,%o7
457
458         ldd     [%o7+8*0],$two0                 ! load constants
459         ldd     [%o7+8*1],$two32
460         ldd     [%o7+8*2],$two64
461         ldd     [%o7+8*3],$two96
462         ldd     [%o7+8*5],$five_two130
463
464         std     $two0,[$ctx+8*0]                ! initial hash value, biased 0
465         std     $two32,[$ctx+8*1]
466         std     $two64,[$ctx+8*2]
467         std     $two96,[$ctx+8*3]
468
469         brz,pn  $inp,.Lno_key_fma
470         nop
471
472         stx     %fsr,[%sp+LOCALS]               ! save original %fsr
473         ldx     [%o7+8*6],%fsr                  ! load new %fsr
474
475         std     $two0,[$ctx+8*4]                ! key "template"
476         std     $two32,[$ctx+8*5]
477         std     $two64,[$ctx+8*6]
478         std     $two96,[$ctx+8*7]
479
480         and     $inp,7,$shr
481         andn    $inp,7,$inp                     ! align pointer
482         mov     8,$i1
483         sll     $shr,3,$shr
484         mov     16,$i2
485         neg     $shr,$shl
486
487         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian key
488         ldxa    [$inp+$i1]0x88,$in2
489
490         brz     $shr,.Lkey_aligned_fma
491         sethi   %hi(0xf0000000),$i1             !   0xf0000000
492
493         ldxa    [$inp+$i2]0x88,$in4
494
495         srlx    $in0,$shr,$in0                  ! align data
496         sllx    $in2,$shl,$in1
497         srlx    $in2,$shr,$in2
498         or      $in1,$in0,$in0
499         sllx    $in4,$shl,$in3
500         or      $in3,$in2,$in2
501
502 .Lkey_aligned_fma:
503         or      $i1,3,$i2                       !   0xf0000003
504         srlx    $in0,32,$in1
505         andn    $in0,$i1,$in0                   ! &=0x0fffffff
506         andn    $in1,$i2,$in1                   ! &=0x0ffffffc
507         srlx    $in2,32,$in3
508         andn    $in2,$i2,$in2
509         andn    $in3,$i2,$in3
510
511         st      $in0,[$ctx+`8*4+4`]             ! fill "template"
512         st      $in1,[$ctx+`8*5+4`]
513         st      $in2,[$ctx+`8*6+4`]
514         st      $in3,[$ctx+`8*7+4`]
515
516         ldd     [$ctx+8*4],$h0lo                ! load [biased] key
517         ldd     [$ctx+8*5],$h1lo
518         ldd     [$ctx+8*6],$h2lo
519         ldd     [$ctx+8*7],$h3lo
520
521         fsubd   $h0lo,$two0, $h0lo              ! r0
522          ldd    [%o7+8*7],$two0                 ! more constants
523         fsubd   $h1lo,$two32,$h1lo              ! r1
524          ldd    [%o7+8*8],$two32
525         fsubd   $h2lo,$two64,$h2lo              ! r2
526          ldd    [%o7+8*9],$two64
527         fsubd   $h3lo,$two96,$h3lo              ! r3
528          ldd    [%o7+8*10],$two96
529
530         fmuld   $five_two130,$h1lo,$s1lo        ! s1
531         fmuld   $five_two130,$h2lo,$s2lo        ! s2
532         fmuld   $five_two130,$h3lo,$s3lo        ! s3
533
534         faddd   $h0lo,$two0, $h0hi
535         faddd   $h1lo,$two32,$h1hi
536         faddd   $h2lo,$two64,$h2hi
537         faddd   $h3lo,$two96,$h3hi
538
539         fsubd   $h0hi,$two0, $h0hi
540          ldd    [%o7+8*11],$two0                ! more constants
541         fsubd   $h1hi,$two32,$h1hi
542          ldd    [%o7+8*12],$two32
543         fsubd   $h2hi,$two64,$h2hi
544          ldd    [%o7+8*13],$two64
545         fsubd   $h3hi,$two96,$h3hi
546
547         fsubd   $h0lo,$h0hi,$h0lo
548          std    $h0hi,[$ctx+8*5]                ! r0hi
549         fsubd   $h1lo,$h1hi,$h1lo
550          std    $h1hi,[$ctx+8*7]                ! r1hi
551         fsubd   $h2lo,$h2hi,$h2lo
552          std    $h2hi,[$ctx+8*9]                ! r2hi
553         fsubd   $h3lo,$h3hi,$h3lo
554          std    $h3hi,[$ctx+8*11]               ! r3hi
555
556         faddd   $s1lo,$two0, $s1hi
557         faddd   $s2lo,$two32,$s2hi
558         faddd   $s3lo,$two64,$s3hi
559
560         fsubd   $s1hi,$two0, $s1hi
561         fsubd   $s2hi,$two32,$s2hi
562         fsubd   $s3hi,$two64,$s3hi
563
564         fsubd   $s1lo,$s1hi,$s1lo
565         fsubd   $s2lo,$s2hi,$s2lo
566         fsubd   $s3lo,$s3hi,$s3lo
567
568         ldx     [%sp+LOCALS],%fsr               ! restore %fsr
569
570         std     $h0lo,[$ctx+8*4]                ! r0lo
571         std     $h1lo,[$ctx+8*6]                ! r1lo
572         std     $h2lo,[$ctx+8*8]                ! r2lo
573         std     $h3lo,[$ctx+8*10]               ! r3lo
574
575         std     $s1hi,[$ctx+8*13]
576         std     $s2hi,[$ctx+8*15]
577         std     $s3hi,[$ctx+8*17]
578
579         std     $s1lo,[$ctx+8*12]
580         std     $s2lo,[$ctx+8*14]
581         std     $s3lo,[$ctx+8*16]
582
583         add     %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
584         add     %o7,poly1305_emit_fma-.Lconsts_fma,%o1
585         STPTR   %o0,[%i2]
586         STPTR   %o1,[%i2+SIZE_T]
587
588         ret
589         restore %g0,1,%o0                       ! return 1
590
591 .Lno_key_fma:
592         ret
593         restore %g0,%g0,%o0                     ! return 0
594 .size   poly1305_init_fma,.-poly1305_init_fma
595
596 .align  32
597 poly1305_blocks_fma:
598         save    %sp,-STACK_FRAME-48,%sp
599         srlx    $len,4,$len
600
601         brz,pn  $len,.Labort
602         sub     $len,1,$len
603
604 1:      call    .+8
605         add     %o7,.Lconsts_fma-1b,%o7
606
607         ldd     [%o7+8*0],$two0                 ! load constants
608         ldd     [%o7+8*1],$two32
609         ldd     [%o7+8*2],$two64
610         ldd     [%o7+8*3],$two96
611         ldd     [%o7+8*4],$two130
612         ldd     [%o7+8*5],$five_two130
613
614         ldd     [$ctx+8*0],$h0lo                ! load [biased] hash value
615         ldd     [$ctx+8*1],$h1lo
616         ldd     [$ctx+8*2],$h2lo
617         ldd     [$ctx+8*3],$h3lo
618
619         std     $two0,[%sp+LOCALS+8*0]          ! input "template"
620         sethi   %hi((1023+52+96)<<20),$in3
621         std     $two32,[%sp+LOCALS+8*1]
622         or      $padbit,$in3,$in3
623         std     $two64,[%sp+LOCALS+8*2]
624         st      $in3,[%sp+LOCALS+8*3]
625
626         and     $inp,7,$shr
627         andn    $inp,7,$inp                     ! align pointer
628         mov     8,$i1
629         sll     $shr,3,$shr
630         mov     16,$step
631         neg     $shr,$shl
632
633         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian input
634         brz     $shr,.Linp_aligned_fma
635         ldxa    [$inp+$i1]0x88,$in2
636
637         ldxa    [$inp+$step]0x88,$in4
638         add     $inp,8,$inp
639
640         srlx    $in0,$shr,$in0                  ! align data
641         sllx    $in2,$shl,$in1
642         srlx    $in2,$shr,$in2
643         or      $in1,$in0,$in0
644         sllx    $in4,$shl,$in3
645         srlx    $in4,$shr,$in4                  ! pre-shift
646         or      $in3,$in2,$in2
647
648 .Linp_aligned_fma:
649         srlx    $in0,32,$in1
650         movrz   $len,0,$step
651         srlx    $in2,32,$in3
652         add     $step,$inp,$inp                 ! conditional advance
653
654         st      $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
655         st      $in1,[%sp+LOCALS+8*1+4]
656         st      $in2,[%sp+LOCALS+8*2+4]
657         st      $in3,[%sp+LOCALS+8*3+4]
658
659         ldd     [$ctx+8*4],$r0lo                ! load key
660         ldd     [$ctx+8*5],$r0hi
661         ldd     [$ctx+8*6],$r1lo
662         ldd     [$ctx+8*7],$r1hi
663         ldd     [$ctx+8*8],$r2lo
664         ldd     [$ctx+8*9],$r2hi
665         ldd     [$ctx+8*10],$r3lo
666         ldd     [$ctx+8*11],$r3hi
667         ldd     [$ctx+8*12],$s1lo
668         ldd     [$ctx+8*13],$s1hi
669         ldd     [$ctx+8*14],$s2lo
670         ldd     [$ctx+8*15],$s2hi
671         ldd     [$ctx+8*16],$s3lo
672         ldd     [$ctx+8*17],$s3hi
673
674         stx     %fsr,[%sp+LOCALS+8*4]           ! save original %fsr
675         ldx     [%o7+8*6],%fsr                  ! load new %fsr
676
677         subcc   $len,1,$len
678         movrz   $len,0,$step
679
680         ldd     [%sp+LOCALS+8*0],$x0            ! load biased input
681         ldd     [%sp+LOCALS+8*1],$x1
682         ldd     [%sp+LOCALS+8*2],$x2
683         ldd     [%sp+LOCALS+8*3],$x3
684
685         fsubd   $h0lo,$two0, $h0lo              ! de-bias hash value
686         fsubd   $h1lo,$two32,$h1lo
687          ldxa   [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
688         fsubd   $h2lo,$two64,$h2lo
689         fsubd   $h3lo,$two96,$h3lo
690          ldxa   [$inp+$i1]0x88,$in2
691
692         fsubd   $x0,$two0, $x0                  ! de-bias input
693         fsubd   $x1,$two32,$x1
694         fsubd   $x2,$two64,$x2
695         fsubd   $x3,$two96,$x3
696
697         brz     $shr,.Linp_aligned_fma2
698         add     $step,$inp,$inp                 ! conditional advance
699
700         sllx    $in0,$shl,$in1                  ! align data
701         srlx    $in0,$shr,$in3
702         or      $in1,$in4,$in0
703         sllx    $in2,$shl,$in1
704         srlx    $in2,$shr,$in4                  ! pre-shift
705         or      $in3,$in1,$in2
706 .Linp_aligned_fma2:
707         srlx    $in0,32,$in1
708         srlx    $in2,32,$in3
709
710         faddd   $h0lo,$x0,$x0                   ! accumulate input
711          stw    $in0,[%sp+LOCALS+8*0+4]
712         faddd   $h1lo,$x1,$x1
713          stw    $in1,[%sp+LOCALS+8*1+4]
714         faddd   $h2lo,$x2,$x2
715          stw    $in2,[%sp+LOCALS+8*2+4]
716         faddd   $h3lo,$x3,$x3
717          stw    $in3,[%sp+LOCALS+8*3+4]
718
719         b       .Lentry_fma
720         nop
721
722 .align  16
723 .Loop_fma:
724         ldxa    [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
725         ldxa    [$inp+$i1]0x88,$in2
726         movrz   $len,0,$step
727
728         faddd   $y0,$h0lo,$h0lo                 ! accumulate input
729         faddd   $y1,$h0hi,$h0hi
730         faddd   $y2,$h2lo,$h2lo
731         faddd   $y3,$h2hi,$h2hi
732
733         brz,pn  $shr,.Linp_aligned_fma3
734         add     $step,$inp,$inp                 ! conditional advance
735
736         sllx    $in0,$shl,$in1                  ! align data
737         srlx    $in0,$shr,$in3
738         or      $in1,$in4,$in0
739         sllx    $in2,$shl,$in1
740         srlx    $in2,$shr,$in4                  ! pre-shift
741         or      $in3,$in1,$in2
742
743 .Linp_aligned_fma3:
744         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
745         faddd   $two64,$h1lo,$c1lo
746          srlx   $in0,32,$in1
747         faddd   $two64,$h1hi,$c1hi
748          srlx   $in2,32,$in3
749         faddd   $two130,$h3lo,$c3lo
750          st     $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
751         faddd   $two130,$h3hi,$c3hi
752          st     $in1,[%sp+LOCALS+8*1+4]
753         faddd   $two32,$h0lo,$c0lo
754          st     $in2,[%sp+LOCALS+8*2+4]
755         faddd   $two32,$h0hi,$c0hi
756          st     $in3,[%sp+LOCALS+8*3+4]
757         faddd   $two96,$h2lo,$c2lo
758         faddd   $two96,$h2hi,$c2hi
759
760         fsubd   $c1lo,$two64,$c1lo
761         fsubd   $c1hi,$two64,$c1hi
762         fsubd   $c3lo,$two130,$c3lo
763         fsubd   $c3hi,$two130,$c3hi
764         fsubd   $c0lo,$two32,$c0lo
765         fsubd   $c0hi,$two32,$c0hi
766         fsubd   $c2lo,$two96,$c2lo
767         fsubd   $c2hi,$two96,$c2hi
768
769         fsubd   $h1lo,$c1lo,$h1lo
770         fsubd   $h1hi,$c1hi,$h1hi
771         fsubd   $h3lo,$c3lo,$h3lo
772         fsubd   $h3hi,$c3hi,$h3hi
773         fsubd   $h2lo,$c2lo,$h2lo
774         fsubd   $h2hi,$c2hi,$h2hi
775         fsubd   $h0lo,$c0lo,$h0lo
776         fsubd   $h0hi,$c0hi,$h0hi
777
778         faddd   $h1lo,$c0lo,$h1lo
779         faddd   $h1hi,$c0hi,$h1hi
780         faddd   $h3lo,$c2lo,$h3lo
781         faddd   $h3hi,$c2hi,$h3hi
782         faddd   $h2lo,$c1lo,$h2lo
783         faddd   $h2hi,$c1hi,$h2hi
784         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
785         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
786
787         faddd   $h1lo,$h1hi,$x1
788          ldd    [$ctx+8*12],$s1lo               ! reload constants
789         faddd   $h3lo,$h3hi,$x3
790          ldd    [$ctx+8*13],$s1hi
791         faddd   $h2lo,$h2hi,$x2
792          ldd    [$ctx+8*10],$r3lo
793         faddd   $h0lo,$h0hi,$x0
794          ldd    [$ctx+8*11],$r3hi
795
796 .Lentry_fma:
797         fmuld   $x1,$s3lo,$h0lo
798         fmuld   $x1,$s3hi,$h0hi
799         fmuld   $x1,$r1lo,$h2lo
800         fmuld   $x1,$r1hi,$h2hi
801         fmuld   $x1,$r0lo,$h1lo
802         fmuld   $x1,$r0hi,$h1hi
803         fmuld   $x1,$r2lo,$h3lo
804         fmuld   $x1,$r2hi,$h3hi
805
806         fmaddd  $x3,$s1lo,$h0lo,$h0lo
807         fmaddd  $x3,$s1hi,$h0hi,$h0hi
808         fmaddd  $x3,$s3lo,$h2lo,$h2lo
809         fmaddd  $x3,$s3hi,$h2hi,$h2hi
810         fmaddd  $x3,$s2lo,$h1lo,$h1lo
811         fmaddd  $x3,$s2hi,$h1hi,$h1hi
812         fmaddd  $x3,$r0lo,$h3lo,$h3lo
813         fmaddd  $x3,$r0hi,$h3hi,$h3hi
814
815         fmaddd  $x2,$s2lo,$h0lo,$h0lo
816         fmaddd  $x2,$s2hi,$h0hi,$h0hi
817         fmaddd  $x2,$r0lo,$h2lo,$h2lo
818         fmaddd  $x2,$r0hi,$h2hi,$h2hi
819         fmaddd  $x2,$s3lo,$h1lo,$h1lo
820          ldd    [%sp+LOCALS+8*0],$y0            ! load [biased] input
821         fmaddd  $x2,$s3hi,$h1hi,$h1hi
822          ldd    [%sp+LOCALS+8*1],$y1
823         fmaddd  $x2,$r1lo,$h3lo,$h3lo
824          ldd    [%sp+LOCALS+8*2],$y2
825         fmaddd  $x2,$r1hi,$h3hi,$h3hi
826          ldd    [%sp+LOCALS+8*3],$y3
827
828         fmaddd  $x0,$r0lo,$h0lo,$h0lo
829          fsubd  $y0,$two0, $y0                  ! de-bias input
830         fmaddd  $x0,$r0hi,$h0hi,$h0hi
831          fsubd  $y1,$two32,$y1
832         fmaddd  $x0,$r2lo,$h2lo,$h2lo
833          fsubd  $y2,$two64,$y2
834         fmaddd  $x0,$r2hi,$h2hi,$h2hi
835          fsubd  $y3,$two96,$y3
836         fmaddd  $x0,$r1lo,$h1lo,$h1lo
837         fmaddd  $x0,$r1hi,$h1hi,$h1hi
838         fmaddd  $x0,$r3lo,$h3lo,$h3lo
839         fmaddd  $x0,$r3hi,$h3hi,$h3hi
840
841         bcc     SIZE_T_CC,.Loop_fma
842         subcc   $len,1,$len
843
844         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
845         faddd   $h0lo,$two32,$c0lo
846         faddd   $h0hi,$two32,$c0hi
847         faddd   $h2lo,$two96,$c2lo
848         faddd   $h2hi,$two96,$c2hi
849         faddd   $h1lo,$two64,$c1lo
850         faddd   $h1hi,$two64,$c1hi
851         faddd   $h3lo,$two130,$c3lo
852         faddd   $h3hi,$two130,$c3hi
853
854         fsubd   $c0lo,$two32,$c0lo
855         fsubd   $c0hi,$two32,$c0hi
856         fsubd   $c2lo,$two96,$c2lo
857         fsubd   $c2hi,$two96,$c2hi
858         fsubd   $c1lo,$two64,$c1lo
859         fsubd   $c1hi,$two64,$c1hi
860         fsubd   $c3lo,$two130,$c3lo
861         fsubd   $c3hi,$two130,$c3hi
862
863         fsubd   $h1lo,$c1lo,$h1lo
864         fsubd   $h1hi,$c1hi,$h1hi
865         fsubd   $h3lo,$c3lo,$h3lo
866         fsubd   $h3hi,$c3hi,$h3hi
867         fsubd   $h2lo,$c2lo,$h2lo
868         fsubd   $h2hi,$c2hi,$h2hi
869         fsubd   $h0lo,$c0lo,$h0lo
870         fsubd   $h0hi,$c0hi,$h0hi
871
872         faddd   $h1lo,$c0lo,$h1lo
873         faddd   $h1hi,$c0hi,$h1hi
874         faddd   $h3lo,$c2lo,$h3lo
875         faddd   $h3hi,$c2hi,$h3hi
876         faddd   $h2lo,$c1lo,$h2lo
877         faddd   $h2hi,$c1hi,$h2hi
878         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
879         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
880
881         faddd   $h1lo,$h1hi,$x1
882         faddd   $h3lo,$h3hi,$x3
883         faddd   $h2lo,$h2hi,$x2
884         faddd   $h0lo,$h0hi,$x0
885
886         faddd   $x1,$two32,$x1                  ! bias
887         faddd   $x3,$two96,$x3
888         faddd   $x2,$two64,$x2
889         faddd   $x0,$two0, $x0
890
891         ldx     [%sp+LOCALS+8*4],%fsr           ! restore saved %fsr
892
893         std     $x1,[$ctx+8*1]                  ! store [biased] hash value
894         std     $x3,[$ctx+8*3]
895         std     $x2,[$ctx+8*2]
896         std     $x0,[$ctx+8*0]
897
898 .Labort:
899         ret
900         restore
901 .size   poly1305_blocks_fma,.-poly1305_blocks_fma
902 ___
903 {
# This bare block appends the poly1305_emit_fma routine to $code; the
# scalars declared here are lexical to the block.
#
# $mac and $nonce hold the same register names as $inp and $len (%i1 and
# %i2 according to the assignments at the top of the file).
904 my ($mac,$nonce)=($inp,$len);
905
# Register assignment used by the template below: $h0-$h4 take %l0-%l4,
# $d0 takes %l5, $d1-$d3 take %o0-%o2 and $mask takes %o3. The right-hand
# list has eleven entries for ten variables, so the trailing %o4 is simply
# not assigned to anything.
906 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
907    ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
908
# Everything up to the closing ___ is an interpolating here-document, i.e.
# assembly text rather than Perl ("!" starts an assembler comment). Going
# by those inline comments the routine loads the hash from $ctx, masks the
# exponent bits, finishes the partial reduction, compares against the
# modulus, accumulates the nonce and stores 16 bytes to $mac one byte at a
# time in little-endian order. $padbit is used purely as a temporary.
909 $code.=<<___;
910 .align  32
911 poly1305_emit_fma:
912         save    %sp,-STACK_FRAME,%sp
913
914         ld      [$ctx+8*0+0],$d0                ! load hash
915         ld      [$ctx+8*0+4],$h0
916         ld      [$ctx+8*1+0],$d1
917         ld      [$ctx+8*1+4],$h1
918         ld      [$ctx+8*2+0],$d2
919         ld      [$ctx+8*2+4],$h2
920         ld      [$ctx+8*3+0],$d3
921         ld      [$ctx+8*3+4],$h3
922
923         sethi   %hi(0xfff00000),$mask
924         andn    $d0,$mask,$d0                   ! mask exponent
925         andn    $d1,$mask,$d1
926         andn    $d2,$mask,$d2
927         andn    $d3,$mask,$d3                   ! can be partially reduced...
928         mov     3,$mask
929
930         srl     $d3,2,$padbit                   ! ... so reduce
931         and     $d3,$mask,$h4
932         andn    $d3,$mask,$d3
933         add     $padbit,$d3,$d3
934
935         addcc   $d3,$h0,$h0
936         addccc  $d0,$h1,$h1
937         addccc  $d1,$h2,$h2
938         addccc  $d2,$h3,$h3
939         addc    %g0,$h4,$h4
940
941         addcc   $h0,5,$d0                       ! compare to modulus
942         addccc  $h1,0,$d1
943         addccc  $h2,0,$d2
944         addccc  $h3,0,$d3
945         addc    $h4,0,$mask
946
947         srl     $mask,2,$mask                   ! did it carry/borrow?
948         neg     $mask,$mask
949         sra     $mask,31,$mask                  ! mask
950
951         andn    $h0,$mask,$h0
952         and     $d0,$mask,$d0
953         andn    $h1,$mask,$h1
954         and     $d1,$mask,$d1
955         or      $d0,$h0,$h0
956         ld      [$nonce+0],$d0                  ! load nonce
957         andn    $h2,$mask,$h2
958         and     $d2,$mask,$d2
959         or      $d1,$h1,$h1
960         ld      [$nonce+4],$d1
961         andn    $h3,$mask,$h3
962         and     $d3,$mask,$d3
963         or      $d2,$h2,$h2
964         ld      [$nonce+8],$d2
965         or      $d3,$h3,$h3
966         ld      [$nonce+12],$d3
967
968         addcc   $d0,$h0,$h0                     ! accumulate nonce
969         addccc  $d1,$h1,$h1
970         addccc  $d2,$h2,$h2
971         addc    $d3,$h3,$h3
972
973         stb     $h0,[$mac+0]                    ! write little-endian result
974         srl     $h0,8,$h0
975         stb     $h1,[$mac+4]
976         srl     $h1,8,$h1
977         stb     $h2,[$mac+8]
978         srl     $h2,8,$h2
979         stb     $h3,[$mac+12]
980         srl     $h3,8,$h3
981
982         stb     $h0,[$mac+1]
983         srl     $h0,8,$h0
984         stb     $h1,[$mac+5]
985         srl     $h1,8,$h1
986         stb     $h2,[$mac+9]
987         srl     $h2,8,$h2
988         stb     $h3,[$mac+13]
989         srl     $h3,8,$h3
990
991         stb     $h0,[$mac+2]
992         srl     $h0,8,$h0
993         stb     $h1,[$mac+6]
994         srl     $h1,8,$h1
995         stb     $h2,[$mac+10]
996         srl     $h2,8,$h2
997         stb     $h3,[$mac+14]
998         srl     $h3,8,$h3
999
1000         stb     $h0,[$mac+3]
1001         stb     $h1,[$mac+7]
1002         stb     $h2,[$mac+11]
1003         stb     $h3,[$mac+15]
1004
1005         ret
1006         restore
1007 .size   poly1305_emit_fma,.-poly1305_emit_fma
1008 ___
1009 }
1010
# Constant pool for the FMA code path, appended to $code as assembly text
# under the label .Lconsts_fma with 64-byte alignment. Every entry is a
# pair of 32-bit words; going by the inline notes the pairs are
# double-precision bit patterns written high word first (e.g. 0x43300000,0
# is 2^52), except for the pair annotated "fsr", which is a value meant
# for the %fsr register. The "\@" in the .asciz string keeps this
# interpolating here-document from treating "@openssl" as a Perl array.
1011 $code.=<<___;
1012 .align  64
1013 .Lconsts_fma:
1014 .word   0x43300000,0x00000000           ! 2^(52+0)
1015 .word   0x45300000,0x00000000           ! 2^(52+32)
1016 .word   0x47300000,0x00000000           ! 2^(52+64)
1017 .word   0x49300000,0x00000000           ! 2^(52+96)
1018 .word   0x4b500000,0x00000000           ! 2^(52+130)
1019
1020 .word   0x37f40000,0x00000000           ! 5/2^130
1021 .word   0,1<<30                         ! fsr: truncate, no exceptions
1022
1023 .word   0x44300000,0x00000000           ! 2^(52+16+0)
1024 .word   0x46300000,0x00000000           ! 2^(52+16+32)
1025 .word   0x48300000,0x00000000           ! 2^(52+16+64)
1026 .word   0x4a300000,0x00000000           ! 2^(52+16+96)
1027 .word   0x3e300000,0x00000000           ! 2^(52+16+0-96)
1028 .word   0x40300000,0x00000000           ! 2^(52+16+32-96)
1029 .word   0x42300000,0x00000000           ! 2^(52+16+64-96)
1030 .asciz  "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1031 .align  4
1032 ___
1033 }
1034 \f
# The purpose of the subroutines below is to encode VIS instructions
# explicitly, so that the module can be compiled without having to enable
# VIS extensions on the compiler command line, e.g. -xarch=v9 rather than
# -xarch=v9a. The idea is to keep open the option of producing a
# "universal" binary and letting the programmer detect at run-time whether
# the current CPU is VIS capable.
#
# unvis3($mnemonic,$rs1,$rs2,$rd)
#
#   $mnemonic      instruction name; addxc, addxccc and umulxhi are encoded
#   $rs1,$rs2,$rd  integer register operands of the form %g0-%g7, %o0-%o7,
#                  %l0-%l7 or %i0-%i7
#
# Returns ".word\t0x........ !<instruction>" holding the hand-assembled
# opcode followed by the original instruction as an assembler comment.
# If the mnemonic is not in the table, or any operand is not exactly a
# register name from the list above, the instruction text is returned
# unchanged so that the assembler gets to see (and diagnose) it.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
# first register number of each integer register bank
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my %visopf = (  "addxc"         => 0x011,
                "addxccc"       => 0x013,
                "umulxhi"       => 0x016        );
my $ref = "$mnemonic\t$rs1,$rs2,$rd";

    my $opf = $visopf{$mnemonic};
    return $ref if (!defined($opf));

    my @regno;
    foreach my $reg ($rs1,$rs2,$rd) {
        # Anchored and limited to 0-7: a stray digit such as %o8 must not
        # spill over into the next bank and get encoded as %l0.
        return $ref if ($reg !~ /\A%([goli])([0-7])\z/);
        push @regno, $bias{$1}+$2;
    }
    ($rs1,$rs2,$rd) = @regno;

    return  sprintf ".word\t0x%08x !%s",
                    0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                    $ref;
}
1063
# unfma($mnemonic,$rs1,$rs2,$rs3,$rd)
#
#   $mnemonic            instruction name; fmadds, fmaddd, fmsubs and
#                        fmsubd are encoded
#   $rs1,$rs2,$rs3,$rd   floating-point register operands of the form %fN
#
# Returns ".word\t0x........ !<instruction>" holding the hand-assembled
# opcode followed by the original instruction as an assembler comment.
# The instruction text is returned unchanged, leaving it to the assembler,
# if the mnemonic is not in the table or if any operand is not a register
# that the instruction can address:
#   - single-precision forms accept %f0-%f31 only;
#   - double-precision forms accept even registers %f0-%f62 only.
sub unfma {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my %fmaopf = (  "fmadds"        => 0x1,
                "fmaddd"        => 0x2,
                "fmsubs"        => 0x5,
                "fmsubd"        => 0x6          );
my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    my $opf = $fmaopf{$mnemonic};
    return $ref if (!defined($opf));

    my $double = ($mnemonic =~ /d\z/);
    my @field;
    foreach my $reg ($rs1,$rs2,$rs3,$rd) {
        # Anchored, so that e.g. %f100 is not silently read as %f10.
        return $ref if ($reg !~ /\A%f([0-9]{1,2})\z/);
        my $num = int($1);
        if ($double) {
            # An odd or too large number would alias some other register
            # once squeezed into the 5-bit field.
            return $ref if ($num>62 || ($num&1));
            # re-encode for upper double register addressing: bit 5 of
            # the register number travels in bit 0 of the field
            $num = ($num|$num>>5)&31;
        } else {
            return $ref if ($num>31);
        }
        push @field, $num;
    }
    ($rs1,$rs2,$rs3,$rd) = @field;

    return  sprintf ".word\t0x%08x !%s",
                    0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                    $ref;
}
1092
# Post-process the accumulated assembly one line at a time and write the
# result to STDOUT.
foreach (split("\n",$code)) {
        # replace every `...` span with the result of evaluating it as Perl
        s/\`([^\`]*)\`/eval $1/ge;

        # Hand-encode VIS3 instructions; the FMA substitution is attempted
        # only on lines where no VIS3 instruction was replaced.
        # NOTE(review): unfma also has opcodes for fmsubs/fmsubd, but the
        # pattern here dispatches fmadds/fmaddd only.
        s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                unvis3($1,$2,$3,$4)
         /ge    or
        s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
                unfma($1,$2,$3,$4,$5)
         /ge;

        print $_,"\n";
}

# Output is buffered, so a failed write (full disk, broken pipe) may only
# be reported here; without the check a truncated assembly file would be
# left behind with a successful exit status.
close STDOUT or die "error closing STDOUT: $!";