#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for SPARCv9, vanilla, as well
# as VIS3 and FMA extensions.
#
# May, August 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone.
#
#                       IALU(*)         FMA
#
# UltraSPARC III        11.9(**)
# SPARC T3              7.85
# SPARC T4              1.67(***)       6.55
# SPARC64 X             5.54            3.64
#
# (*)   Comparison to compiler-generated code is really problematic,
#       because the latter's performance varies too much depending on
#       too many variables. For example, one can measure from 5x to
#       15x improvement on T4 for gcc-4.6. Well, in the T4 case it's
#       a bit of an unfair comparison, because the compiler doesn't
#       use VIS3, but given the same initial conditions the
#       coefficient varies from 3x to 9x.
# (**)  Pre-III performance should be even worse; floating-point
#       performance for UltraSPARC I-IV on the other hand is reported
#       to be 4.25 for hand-coded assembly, but those CPUs are just
#       too old to care about.
# (***) Multi-process benchmark saturates at ~12.5x the single-process
#       result on an 8-core processor, or ~21GBps per 2.85GHz socket.

my ($ctx,$inp,$len,$padbit,$shl,$shr)   = map("%i$_",(0..5));
my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)    = map("%l$_",(0..7));
my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)       = map("%o$_",(0..5,7));
my ($d0,$d1,$d2,$d3)                    = map("%g$_",(1..4));

$code.=<<___;
#include "sparc_arch.h"

#ifdef  __arch64__
.register       %g2,#scratch
.register       %g3,#scratch
# define        STPTR   stx
# define        SIZE_T  8
#else
# define        STPTR   st
# define        SIZE_T  4
#endif
#define LOCALS  (STACK_BIAS+STACK_FRAME)

.section        ".text",#alloc,#execinstr

#ifdef __PIC__
SPARC_PIC_THUNK(%g1)
#endif

.globl  poly1305_init
.align  32
poly1305_init:
        save    %sp,-STACK_FRAME-16,%sp
        nop

        SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
        ld      [%g1],%g1

        and     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU|SPARCV9_VIS3,%g1
        cmp     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU
        be      .Lpoly1305_init_fma
        nop

        stx     %g0,[$ctx+0]
        stx     %g0,[$ctx+8]            ! zero hash value
        brz,pn  $inp,.Lno_key
        stx     %g0,[$ctx+16]

        and     $inp,7,$shr             ! alignment factor
        andn    $inp,7,$inp
        sll     $shr,3,$shr             ! *8
        neg     $shr,$shl

        sethi   %hi(0x0ffffffc),$t0
        set     8,$h1
        or      $t0,%lo(0x0ffffffc),$t0
        set     16,$h2
        sllx    $t0,32,$t1
        or      $t0,$t1,$t1             ! 0x0ffffffc0ffffffc
        or      $t1,3,$t0               ! 0x0ffffffc0fffffff

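        ! The pointer was rounded down to 8-byte alignment above; if it was
        ! unaligned, the key is reconstructed below by shifting and merging
        ! adjacent aligned doublewords.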
        ldxa    [$inp+%g0]0x88,$h0      ! load little-endian key
        brz,pt  $shr,.Lkey_aligned
        ldxa    [$inp+$h1]0x88,$h1

        ldxa    [$inp+$h2]0x88,$h2
        srlx    $h0,$shr,$h0
        sllx    $h1,$shl,$t2
        srlx    $h1,$shr,$h1
        or      $t2,$h0,$h0
        sllx    $h2,$shl,$h2
        or      $h2,$h1,$h1

.Lkey_aligned:
        and     $t0,$h0,$h0
        and     $t1,$h1,$h1
        stx     $h0,[$ctx+32+0]         ! store key
        stx     $h1,[$ctx+32+8]

        andcc   %g1,SPARCV9_VIS3,%g0
        be      .Lno_key
        nop

1:      call    .+8
        add     %o7,poly1305_blocks_vis3-1b,%o7

        add     %o7,poly1305_emit-poly1305_blocks_vis3,%o5
        STPTR   %o7,[%i2]
        STPTR   %o5,[%i2+SIZE_T]

        ret
        restore %g0,1,%o0               ! return 1

.Lno_key:
        ret
        restore %g0,%g0,%o0             ! return 0
.size   poly1305_init,.-poly1305_init

.globl  poly1305_blocks
.align  32
poly1305_blocks:
        save    %sp,-STACK_FRAME,%sp
        andn    $len,15,$len

        brz,pn  $len,.Lno_data
        nop

        ld      [$ctx+32+0],$r1         ! load key
        ld      [$ctx+32+4],$r0
        ld      [$ctx+32+8],$r3
        ld      [$ctx+32+12],$r2

        ld      [$ctx+0],$h1            ! load hash value
        ld      [$ctx+4],$h0
        ld      [$ctx+8],$h3
        ld      [$ctx+12],$h2
        ld      [$ctx+16],$h4

        and     $inp,7,$shr             ! alignment factor
        andn    $inp,7,$inp
        set     8,$d1
        sll     $shr,3,$shr             ! *8
        set     16,$d2
        neg     $shr,$shl

        srl     $r1,2,$s1
        srl     $r2,2,$s2
        add     $r1,$s1,$s1
        srl     $r3,2,$s3
        add     $r2,$s2,$s2
        add     $r3,$s3,$s3

.Loop:
        ldxa    [$inp+%g0]0x88,$d0      ! load little-endian input
        brz,pt  $shr,.Linp_aligned
        ldxa    [$inp+$d1]0x88,$d1

        ldxa    [$inp+$d2]0x88,$d2
        srlx    $d0,$shr,$d0
        sllx    $d1,$shl,$t1
        srlx    $d1,$shr,$d1
        or      $t1,$d0,$d0
        sllx    $d2,$shl,$d2
        or      $d2,$d1,$d1

.Linp_aligned:
        srlx    $d0,32,$t0
        addcc   $d0,$h0,$h0             ! accumulate input
        srlx    $d1,32,$t1
        addccc  $t0,$h1,$h1
        addccc  $d1,$h2,$h2
        addccc  $t1,$h3,$h3
        addc    $padbit,$h4,$h4

        umul    $r0,$h0,$d0
        umul    $r1,$h0,$d1
        umul    $r2,$h0,$d2
        umul    $r3,$h0,$d3
         sub    $len,16,$len
         add    $inp,16,$inp

        umul    $s3,$h1,$t0
        umul    $r0,$h1,$t1
        umul    $r1,$h1,$t2
        add     $t0,$d0,$d0
        add     $t1,$d1,$d1
        umul    $r2,$h1,$t0
        add     $t2,$d2,$d2
        add     $t0,$d3,$d3

        umul    $s2,$h2,$t1
        umul    $s3,$h2,$t2
        umul    $r0,$h2,$t0
        add     $t1,$d0,$d0
        add     $t2,$d1,$d1
        umul    $r1,$h2,$t1
        add     $t0,$d2,$d2
        add     $t1,$d3,$d3

        umul    $s1,$h3,$t2
        umul    $s2,$h3,$t0
        umul    $s3,$h3,$t1
        add     $t2,$d0,$d0
        add     $t0,$d1,$d1
        umul    $r0,$h3,$t2
        add     $t1,$d2,$d2
        add     $t2,$d3,$d3

        umul    $s1,$h4,$t0
        umul    $s2,$h4,$t1
        umul    $s3,$h4,$t2
        umul    $r0,$h4,$h4
        add     $t0,$d1,$d1
        add     $t1,$d2,$d2
        srlx    $d0,32,$h1
        add     $t2,$d3,$d3
        srlx    $d1,32,$h2

        addcc   $d1,$h1,$h1
        srlx    $d2,32,$h3
         set    8,$d1
        addccc  $d2,$h2,$h2
        srlx    $d3,32,$t0
         set    16,$d2
        addccc  $d3,$h3,$h3
        addc    $t0,$h4,$h4

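        ! The limb h4 counts 2^128 units, so h4>>2 counts 2^130 units; since
        ! 2^130 = 5 mod 2^130-5, those units are folded back into the low
        ! limb as 5*(h4>>2), i.e. (h4>>2)+(h4 with its two low bits cleared).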
        srl     $h4,2,$t0               ! final reduction step
        andn    $h4,3,$t1
        and     $h4,3,$h4
        add     $t1,$t0,$t0

        addcc   $t0,$d0,$h0
        addccc  %g0,$h1,$h1
        addccc  %g0,$h2,$h2
        brnz,pt $len,.Loop
        addc    %g0,$h3,$h3

        st      $h1,[$ctx+0]            ! store hash value
        st      $h0,[$ctx+4]
        st      $h3,[$ctx+8]
        st      $h2,[$ctx+12]
        st      $h4,[$ctx+16]

.Lno_data:
        ret
        restore
.size   poly1305_blocks,.-poly1305_blocks
___
########################################################################
# VIS3 has umulxhi and addxc...
{
my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));

$code.=<<___;
.align  32
poly1305_blocks_vis3:
        save    %sp,-STACK_FRAME,%sp
        andn    $len,15,$len

        brz,pn  $len,.Lno_data
        nop

        ldx     [$ctx+32+0],$R0         ! load key
        ldx     [$ctx+32+8],$R1

        ldx     [$ctx+0],$H0            ! load hash value
        ldx     [$ctx+8],$H1
        ld      [$ctx+16],$H2

        and     $inp,7,$shr             ! alignment factor
        andn    $inp,7,$inp
        set     8,$r1
        sll     $shr,3,$shr             ! *8
        set     16,$r2
        neg     $shr,$shl

        srlx    $R1,2,$S1
        add     $R1,$S1,$S1

.Loop_vis3:
        ldxa    [$inp+%g0]0x88,$D0      ! load little-endian input
        brz,pt  $shr,.Linp_aligned_vis3
        ldxa    [$inp+$r1]0x88,$D1

        ldxa    [$inp+$r2]0x88,$D2
        srlx    $D0,$shr,$D0
        sllx    $D1,$shl,$T1
        srlx    $D1,$shr,$D1
        or      $T1,$D0,$D0
        sllx    $D2,$shl,$D2
        or      $D2,$D1,$D1

.Linp_aligned_vis3:
        addcc   $D0,$H0,$H0             ! accumulate input
         sub    $len,16,$len
        addxccc $D1,$H1,$H1
         add    $inp,16,$inp

        mulx    $R0,$H0,$D0             ! r0*h0
        addxc   $padbit,$H2,$H2
        umulxhi $R0,$H0,$D1
        mulx    $S1,$H1,$T0             ! s1*h1
        umulxhi $S1,$H1,$T1
        addcc   $T0,$D0,$D0
        mulx    $R1,$H0,$T0             ! r1*h0
        addxc   $T1,$D1,$D1
        umulxhi $R1,$H0,$D2
        addcc   $T0,$D1,$D1
        mulx    $R0,$H1,$T0             ! r0*h1
        addxc   %g0,$D2,$D2
        umulxhi $R0,$H1,$T1
        addcc   $T0,$D1,$D1
        mulx    $S1,$H2,$T0             ! s1*h2
        addxc   $T1,$D2,$D2
        mulx    $R0,$H2,$T1             ! r0*h2
        addcc   $T0,$D1,$D1
        addxc   $T1,$D2,$D2

        srlx    $D2,2,$T0               ! final reduction step
        andn    $D2,3,$T1
        and     $D2,3,$H2
        add     $T1,$T0,$T0

        addcc   $T0,$D0,$H0
        brnz,pt $len,.Loop_vis3
        addxc   %g0,$D1,$H1

        stx     $H0,[$ctx+0]            ! store hash value
        stx     $H1,[$ctx+8]
        st      $H2,[$ctx+16]

        ret
        restore
.size   poly1305_blocks_vis3,.-poly1305_blocks_vis3
___
}
my ($mac,$nonce) = ($inp,$len);

$code.=<<___;
.globl  poly1305_emit
.align  32
poly1305_emit:
        save    %sp,-STACK_FRAME,%sp

        ld      [$ctx+0],$h1            ! load hash value
        ld      [$ctx+4],$h0
        ld      [$ctx+8],$h3
        ld      [$ctx+12],$h2
        ld      [$ctx+16],$h4

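        ! If h+5 carries into bit 130, then h is at least 2^130-5 and
        ! h+5-2^130 is the fully reduced value; only its low 128 bits
        ! contribute to the tag.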
        addcc   $h0,5,$r0               ! compare to modulus
        addccc  $h1,0,$r1
        addccc  $h2,0,$r2
        addccc  $h3,0,$r3
        addc    $h4,0,$h4
        andcc   $h4,4,%g0               ! did it carry/borrow?

        movnz   %icc,$r0,$h0
        ld      [$nonce+0],$r0          ! load nonce
        movnz   %icc,$r1,$h1
        ld      [$nonce+4],$r1
        movnz   %icc,$r2,$h2
        ld      [$nonce+8],$r2
        movnz   %icc,$r3,$h3
        ld      [$nonce+12],$r3

        addcc   $r0,$h0,$h0             ! accumulate nonce
        addccc  $r1,$h1,$h1
        addccc  $r2,$h2,$h2
        addc    $r3,$h3,$h3

        srl     $h0,8,$r0
        stb     $h0,[$mac+0]            ! store little-endian result
        srl     $h0,16,$r1
        stb     $r0,[$mac+1]
        srl     $h0,24,$r2
        stb     $r1,[$mac+2]
        stb     $r2,[$mac+3]

        srl     $h1,8,$r0
        stb     $h1,[$mac+4]
        srl     $h1,16,$r1
        stb     $r0,[$mac+5]
        srl     $h1,24,$r2
        stb     $r1,[$mac+6]
        stb     $r2,[$mac+7]

        srl     $h2,8,$r0
        stb     $h2,[$mac+8]
        srl     $h2,16,$r1
        stb     $r0,[$mac+9]
        srl     $h2,24,$r2
        stb     $r1,[$mac+10]
        stb     $r2,[$mac+11]

        srl     $h3,8,$r0
        stb     $h3,[$mac+12]
        srl     $h3,16,$r1
        stb     $r0,[$mac+13]
        srl     $h3,24,$r2
        stb     $r1,[$mac+14]
        stb     $r2,[$mac+15]

        ret
        restore
.size   poly1305_emit,.-poly1305_emit
___

{
my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
my $i2=$step;

my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
    $two0,$two32,$two64,$two96,$two130,$five_two130,
    $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
    $s2lo,$s2hi,$s3lo,$s3hi,
    $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
# borrowings
my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);

$code.=<<___;
.align  32
poly1305_init_fma:
        save    %sp,-STACK_FRAME-16,%sp
        nop

.Lpoly1305_init_fma:
1:      call    .+8
        add     %o7,.Lconsts_fma-1b,%o7

        ldd     [%o7+8*0],$two0                 ! load constants
        ldd     [%o7+8*1],$two32
        ldd     [%o7+8*2],$two64
        ldd     [%o7+8*3],$two96
        ldd     [%o7+8*5],$five_two130

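        ! Limbs are kept as "biased" doubles: a 32-bit limb stored in the low
        ! word of a 2^(52+32*n) constant, so that subtracting the bare
        ! constant back recovers the limb scaled by 2^(32*n) exactly.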
        std     $two0,[$ctx+8*0]                ! initial hash value, biased 0
        std     $two32,[$ctx+8*1]
        std     $two64,[$ctx+8*2]
        std     $two96,[$ctx+8*3]

        brz,pn  $inp,.Lno_key_fma
        nop

        stx     %fsr,[%sp+LOCALS]               ! save original %fsr
        ldx     [%o7+8*6],%fsr                  ! load new %fsr

        std     $two0,[$ctx+8*4]                ! key "template"
        std     $two32,[$ctx+8*5]
        std     $two64,[$ctx+8*6]
        std     $two96,[$ctx+8*7]

        and     $inp,7,$shr
        andn    $inp,7,$inp                     ! align pointer
        mov     8,$i1
        sll     $shr,3,$shr
        mov     16,$i2
        neg     $shr,$shl

        ldxa    [$inp+%g0]0x88,$in0             ! load little-endian key
        ldxa    [$inp+$i1]0x88,$in2

        brz     $shr,.Lkey_aligned_fma
        sethi   %hi(0xf0000000),$i1             !   0xf0000000

        ldxa    [$inp+$i2]0x88,$in4

        srlx    $in0,$shr,$in0                  ! align data
        sllx    $in2,$shl,$in1
        srlx    $in2,$shr,$in2
        or      $in1,$in0,$in0
        sllx    $in4,$shl,$in3
        or      $in3,$in2,$in2

.Lkey_aligned_fma:
        or      $i1,3,$i2                       !   0xf0000003
        srlx    $in0,32,$in1
        andn    $in0,$i1,$in0                   ! &=0x0fffffff
        andn    $in1,$i2,$in1                   ! &=0x0ffffffc
        srlx    $in2,32,$in3
        andn    $in2,$i2,$in2
        andn    $in3,$i2,$in3

        st      $in0,[$ctx+`8*4+4`]             ! fill "template"
        st      $in1,[$ctx+`8*5+4`]
        st      $in2,[$ctx+`8*6+4`]
        st      $in3,[$ctx+`8*7+4`]

        ldd     [$ctx+8*4],$h0lo                ! load [biased] key
        ldd     [$ctx+8*5],$h1lo
        ldd     [$ctx+8*6],$h2lo
        ldd     [$ctx+8*7],$h3lo

        fsubd   $h0lo,$two0, $h0lo              ! r0
         ldd    [%o7+8*7],$two0                 ! more constants
        fsubd   $h1lo,$two32,$h1lo              ! r1
         ldd    [%o7+8*8],$two32
        fsubd   $h2lo,$two64,$h2lo              ! r2
         ldd    [%o7+8*9],$two64
        fsubd   $h3lo,$two96,$h3lo              ! r3
         ldd    [%o7+8*10],$two96

        fmuld   $five_two130,$h1lo,$s1lo        ! s1
        fmuld   $five_two130,$h2lo,$s2lo        ! s2
        fmuld   $five_two130,$h3lo,$s3lo        ! s3

        faddd   $h0lo,$two0, $h0hi
        faddd   $h1lo,$two32,$h1hi
        faddd   $h2lo,$two64,$h2hi
        faddd   $h3lo,$two96,$h3hi

        fsubd   $h0hi,$two0, $h0hi
         ldd    [%o7+8*11],$two0                ! more constants
        fsubd   $h1hi,$two32,$h1hi
         ldd    [%o7+8*12],$two32
        fsubd   $h2hi,$two64,$h2hi
         ldd    [%o7+8*13],$two64
        fsubd   $h3hi,$two96,$h3hi

        fsubd   $h0lo,$h0hi,$h0lo
         std    $h0hi,[$ctx+8*5]                ! r0hi
        fsubd   $h1lo,$h1hi,$h1lo
         std    $h1hi,[$ctx+8*7]                ! r1hi
        fsubd   $h2lo,$h2hi,$h2lo
         std    $h2hi,[$ctx+8*9]                ! r2hi
        fsubd   $h3lo,$h3hi,$h3lo
         std    $h3hi,[$ctx+8*11]               ! r3hi

        faddd   $s1lo,$two0, $s1hi
        faddd   $s2lo,$two32,$s2hi
        faddd   $s3lo,$two64,$s3hi

        fsubd   $s1hi,$two0, $s1hi
        fsubd   $s2hi,$two32,$s2hi
        fsubd   $s3hi,$two64,$s3hi

        fsubd   $s1lo,$s1hi,$s1lo
        fsubd   $s2lo,$s2hi,$s2lo
        fsubd   $s3lo,$s3hi,$s3lo

        ldx     [%sp+LOCALS],%fsr               ! restore %fsr

        std     $h0lo,[$ctx+8*4]                ! r0lo
        std     $h1lo,[$ctx+8*6]                ! r1lo
        std     $h2lo,[$ctx+8*8]                ! r2lo
        std     $h3lo,[$ctx+8*10]               ! r3lo

        std     $s1hi,[$ctx+8*13]
        std     $s2hi,[$ctx+8*15]
        std     $s3hi,[$ctx+8*17]

        std     $s1lo,[$ctx+8*12]
        std     $s2lo,[$ctx+8*14]
        std     $s3lo,[$ctx+8*16]

        add     %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
        add     %o7,poly1305_emit_fma-.Lconsts_fma,%o1
        STPTR   %o0,[%i2]
        STPTR   %o1,[%i2+SIZE_T]

        ret
        restore %g0,1,%o0                       ! return 1

.Lno_key_fma:
        ret
        restore %g0,%g0,%o0                     ! return 0
.size   poly1305_init_fma,.-poly1305_init_fma

.align  32
poly1305_blocks_fma:
        save    %sp,-STACK_FRAME-48,%sp
        srlx    $len,4,$len

        brz,pn  $len,.Labort
        sub     $len,1,$len

1:      call    .+8
        add     %o7,.Lconsts_fma-1b,%o7

        ldd     [%o7+8*0],$two0                 ! load constants
        ldd     [%o7+8*1],$two32
        ldd     [%o7+8*2],$two64
        ldd     [%o7+8*3],$two96
        ldd     [%o7+8*4],$two130
        ldd     [%o7+8*5],$five_two130

        ldd     [$ctx+8*0],$h0lo                ! load [biased] hash value
        ldd     [$ctx+8*1],$h1lo
        ldd     [$ctx+8*2],$h2lo
        ldd     [$ctx+8*3],$h3lo

        std     $two0,[%sp+LOCALS+8*0]          ! input "template"
        sethi   %hi((1023+52+96)<<20),$in3
        std     $two32,[%sp+LOCALS+8*1]
        or      $padbit,$in3,$in3
        std     $two64,[%sp+LOCALS+8*2]
        st      $in3,[%sp+LOCALS+8*3]

        and     $inp,7,$shr
        andn    $inp,7,$inp                     ! align pointer
        mov     8,$i1
        sll     $shr,3,$shr
        mov     16,$step
        neg     $shr,$shl

        ldxa    [$inp+%g0]0x88,$in0             ! load little-endian input
        brz     $shr,.Linp_aligned_fma
        ldxa    [$inp+$i1]0x88,$in2

        ldxa    [$inp+$step]0x88,$in4
        add     $inp,8,$inp

        srlx    $in0,$shr,$in0                  ! align data
        sllx    $in2,$shl,$in1
        srlx    $in2,$shr,$in2
        or      $in1,$in0,$in0
        sllx    $in4,$shl,$in3
        srlx    $in4,$shr,$in4                  ! pre-shift
        or      $in3,$in2,$in2

.Linp_aligned_fma:
        srlx    $in0,32,$in1
        movrz   $len,0,$step
        srlx    $in2,32,$in3
        add     $step,$inp,$inp                 ! conditional advance

        st      $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
        st      $in1,[%sp+LOCALS+8*1+4]
        st      $in2,[%sp+LOCALS+8*2+4]
        st      $in3,[%sp+LOCALS+8*3+4]

        ldd     [$ctx+8*4],$r0lo                ! load key
        ldd     [$ctx+8*5],$r0hi
        ldd     [$ctx+8*6],$r1lo
        ldd     [$ctx+8*7],$r1hi
        ldd     [$ctx+8*8],$r2lo
        ldd     [$ctx+8*9],$r2hi
        ldd     [$ctx+8*10],$r3lo
        ldd     [$ctx+8*11],$r3hi
        ldd     [$ctx+8*12],$s1lo
        ldd     [$ctx+8*13],$s1hi
        ldd     [$ctx+8*14],$s2lo
        ldd     [$ctx+8*15],$s2hi
        ldd     [$ctx+8*16],$s3lo
        ldd     [$ctx+8*17],$s3hi

        stx     %fsr,[%sp+LOCALS+8*4]           ! save original %fsr
        ldx     [%o7+8*6],%fsr                  ! load new %fsr

        subcc   $len,1,$len
        movrz   $len,0,$step

        ldd     [%sp+LOCALS+8*0],$x0            ! load biased input
        ldd     [%sp+LOCALS+8*1],$x1
        ldd     [%sp+LOCALS+8*2],$x2
        ldd     [%sp+LOCALS+8*3],$x3

        fsubd   $h0lo,$two0, $h0lo              ! de-bias hash value
        fsubd   $h1lo,$two32,$h1lo
         ldxa   [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
        fsubd   $h2lo,$two64,$h2lo
        fsubd   $h3lo,$two96,$h3lo
         ldxa   [$inp+$i1]0x88,$in2

        fsubd   $x0,$two0, $x0                  ! de-bias input
        fsubd   $x1,$two32,$x1
        fsubd   $x2,$two64,$x2
        fsubd   $x3,$two96,$x3

        brz     $shr,.Linp_aligned_fma2
        add     $step,$inp,$inp                 ! conditional advance

        sllx    $in0,$shl,$in1                  ! align data
        srlx    $in0,$shr,$in3
        or      $in1,$in4,$in0
        sllx    $in2,$shl,$in1
        srlx    $in2,$shr,$in4                  ! pre-shift
        or      $in3,$in1,$in2
.Linp_aligned_fma2:
        srlx    $in0,32,$in1
        srlx    $in2,32,$in3

        faddd   $h0lo,$x0,$x0                   ! accumulate input
         stw    $in0,[%sp+LOCALS+8*0+4]
        faddd   $h1lo,$x1,$x1
         stw    $in1,[%sp+LOCALS+8*1+4]
        faddd   $h2lo,$x2,$x2
         stw    $in2,[%sp+LOCALS+8*2+4]
        faddd   $h3lo,$x3,$x3
         stw    $in3,[%sp+LOCALS+8*3+4]

        b       .Lentry_fma
        nop

.align  16
.Loop_fma:
        ldxa    [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
        ldxa    [$inp+$i1]0x88,$in2
        movrz   $len,0,$step

        faddd   $y0,$h0lo,$h0lo                 ! accumulate input
        faddd   $y1,$h0hi,$h0hi
        faddd   $y2,$h2lo,$h2lo
        faddd   $y3,$h2hi,$h2hi

        brz,pn  $shr,.Linp_aligned_fma3
        add     $step,$inp,$inp                 ! conditional advance

        sllx    $in0,$shl,$in1                  ! align data
        srlx    $in0,$shr,$in3
        or      $in1,$in4,$in0
        sllx    $in2,$shl,$in1
        srlx    $in2,$shr,$in4                  ! pre-shift
        or      $in3,$in1,$in2

.Linp_aligned_fma3:
        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
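        ! Adding 2^(52+k) in the truncating rounding mode splits a limb at
        ! the 2^k boundary: the part above becomes the carry into the next
        ! limb, and the carry out of the top limb is folded back in
        ! multiplied by 5/2^130 (2^130 = 5 mod 2^130-5).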
        faddd   $two64,$h1lo,$c1lo
         srlx   $in0,32,$in1
        faddd   $two64,$h1hi,$c1hi
         srlx   $in2,32,$in3
        faddd   $two130,$h3lo,$c3lo
         st     $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
        faddd   $two130,$h3hi,$c3hi
         st     $in1,[%sp+LOCALS+8*1+4]
        faddd   $two32,$h0lo,$c0lo
         st     $in2,[%sp+LOCALS+8*2+4]
        faddd   $two32,$h0hi,$c0hi
         st     $in3,[%sp+LOCALS+8*3+4]
        faddd   $two96,$h2lo,$c2lo
        faddd   $two96,$h2hi,$c2hi

        fsubd   $c1lo,$two64,$c1lo
        fsubd   $c1hi,$two64,$c1hi
        fsubd   $c3lo,$two130,$c3lo
        fsubd   $c3hi,$two130,$c3hi
        fsubd   $c0lo,$two32,$c0lo
        fsubd   $c0hi,$two32,$c0hi
        fsubd   $c2lo,$two96,$c2lo
        fsubd   $c2hi,$two96,$c2hi

        fsubd   $h1lo,$c1lo,$h1lo
        fsubd   $h1hi,$c1hi,$h1hi
        fsubd   $h3lo,$c3lo,$h3lo
        fsubd   $h3hi,$c3hi,$h3hi
        fsubd   $h2lo,$c2lo,$h2lo
        fsubd   $h2hi,$c2hi,$h2hi
        fsubd   $h0lo,$c0lo,$h0lo
        fsubd   $h0hi,$c0hi,$h0hi

        faddd   $h1lo,$c0lo,$h1lo
        faddd   $h1hi,$c0hi,$h1hi
        faddd   $h3lo,$c2lo,$h3lo
        faddd   $h3hi,$c2hi,$h3hi
        faddd   $h2lo,$c1lo,$h2lo
        faddd   $h2hi,$c1hi,$h2hi
        fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
        fmaddd  $five_two130,$c3hi,$h0hi,$h0hi

        faddd   $h1lo,$h1hi,$x1
         ldd    [$ctx+8*12],$s1lo               ! reload constants
        faddd   $h3lo,$h3hi,$x3
         ldd    [$ctx+8*13],$s1hi
        faddd   $h2lo,$h2hi,$x2
         ldd    [$ctx+8*10],$r3lo
        faddd   $h0lo,$h0hi,$x0
         ldd    [$ctx+8*11],$r3hi

.Lentry_fma:
        fmuld   $x1,$s3lo,$h0lo
        fmuld   $x1,$s3hi,$h0hi
        fmuld   $x1,$r1lo,$h2lo
        fmuld   $x1,$r1hi,$h2hi
        fmuld   $x1,$r0lo,$h1lo
        fmuld   $x1,$r0hi,$h1hi
        fmuld   $x1,$r2lo,$h3lo
        fmuld   $x1,$r2hi,$h3hi

        fmaddd  $x3,$s1lo,$h0lo,$h0lo
        fmaddd  $x3,$s1hi,$h0hi,$h0hi
        fmaddd  $x3,$s3lo,$h2lo,$h2lo
        fmaddd  $x3,$s3hi,$h2hi,$h2hi
        fmaddd  $x3,$s2lo,$h1lo,$h1lo
        fmaddd  $x3,$s2hi,$h1hi,$h1hi
        fmaddd  $x3,$r0lo,$h3lo,$h3lo
        fmaddd  $x3,$r0hi,$h3hi,$h3hi

        fmaddd  $x2,$s2lo,$h0lo,$h0lo
        fmaddd  $x2,$s2hi,$h0hi,$h0hi
        fmaddd  $x2,$r0lo,$h2lo,$h2lo
        fmaddd  $x2,$r0hi,$h2hi,$h2hi
        fmaddd  $x2,$s3lo,$h1lo,$h1lo
         ldd    [%sp+LOCALS+8*0],$y0            ! load [biased] input
        fmaddd  $x2,$s3hi,$h1hi,$h1hi
         ldd    [%sp+LOCALS+8*1],$y1
        fmaddd  $x2,$r1lo,$h3lo,$h3lo
         ldd    [%sp+LOCALS+8*2],$y2
        fmaddd  $x2,$r1hi,$h3hi,$h3hi
         ldd    [%sp+LOCALS+8*3],$y3

        fmaddd  $x0,$r0lo,$h0lo,$h0lo
         fsubd  $y0,$two0, $y0                  ! de-bias input
        fmaddd  $x0,$r0hi,$h0hi,$h0hi
         fsubd  $y1,$two32,$y1
        fmaddd  $x0,$r2lo,$h2lo,$h2lo
         fsubd  $y2,$two64,$y2
        fmaddd  $x0,$r2hi,$h2hi,$h2hi
         fsubd  $y3,$two96,$y3
        fmaddd  $x0,$r1lo,$h1lo,$h1lo
        fmaddd  $x0,$r1hi,$h1hi,$h1hi
        fmaddd  $x0,$r3lo,$h3lo,$h3lo
        fmaddd  $x0,$r3hi,$h3hi,$h3hi

        bcc     SIZE_T_CC,.Loop_fma
        subcc   $len,1,$len

        !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
        faddd   $h0lo,$two32,$c0lo
        faddd   $h0hi,$two32,$c0hi
        faddd   $h2lo,$two96,$c2lo
        faddd   $h2hi,$two96,$c2hi
        faddd   $h1lo,$two64,$c1lo
        faddd   $h1hi,$two64,$c1hi
        faddd   $h3lo,$two130,$c3lo
        faddd   $h3hi,$two130,$c3hi

        fsubd   $c0lo,$two32,$c0lo
        fsubd   $c0hi,$two32,$c0hi
        fsubd   $c2lo,$two96,$c2lo
        fsubd   $c2hi,$two96,$c2hi
        fsubd   $c1lo,$two64,$c1lo
        fsubd   $c1hi,$two64,$c1hi
        fsubd   $c3lo,$two130,$c3lo
        fsubd   $c3hi,$two130,$c3hi

        fsubd   $h1lo,$c1lo,$h1lo
        fsubd   $h1hi,$c1hi,$h1hi
        fsubd   $h3lo,$c3lo,$h3lo
        fsubd   $h3hi,$c3hi,$h3hi
        fsubd   $h2lo,$c2lo,$h2lo
        fsubd   $h2hi,$c2hi,$h2hi
        fsubd   $h0lo,$c0lo,$h0lo
        fsubd   $h0hi,$c0hi,$h0hi

        faddd   $h1lo,$c0lo,$h1lo
        faddd   $h1hi,$c0hi,$h1hi
        faddd   $h3lo,$c2lo,$h3lo
        faddd   $h3hi,$c2hi,$h3hi
        faddd   $h2lo,$c1lo,$h2lo
        faddd   $h2hi,$c1hi,$h2hi
        fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
        fmaddd  $five_two130,$c3hi,$h0hi,$h0hi

        faddd   $h1lo,$h1hi,$x1
        faddd   $h3lo,$h3hi,$x3
        faddd   $h2lo,$h2hi,$x2
        faddd   $h0lo,$h0hi,$x0

        faddd   $x1,$two32,$x1                  ! bias
        faddd   $x3,$two96,$x3
        faddd   $x2,$two64,$x2
        faddd   $x0,$two0, $x0

        ldx     [%sp+LOCALS+8*4],%fsr           ! restore saved %fsr

        std     $x1,[$ctx+8*1]                  ! store [biased] hash value
        std     $x3,[$ctx+8*3]
        std     $x2,[$ctx+8*2]
        std     $x0,[$ctx+8*0]

.Labort:
        ret
        restore
.size   poly1305_blocks_fma,.-poly1305_blocks_fma
___
{
my ($mac,$nonce)=($inp,$len);

my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
   ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));

$code.=<<___;
.align  32
poly1305_emit_fma:
        save    %sp,-STACK_FRAME,%sp

        ld      [$ctx+8*0+0],$d0                ! load hash
        ld      [$ctx+8*0+4],$h0
        ld      [$ctx+8*1+0],$d1
        ld      [$ctx+8*1+4],$h1
        ld      [$ctx+8*2+0],$d2
        ld      [$ctx+8*2+4],$h2
        ld      [$ctx+8*3+0],$d3
        ld      [$ctx+8*3+4],$h3

        sethi   %hi(0xfff00000),$mask
        andn    $d0,$mask,$d0                   ! mask exponent
        andn    $d1,$mask,$d1
        andn    $d2,$mask,$d2
        andn    $d3,$mask,$d3                   ! can be partially reduced...
        mov     3,$mask

        srl     $d3,2,$padbit                   ! ... so reduce
        and     $d3,$mask,$h4
        andn    $d3,$mask,$d3
        add     $padbit,$d3,$d3

        addcc   $d3,$h0,$h0
        addccc  $d0,$h1,$h1
        addccc  $d1,$h2,$h2
        addccc  $d2,$h3,$h3
        addc    %g0,$h4,$h4

        addcc   $h0,5,$d0                       ! compare to modulus
        addccc  $h1,0,$d1
        addccc  $h2,0,$d2
        addccc  $h3,0,$d3
        addc    $h4,0,$mask

        srl     $mask,2,$mask                   ! did it carry/borrow?
        neg     $mask,$mask
        sra     $mask,31,$mask                  ! mask

        andn    $h0,$mask,$h0
        and     $d0,$mask,$d0
        andn    $h1,$mask,$h1
        and     $d1,$mask,$d1
        or      $d0,$h0,$h0
        ld      [$nonce+0],$d0                  ! load nonce
        andn    $h2,$mask,$h2
        and     $d2,$mask,$d2
        or      $d1,$h1,$h1
        ld      [$nonce+4],$d1
        andn    $h3,$mask,$h3
        and     $d3,$mask,$d3
        or      $d2,$h2,$h2
        ld      [$nonce+8],$d2
        or      $d3,$h3,$h3
        ld      [$nonce+12],$d3

        addcc   $d0,$h0,$h0                     ! accumulate nonce
        addccc  $d1,$h1,$h1
        addccc  $d2,$h2,$h2
        addc    $d3,$h3,$h3

        stb     $h0,[$mac+0]                    ! write little-endian result
        srl     $h0,8,$h0
        stb     $h1,[$mac+4]
        srl     $h1,8,$h1
        stb     $h2,[$mac+8]
        srl     $h2,8,$h2
        stb     $h3,[$mac+12]
        srl     $h3,8,$h3

        stb     $h0,[$mac+1]
        srl     $h0,8,$h0
        stb     $h1,[$mac+5]
        srl     $h1,8,$h1
        stb     $h2,[$mac+9]
        srl     $h2,8,$h2
        stb     $h3,[$mac+13]
        srl     $h3,8,$h3

        stb     $h0,[$mac+2]
        srl     $h0,8,$h0
        stb     $h1,[$mac+6]
        srl     $h1,8,$h1
        stb     $h2,[$mac+10]
        srl     $h2,8,$h2
        stb     $h3,[$mac+14]
        srl     $h3,8,$h3

        stb     $h0,[$mac+3]
        stb     $h1,[$mac+7]
        stb     $h2,[$mac+11]
        stb     $h3,[$mac+15]

        ret
        restore
.size   poly1305_emit_fma,.-poly1305_emit_fma
___
}

$code.=<<___;
.align  64
.Lconsts_fma:
.word   0x43300000,0x00000000           ! 2^(52+0)
.word   0x45300000,0x00000000           ! 2^(52+32)
.word   0x47300000,0x00000000           ! 2^(52+64)
.word   0x49300000,0x00000000           ! 2^(52+96)
.word   0x4b500000,0x00000000           ! 2^(52+130)

.word   0x37f40000,0x00000000           ! 5/2^130
.word   0,1<<30                         ! fsr: truncate, no exceptions

.word   0x44300000,0x00000000           ! 2^(52+16+0)
.word   0x46300000,0x00000000           ! 2^(52+16+32)
.word   0x48300000,0x00000000           ! 2^(52+16+64)
.word   0x4a300000,0x00000000           ! 2^(52+16+96)
.word   0x3e300000,0x00000000           ! 2^(52+16+0-96)
.word   0x40300000,0x00000000           ! 2^(52+16+32-96)
.word   0x42300000,0x00000000           ! 2^(52+16+64-96)
.asciz  "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
.align  4
___
}

# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to preserve the option of producing a "universal" binary and
# let the programmer detect at run-time whether the current CPU is VIS-capable.
sub unvis3 {
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
my ($ref,$opf);
my %visopf = (  "addxc"         => 0x011,
                "addxccc"       => 0x013,
                "umulxhi"       => 0x016        );

    $ref = "$mnemonic\t$rs1,$rs2,$rd";

    if ($opf=$visopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rd) {
            return $ref if (!/%([goli])([0-9])/);
            $_=$bias{$1}+$2;
        }

        return  sprintf ".word\t0x%08x !%s",
                        0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}
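# For reference, a value worked out from the formula above: "umulxhi %o1,%o2,%o3"
# maps to rs1=9, rs2=10, rd=11, opf=0x016 and is emitted as ".word 0x97b242ca".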

sub unfma {
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my ($ref,$opf);
my %fmaopf = (  "fmadds"        => 0x1,
                "fmaddd"        => 0x2,
                "fmsubs"        => 0x5,
                "fmsubd"        => 0x6          );

    $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    if ($opf=$fmaopf{$mnemonic}) {
        foreach ($rs1,$rs2,$rs3,$rd) {
            return $ref if (!/%f([0-9]{1,2})/);
            $_=$1;
            if ($1>=32) {
                return $ref if ($1&1);
                # re-encode for upper double register addressing
                $_=($1|$1>>5)&31;
            }
        }

        return  sprintf ".word\t0x%08x !%s",
                        0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
                        $ref;
    } else {
        return $ref;
    }
}
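# Likewise, per the formula above, "fmaddd %f0,%f2,%f4,%f6" maps to rs1=0,
# rs2=2, rs3=4, rd=6, opf=2 and is emitted as ".word 0x8db80842".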

foreach (split("\n",$code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unvis3($1,$2,$3,$4)
         /ge    or
        s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
                &unfma($1,$2,$3,$4,$5)
         /ge;

        print $_,"\n";
}

close STDOUT;