1 #!/usr/bin/env perl
2 #
3 # ====================================================================
4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9 #
10 # This module implements the Poly1305 hash for SPARCv9, in vanilla code
11 # as well as with the VIS3 and FMA extensions.
12 #
13 # May, August 2015
14 #
15 # Numbers are cycles per processed byte with poly1305_blocks alone.
16 #
17 #                       IALU(*)         FMA
18 #
19 # UltraSPARC III        11.9(**)
20 # SPARC T3              7.85
21 # SPARC T4              1.67(***)       6.55
22 # SPARC64 X             5.54            3.64
23 #
24 # (*)   Comparison to compiler-generated code is problematic, because
25 #       the latter's performance varies too much depending on too many
26 #       variables. For example, one can measure anywhere from 5x to 15x
27 #       improvement on T4 with gcc-4.6. The T4 comparison is somewhat
28 #       unfair, because the compiler doesn't use VIS3, but even under
29 #       identical initial conditions the coefficient varies from 3x to 9x.
30 # (**)  Pre-III performance should be even worse. Floating-point
31 #       performance for UltraSPARC I-IV, on the other hand, is reported
32 #       to be 4.25 cycles per byte for hand-coded assembly, but those
33 #       processors are too old to care about.
34 # (***) Multi-process benchmark saturates at ~12.5x the single-process
35 #       result on an 8-core processor, or ~21GBps per 2.85GHz socket.
36
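# For reference, a minimal (and slow) model of what the routines below
# compute, written with Math::BigInt. It is not used by the generator and
# the sub name is ad hoc: poly1305_init clamps the 16-byte "r" half of the
# key, poly1305_blocks folds 16-byte blocks (plus the 2^128 pad bit used
# for full blocks) into the accumulator modulo 2^130-5, and poly1305_emit
# adds the 16-byte "s" half (the "nonce") modulo 2^128.
sub poly1305_reference_model {
    use Math::BigInt;
    my ($r_key, $msg, $nonce) = @_;     # 16-byte r, 16*n-byte message, 16-byte s
    my $le2int = sub {                  # little-endian bytes -> integer
        my $be = scalar reverse shift;
        Math::BigInt->from_hex(unpack("H*", $be));
    };
    my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
    my $r = $le2int->($r_key)->band(
                Math::BigInt->from_hex("0ffffffc0ffffffc0ffffffc0fffffff"));
    my $h = Math::BigInt->bzero();
    for (my $i = 0; $i < length($msg); $i += 16) {
        my $m = $le2int->(substr($msg, $i, 16))
                       ->badd(Math::BigInt->new(2)->bpow(128));   # pad bit
        $h->badd($m)->bmul($r)->bmod($p);
    }
    $h->badd($le2int->($nonce))->bmod(Math::BigInt->new(2)->bpow(128));
    my $hex = substr($h->as_hex(), 2);                  # strip "0x"
    return scalar reverse pack("H*", "0" x (32 - length($hex)) . $hex);
}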
37 my $output = pop;
38 open STDOUT,">$output" or die "can't open $output: $!";
39
40 my ($ctx,$inp,$len,$padbit,$shl,$shr)   = map("%i$_",(0..5));
41 my ($r0,$r1,$r2,$r3,$s1,$s2,$s3,$h4)    = map("%l$_",(0..7));
42 my ($h0,$h1,$h2,$h3, $t0,$t1,$t2)       = map("%o$_",(0..5,7));
43 my ($d0,$d1,$d2,$d3)                    = map("%g$_",(1..4));
44
45 $code.=<<___;
46 #include "sparc_arch.h"
47
48 #ifdef  __arch64__
49 .register       %g2,#scratch
50 .register       %g3,#scratch
51 # define        STPTR   stx
52 # define        SIZE_T  8
53 #else
54 # define        STPTR   st
55 # define        SIZE_T  4
56 #endif
57 #define LOCALS  (STACK_BIAS+STACK_FRAME)
58
59 .section        ".text",#alloc,#execinstr
60
61 #ifdef __PIC__
62 SPARC_PIC_THUNK(%g1)
63 #endif
64
65 .globl  poly1305_init
66 .align  32
67 poly1305_init:
68         save    %sp,-STACK_FRAME-16,%sp
69         nop
70
71         SPARC_LOAD_ADDRESS(OPENSSL_sparcv9cap_P,%g1)
72         ld      [%g1],%g1
73
74         and     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU|SPARCV9_VIS3,%g1
75         cmp     %g1,SPARCV9_FMADD|SPARCV9_PREFER_FPU
76         be      .Lpoly1305_init_fma
77         nop
78
79         stx     %g0,[$ctx+0]
80         stx     %g0,[$ctx+8]            ! zero hash value
81         brz,pn  $inp,.Lno_key
82         stx     %g0,[$ctx+16]
83
84         and     $inp,7,$shr             ! alignment factor
85         andn    $inp,7,$inp
86         sll     $shr,3,$shr             ! *8
87         neg     $shr,$shl
88
89         sethi   %hi(0x0ffffffc),$t0
90         set     8,$h1
91         or      $t0,%lo(0x0ffffffc),$t0
92         set     16,$h2
93         sllx    $t0,32,$t1
94         or      $t0,$t1,$t1             ! 0x0ffffffc0ffffffc
95         or      $t1,3,$t0               ! 0x0ffffffc0fffffff
96
97         ldxa    [$inp+%g0]0x88,$h0      ! load little-endian key
98         brz,pt  $shr,.Lkey_aligned
99         ldxa    [$inp+$h1]0x88,$h1
100
101         ldxa    [$inp+$h2]0x88,$h2
102         srlx    $h0,$shr,$h0
103         sllx    $h1,$shl,$t2
104         srlx    $h1,$shr,$h1
105         or      $t2,$h0,$h0
106         sllx    $h2,$shl,$h2
107         or      $h2,$h1,$h1
108
109 .Lkey_aligned:
110         and     $t0,$h0,$h0
111         and     $t1,$h1,$h1
112         stx     $h0,[$ctx+32+0]         ! store key
113         stx     $h1,[$ctx+32+8]
114
115         andcc   %g1,SPARCV9_VIS3,%g0
116         be      .Lno_key
117         nop
118
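        ! on VIS3-capable CPUs, return PC-relative addresses of the VIS3
        ! block and emit routines through the two-pointer table passed as
        ! the third argument (%i2)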
119 1:      call    .+8
120         add     %o7,poly1305_blocks_vis3-1b,%o7
121
122         add     %o7,poly1305_emit-poly1305_blocks_vis3,%o5
123         STPTR   %o7,[%i2]
124         STPTR   %o5,[%i2+SIZE_T]
125
126         ret
127         restore %g0,1,%o0               ! return 1
128
129 .Lno_key:
130         ret
131         restore %g0,%g0,%o0             ! return 0
132 .size   poly1305_init,.-poly1305_init
133
134 .globl  poly1305_blocks
135 .align  32
136 poly1305_blocks:
137         save    %sp,-STACK_FRAME,%sp
138         andn    $len,15,$len
139
140         brz,pn  $len,.Lno_data
141         nop
142
143         ld      [$ctx+32+0],$r1         ! load key
144         ld      [$ctx+32+4],$r0
145         ld      [$ctx+32+8],$r3
146         ld      [$ctx+32+12],$r2
147
148         ld      [$ctx+0],$h1            ! load hash value
149         ld      [$ctx+4],$h0
150         ld      [$ctx+8],$h3
151         ld      [$ctx+12],$h2
152         ld      [$ctx+16],$h4
153
154         and     $inp,7,$shr             ! alignment factor
155         andn    $inp,7,$inp
156         set     8,$d1
157         sll     $shr,3,$shr             ! *8
158         set     16,$d2
159         neg     $shr,$shl
160
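        ! s1..s3 = r1..r3 + (r1..r3>>2) = 5*(r1..r3/4): the clamped key
        ! limbs have their low two bits zero, so these pre-scaled values
        ! fold products overflowing 2^130 back via 2^130 = 5 (mod 2^130-5)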
161         srl     $r1,2,$s1
162         srl     $r2,2,$s2
163         add     $r1,$s1,$s1
164         srl     $r3,2,$s3
165         add     $r2,$s2,$s2
166         add     $r3,$s3,$s3
167
168 .Loop:
169         ldxa    [$inp+%g0]0x88,$d0      ! load little-endian input
170         brz,pt  $shr,.Linp_aligned
171         ldxa    [$inp+$d1]0x88,$d1
172
173         ldxa    [$inp+$d2]0x88,$d2
174         srlx    $d0,$shr,$d0
175         sllx    $d1,$shl,$t1
176         srlx    $d1,$shr,$d1
177         or      $t1,$d0,$d0
178         sllx    $d2,$shl,$d2
179         or      $d2,$d1,$d1
180
181 .Linp_aligned:
182         srlx    $d0,32,$t0
183         addcc   $d0,$h0,$h0             ! accumulate input
184         srlx    $d1,32,$t1
185         addccc  $t0,$h1,$h1
186         addccc  $d1,$h2,$h2
187         addccc  $t1,$h3,$h3
188         addc    $padbit,$h4,$h4
189
190         umul    $r0,$h0,$d0
191         umul    $r1,$h0,$d1
192         umul    $r2,$h0,$d2
193         umul    $r3,$h0,$d3
194          sub    $len,16,$len
195          add    $inp,16,$inp
196
197         umul    $s3,$h1,$t0
198         umul    $r0,$h1,$t1
199         umul    $r1,$h1,$t2
200         add     $t0,$d0,$d0
201         add     $t1,$d1,$d1
202         umul    $r2,$h1,$t0
203         add     $t2,$d2,$d2
204         add     $t0,$d3,$d3
205
206         umul    $s2,$h2,$t1
207         umul    $s3,$h2,$t2
208         umul    $r0,$h2,$t0
209         add     $t1,$d0,$d0
210         add     $t2,$d1,$d1
211         umul    $r1,$h2,$t1
212         add     $t0,$d2,$d2
213         add     $t1,$d3,$d3
214
215         umul    $s1,$h3,$t2
216         umul    $s2,$h3,$t0
217         umul    $s3,$h3,$t1
218         add     $t2,$d0,$d0
219         add     $t0,$d1,$d1
220         umul    $r0,$h3,$t2
221         add     $t1,$d2,$d2
222         add     $t2,$d3,$d3
223
224         umul    $s1,$h4,$t0
225         umul    $s2,$h4,$t1
226         umul    $s3,$h4,$t2
227         umul    $r0,$h4,$h4
228         add     $t0,$d1,$d1
229         add     $t1,$d2,$d2
230         srlx    $d0,32,$h1
231         add     $t2,$d3,$d3
232         srlx    $d1,32,$h2
233
234         addcc   $d1,$h1,$h1
235         srlx    $d2,32,$h3
236          set    8,$d1
237         addccc  $d2,$h2,$h2
238         srlx    $d3,32,$t0
239          set    16,$d2
240         addccc  $d3,$h3,$h3
241         addc    $t0,$h4,$h4
242
243         srl     $h4,2,$t0               ! final reduction step
244         andn    $h4,3,$t1
245         and     $h4,3,$h4
246         add     $t1,$t0,$t0
247
248         addcc   $t0,$d0,$h0
249         addccc  %g0,$h1,$h1
250         addccc  %g0,$h2,$h2
251         brnz,pt $len,.Loop
252         addc    %g0,$h3,$h3
253
254         st      $h1,[$ctx+0]            ! store hash value
255         st      $h0,[$ctx+4]
256         st      $h3,[$ctx+8]
257         st      $h2,[$ctx+12]
258         st      $h4,[$ctx+16]
259
260 .Lno_data:
261         ret
262         restore
263 .size   poly1305_blocks,.-poly1305_blocks
264 ___
265 ########################################################################
266 # VIS3 has umulxhi and addxc...
267 {
268 my ($H0,$H1,$H2,$R0,$R1,$S1,$T1) = map("%o$_",(0..5,7));
269 my ($D0,$D1,$D2,$T0) = map("%g$_",(1..4));
270
271 $code.=<<___;
272 .align  32
273 poly1305_blocks_vis3:
274         save    %sp,-STACK_FRAME,%sp
275         andn    $len,15,$len
276
277         brz,pn  $len,.Lno_data
278         nop
279
280         ldx     [$ctx+32+0],$R0         ! load key
281         ldx     [$ctx+32+8],$R1
282
283         ldx     [$ctx+0],$H0            ! load hash value
284         ldx     [$ctx+8],$H1
285         ld      [$ctx+16],$H2
286
287         and     $inp,7,$shr             ! alignment factor
288         andn    $inp,7,$inp
289         set     8,$r1
290         sll     $shr,3,$shr             ! *8
291         set     16,$r2
292         neg     $shr,$shl
293
294         srlx    $R1,2,$S1
295         add     $R1,$S1,$S1
296
297 .Loop_vis3:
298         ldxa    [$inp+%g0]0x88,$D0      ! load little-endian input
299         brz,pt  $shr,.Linp_aligned_vis3
300         ldxa    [$inp+$r1]0x88,$D1
301
302         ldxa    [$inp+$r2]0x88,$D2
303         srlx    $D0,$shr,$D0
304         sllx    $D1,$shl,$T1
305         srlx    $D1,$shr,$D1
306         or      $T1,$D0,$D0
307         sllx    $D2,$shl,$D2
308         or      $D2,$D1,$D1
309
310 .Linp_aligned_vis3:
311         addcc   $D0,$H0,$H0             ! accumulate input
312          sub    $len,16,$len
313         addxccc $D1,$H1,$H1
314          add    $inp,16,$inp
315
316         mulx    $R0,$H0,$D0             ! r0*h0
317         addxc   $padbit,$H2,$H2
318         umulxhi $R0,$H0,$D1
319         mulx    $S1,$H1,$T0             ! s1*h1
320         umulxhi $S1,$H1,$T1
321         addcc   $T0,$D0,$D0
322         mulx    $R1,$H0,$T0             ! r1*h0
323         addxc   $T1,$D1,$D1
324         umulxhi $R1,$H0,$D2
325         addcc   $T0,$D1,$D1
326         mulx    $R0,$H1,$T0             ! r0*h1
327         addxc   %g0,$D2,$D2
328         umulxhi $R0,$H1,$T1
329         addcc   $T0,$D1,$D1
330         mulx    $S1,$H2,$T0             ! s1*h2
331         addxc   $T1,$D2,$D2
332         mulx    $R0,$H2,$T1             ! r0*h2
333         addcc   $T0,$D1,$D1
334         addxc   $T1,$D2,$D2
335
336         srlx    $D2,2,$T0               ! final reduction step
337         andn    $D2,3,$T1
338         and     $D2,3,$H2
339         add     $T1,$T0,$T0
340
341         addcc   $T0,$D0,$H0
342         brnz,pt $len,.Loop_vis3
343         addxc   %g0,$D1,$H1
344
345         stx     $H0,[$ctx+0]            ! store hash value
346         stx     $H1,[$ctx+8]
347         st      $H2,[$ctx+16]
348
349         ret
350         restore
351 .size   poly1305_blocks_vis3,.-poly1305_blocks_vis3
352 ___
353 }
354 my ($mac,$nonce) = ($inp,$len);
355
356 $code.=<<___;
357 .globl  poly1305_emit
358 .align  32
359 poly1305_emit:
360         save    %sp,-STACK_FRAME,%sp
361
362         ld      [$ctx+0],$h1            ! load hash value
363         ld      [$ctx+4],$h0
364         ld      [$ctx+8],$h3
365         ld      [$ctx+12],$h2
366         ld      [$ctx+16],$h4
367
368         addcc   $h0,5,$r0               ! compare to modulus
369         addccc  $h1,0,$r1
370         addccc  $h2,0,$r2
371         addccc  $h3,0,$r3
372         addc    $h4,0,$h4
373         andcc   $h4,4,%g0               ! did it carry/borrow?
374
375         movnz   %icc,$r0,$h0
376         ld      [$nonce+0],$r0          ! load nonce
377         movnz   %icc,$r1,$h1
378         ld      [$nonce+4],$r1
379         movnz   %icc,$r2,$h2
380         ld      [$nonce+8],$r2
381         movnz   %icc,$r3,$h3
382         ld      [$nonce+12],$r3
383
384         addcc   $r0,$h0,$h0             ! accumulate nonce
385         addccc  $r1,$h1,$h1
386         addccc  $r2,$h2,$h2
387         addc    $r3,$h3,$h3
388
389         srl     $h0,8,$r0
390         stb     $h0,[$mac+0]            ! store little-endian result
391         srl     $h0,16,$r1
392         stb     $r0,[$mac+1]
393         srl     $h0,24,$r2
394         stb     $r1,[$mac+2]
395         stb     $r2,[$mac+3]
396
397         srl     $h1,8,$r0
398         stb     $h1,[$mac+4]
399         srl     $h1,16,$r1
400         stb     $r0,[$mac+5]
401         srl     $h1,24,$r2
402         stb     $r1,[$mac+6]
403         stb     $r2,[$mac+7]
404
405         srl     $h2,8,$r0
406         stb     $h2,[$mac+8]
407         srl     $h2,16,$r1
408         stb     $r0,[$mac+9]
409         srl     $h2,24,$r2
410         stb     $r1,[$mac+10]
411         stb     $r2,[$mac+11]
412
413         srl     $h3,8,$r0
414         stb     $h3,[$mac+12]
415         srl     $h3,16,$r1
416         stb     $r0,[$mac+13]
417         srl     $h3,24,$r2
418         stb     $r1,[$mac+14]
419         stb     $r2,[$mac+15]
420
421         ret
422         restore
423 .size   poly1305_emit,.-poly1305_emit
424 ___
425
426 {
427 my ($ctx,$inp,$len,$padbit) = map("%i$_",(0..3));
428 my ($in0,$in1,$in2,$in3,$in4) = map("%o$_",(0..4));
429 my ($i1,$step,$shr,$shl) = map("%l$_",(0..7));
430 my $i2=$step;
431
432 my ($h0lo,$h0hi,$h1lo,$h1hi,$h2lo,$h2hi,$h3lo,$h3hi,
433     $two0,$two32,$two64,$two96,$two130,$five_two130,
434     $r0lo,$r0hi,$r1lo,$r1hi,$r2lo,$r2hi,
435     $s2lo,$s2hi,$s3lo,$s3hi,
436     $c0lo,$c0hi,$c1lo,$c1hi,$c2lo,$c2hi,$c3lo,$c3hi) = map("%f".2*$_,(0..31));
437 # borrowings
438 my ($r3lo,$r3hi,$s1lo,$s1hi) = ($c0lo,$c0hi,$c1lo,$c1hi);
439 my ($x0,$x1,$x2,$x3) = ($c2lo,$c2hi,$c3lo,$c3hi);
440 my ($y0,$y1,$y2,$y3) = ($c1lo,$c1hi,$c3hi,$c3lo);
441
442 $code.=<<___;
443 .align  32
444 poly1305_init_fma:
445         save    %sp,-STACK_FRAME-16,%sp
446         nop
447
448 .Lpoly1305_init_fma:
449 1:      call    .+8
450         add     %o7,.Lconsts_fma-1b,%o7
451
452         ldd     [%o7+8*0],$two0                 ! load constants
453         ldd     [%o7+8*1],$two32
454         ldd     [%o7+8*2],$two64
455         ldd     [%o7+8*3],$two96
456         ldd     [%o7+8*5],$five_two130
457
458         std     $two0,[$ctx+8*0]                ! initial hash value, biased 0
459         std     $two32,[$ctx+8*1]
460         std     $two64,[$ctx+8*2]
461         std     $two96,[$ctx+8*3]
462
463         brz,pn  $inp,.Lno_key_fma
464         nop
465
466         stx     %fsr,[%sp+LOCALS]               ! save original %fsr
467         ldx     [%o7+8*6],%fsr                  ! load new %fsr
468
469         std     $two0,[$ctx+8*4]                ! key "template"
470         std     $two32,[$ctx+8*5]
471         std     $two64,[$ctx+8*6]
472         std     $two96,[$ctx+8*7]
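        ! "template" trick: the stores above set each double to 2^(52+32*i);
        ! the 32-bit key words are then written into the low mantissa halves,
        ! so the later ldd+fsubd pairs yield each word pre-scaled by its limb
        ! weight (2^0, 2^32, 2^64, 2^96) as an exact double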
473
474         and     $inp,7,$shr
475         andn    $inp,7,$inp                     ! align pointer
476         mov     8,$i1
477         sll     $shr,3,$shr
478         mov     16,$i2
479         neg     $shr,$shl
480
481         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian key
482         ldxa    [$inp+$i1]0x88,$in2
483
484         brz     $shr,.Lkey_aligned_fma
485         sethi   %hi(0xf0000000),$i1             !   0xf0000000
486
487         ldxa    [$inp+$i2]0x88,$in4
488
489         srlx    $in0,$shr,$in0                  ! align data
490         sllx    $in2,$shl,$in1
491         srlx    $in2,$shr,$in2
492         or      $in1,$in0,$in0
493         sllx    $in4,$shl,$in3
494         or      $in3,$in2,$in2
495
496 .Lkey_aligned_fma:
497         or      $i1,3,$i2                       !   0xf0000003
498         srlx    $in0,32,$in1
499         andn    $in0,$i1,$in0                   ! &=0x0fffffff
500         andn    $in1,$i2,$in1                   ! &=0x0ffffffc
501         srlx    $in2,32,$in3
502         andn    $in2,$i2,$in2
503         andn    $in3,$i2,$in3
504
505         st      $in0,[$ctx+`8*4+4`]             ! fill "template"
506         st      $in1,[$ctx+`8*5+4`]
507         st      $in2,[$ctx+`8*6+4`]
508         st      $in3,[$ctx+`8*7+4`]
509
510         ldd     [$ctx+8*4],$h0lo                ! load [biased] key
511         ldd     [$ctx+8*5],$h1lo
512         ldd     [$ctx+8*6],$h2lo
513         ldd     [$ctx+8*7],$h3lo
514
515         fsubd   $h0lo,$two0, $h0lo              ! r0
516          ldd    [%o7+8*7],$two0                 ! more constants
517         fsubd   $h1lo,$two32,$h1lo              ! r1
518          ldd    [%o7+8*8],$two32
519         fsubd   $h2lo,$two64,$h2lo              ! r2
520          ldd    [%o7+8*9],$two64
521         fsubd   $h3lo,$two96,$h3lo              ! r3
522          ldd    [%o7+8*10],$two96
523
524         fmuld   $five_two130,$h1lo,$s1lo        ! s1
525         fmuld   $five_two130,$h2lo,$s2lo        ! s2
526         fmuld   $five_two130,$h3lo,$s3lo        ! s3
527
528         faddd   $h0lo,$two0, $h0hi
529         faddd   $h1lo,$two32,$h1hi
530         faddd   $h2lo,$two64,$h2hi
531         faddd   $h3lo,$two96,$h3hi
532
533         fsubd   $h0hi,$two0, $h0hi
534          ldd    [%o7+8*11],$two0                ! more constants
535         fsubd   $h1hi,$two32,$h1hi
536          ldd    [%o7+8*12],$two32
537         fsubd   $h2hi,$two64,$h2hi
538          ldd    [%o7+8*13],$two64
539         fsubd   $h3hi,$two96,$h3hi
540
541         fsubd   $h0lo,$h0hi,$h0lo
542          std    $h0hi,[$ctx+8*5]                ! r0hi
543         fsubd   $h1lo,$h1hi,$h1lo
544          std    $h1hi,[$ctx+8*7]                ! r1hi
545         fsubd   $h2lo,$h2hi,$h2lo
546          std    $h2hi,[$ctx+8*9]                ! r2hi
547         fsubd   $h3lo,$h3hi,$h3lo
548          std    $h3hi,[$ctx+8*11]               ! r3hi
549
550         faddd   $s1lo,$two0, $s1hi
551         faddd   $s2lo,$two32,$s2hi
552         faddd   $s3lo,$two64,$s3hi
553
554         fsubd   $s1hi,$two0, $s1hi
555         fsubd   $s2hi,$two32,$s2hi
556         fsubd   $s3hi,$two64,$s3hi
557
558         fsubd   $s1lo,$s1hi,$s1lo
559         fsubd   $s2lo,$s2hi,$s2lo
560         fsubd   $s3lo,$s3hi,$s3lo
561
562         ldx     [%sp+LOCALS],%fsr               ! restore %fsr
563
564         std     $h0lo,[$ctx+8*4]                ! r0lo
565         std     $h1lo,[$ctx+8*6]                ! r1lo
566         std     $h2lo,[$ctx+8*8]                ! r2lo
567         std     $h3lo,[$ctx+8*10]               ! r3lo
568
569         std     $s1hi,[$ctx+8*13]
570         std     $s2hi,[$ctx+8*15]
571         std     $s3hi,[$ctx+8*17]
572
573         std     $s1lo,[$ctx+8*12]
574         std     $s2lo,[$ctx+8*14]
575         std     $s3lo,[$ctx+8*16]
576
577         add     %o7,poly1305_blocks_fma-.Lconsts_fma,%o0
578         add     %o7,poly1305_emit_fma-.Lconsts_fma,%o1
579         STPTR   %o0,[%i2]
580         STPTR   %o1,[%i2+SIZE_T]
581
582         ret
583         restore %g0,1,%o0                       ! return 1
584
585 .Lno_key_fma:
586         ret
587         restore %g0,%g0,%o0                     ! return 0
588 .size   poly1305_init_fma,.-poly1305_init_fma
589
590 .align  32
591 poly1305_blocks_fma:
592         save    %sp,-STACK_FRAME-48,%sp
593         srlx    $len,4,$len
594
595         brz,pn  $len,.Labort
596         sub     $len,1,$len
597
598 1:      call    .+8
599         add     %o7,.Lconsts_fma-1b,%o7
600
601         ldd     [%o7+8*0],$two0                 ! load constants
602         ldd     [%o7+8*1],$two32
603         ldd     [%o7+8*2],$two64
604         ldd     [%o7+8*3],$two96
605         ldd     [%o7+8*4],$two130
606         ldd     [%o7+8*5],$five_two130
607
608         ldd     [$ctx+8*0],$h0lo                ! load [biased] hash value
609         ldd     [$ctx+8*1],$h1lo
610         ldd     [$ctx+8*2],$h2lo
611         ldd     [$ctx+8*3],$h3lo
612
613         std     $two0,[%sp+LOCALS+8*0]          ! input "template"
614         sethi   %hi((1023+52+96)<<20),$in3
615         std     $two32,[%sp+LOCALS+8*1]
616         or      $padbit,$in3,$in3
617         std     $two64,[%sp+LOCALS+8*2]
618         st      $in3,[%sp+LOCALS+8*3]
619
620         and     $inp,7,$shr
621         andn    $inp,7,$inp                     ! align pointer
622         mov     8,$i1
623         sll     $shr,3,$shr
624         mov     16,$step
625         neg     $shr,$shl
626
627         ldxa    [$inp+%g0]0x88,$in0             ! load little-endian input
628         brz     $shr,.Linp_aligned_fma
629         ldxa    [$inp+$i1]0x88,$in2
630
631         ldxa    [$inp+$step]0x88,$in4
632         add     $inp,8,$inp
633
634         srlx    $in0,$shr,$in0                  ! align data
635         sllx    $in2,$shl,$in1
636         srlx    $in2,$shr,$in2
637         or      $in1,$in0,$in0
638         sllx    $in4,$shl,$in3
639         srlx    $in4,$shr,$in4                  ! pre-shift
640         or      $in3,$in2,$in2
641
642 .Linp_aligned_fma:
643         srlx    $in0,32,$in1
644         movrz   $len,0,$step
645         srlx    $in2,32,$in3
646         add     $step,$inp,$inp                 ! conditional advance
647
648         st      $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
649         st      $in1,[%sp+LOCALS+8*1+4]
650         st      $in2,[%sp+LOCALS+8*2+4]
651         st      $in3,[%sp+LOCALS+8*3+4]
652
653         ldd     [$ctx+8*4],$r0lo                ! load key
654         ldd     [$ctx+8*5],$r0hi
655         ldd     [$ctx+8*6],$r1lo
656         ldd     [$ctx+8*7],$r1hi
657         ldd     [$ctx+8*8],$r2lo
658         ldd     [$ctx+8*9],$r2hi
659         ldd     [$ctx+8*10],$r3lo
660         ldd     [$ctx+8*11],$r3hi
661         ldd     [$ctx+8*12],$s1lo
662         ldd     [$ctx+8*13],$s1hi
663         ldd     [$ctx+8*14],$s2lo
664         ldd     [$ctx+8*15],$s2hi
665         ldd     [$ctx+8*16],$s3lo
666         ldd     [$ctx+8*17],$s3hi
667
668         stx     %fsr,[%sp+LOCALS+8*4]           ! save original %fsr
669         ldx     [%o7+8*6],%fsr                  ! load new %fsr
670
671         subcc   $len,1,$len
672         movrz   $len,0,$step
673
674         ldd     [%sp+LOCALS+8*0],$x0            ! load biased input
675         ldd     [%sp+LOCALS+8*1],$x1
676         ldd     [%sp+LOCALS+8*2],$x2
677         ldd     [%sp+LOCALS+8*3],$x3
678
679         fsubd   $h0lo,$two0, $h0lo              ! de-bias hash value
680         fsubd   $h1lo,$two32,$h1lo
681          ldxa   [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
682         fsubd   $h2lo,$two64,$h2lo
683         fsubd   $h3lo,$two96,$h3lo
684          ldxa   [$inp+$i1]0x88,$in2
685
686         fsubd   $x0,$two0, $x0                  ! de-bias input
687         fsubd   $x1,$two32,$x1
688         fsubd   $x2,$two64,$x2
689         fsubd   $x3,$two96,$x3
690
691         brz     $shr,.Linp_aligned_fma2
692         add     $step,$inp,$inp                 ! conditional advance
693
694         sllx    $in0,$shl,$in1                  ! align data
695         srlx    $in0,$shr,$in3
696         or      $in1,$in4,$in0
697         sllx    $in2,$shl,$in1
698         srlx    $in2,$shr,$in4                  ! pre-shift
699         or      $in3,$in1,$in2
700 .Linp_aligned_fma2:
701         srlx    $in0,32,$in1
702         srlx    $in2,32,$in3
703
704         faddd   $h0lo,$x0,$x0                   ! accumulate input
705          stw    $in0,[%sp+LOCALS+8*0+4]
706         faddd   $h1lo,$x1,$x1
707          stw    $in1,[%sp+LOCALS+8*1+4]
708         faddd   $h2lo,$x2,$x2
709          stw    $in2,[%sp+LOCALS+8*2+4]
710         faddd   $h3lo,$x3,$x3
711          stw    $in3,[%sp+LOCALS+8*3+4]
712
713         b       .Lentry_fma
714         nop
715
716 .align  16
717 .Loop_fma:
718         ldxa    [$inp+%g0]0x88,$in0             ! modulo-scheduled input load
719         ldxa    [$inp+$i1]0x88,$in2
720         movrz   $len,0,$step
721
722         faddd   $y0,$h0lo,$h0lo                 ! accumulate input
723         faddd   $y1,$h0hi,$h0hi
724         faddd   $y2,$h2lo,$h2lo
725         faddd   $y3,$h2hi,$h2hi
726
727         brz,pn  $shr,.Linp_aligned_fma3
728         add     $step,$inp,$inp                 ! conditional advance
729
730         sllx    $in0,$shl,$in1                  ! align data
731         srlx    $in0,$shr,$in3
732         or      $in1,$in4,$in0
733         sllx    $in2,$shl,$in1
734         srlx    $in2,$shr,$in4                  ! pre-shift
735         or      $in3,$in1,$in2
736
737 .Linp_aligned_fma3:
738         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
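        ! carry propagation via the magic-constant trick: with round-to-zero
        ! set in %fsr, (x + 2^(52+k)) - 2^(52+k) yields x rounded toward zero
        ! to a multiple of 2^k; that part is moved into the next-higher limb,
        ! and the part at or above 2^130 is folded back through 5/2^130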
739         faddd   $two64,$h1lo,$c1lo
740          srlx   $in0,32,$in1
741         faddd   $two64,$h1hi,$c1hi
742          srlx   $in2,32,$in3
743         faddd   $two130,$h3lo,$c3lo
744          st     $in0,[%sp+LOCALS+8*0+4]         ! fill "template"
745         faddd   $two130,$h3hi,$c3hi
746          st     $in1,[%sp+LOCALS+8*1+4]
747         faddd   $two32,$h0lo,$c0lo
748          st     $in2,[%sp+LOCALS+8*2+4]
749         faddd   $two32,$h0hi,$c0hi
750          st     $in3,[%sp+LOCALS+8*3+4]
751         faddd   $two96,$h2lo,$c2lo
752         faddd   $two96,$h2hi,$c2hi
753
754         fsubd   $c1lo,$two64,$c1lo
755         fsubd   $c1hi,$two64,$c1hi
756         fsubd   $c3lo,$two130,$c3lo
757         fsubd   $c3hi,$two130,$c3hi
758         fsubd   $c0lo,$two32,$c0lo
759         fsubd   $c0hi,$two32,$c0hi
760         fsubd   $c2lo,$two96,$c2lo
761         fsubd   $c2hi,$two96,$c2hi
762
763         fsubd   $h1lo,$c1lo,$h1lo
764         fsubd   $h1hi,$c1hi,$h1hi
765         fsubd   $h3lo,$c3lo,$h3lo
766         fsubd   $h3hi,$c3hi,$h3hi
767         fsubd   $h2lo,$c2lo,$h2lo
768         fsubd   $h2hi,$c2hi,$h2hi
769         fsubd   $h0lo,$c0lo,$h0lo
770         fsubd   $h0hi,$c0hi,$h0hi
771
772         faddd   $h1lo,$c0lo,$h1lo
773         faddd   $h1hi,$c0hi,$h1hi
774         faddd   $h3lo,$c2lo,$h3lo
775         faddd   $h3hi,$c2hi,$h3hi
776         faddd   $h2lo,$c1lo,$h2lo
777         faddd   $h2hi,$c1hi,$h2hi
778         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
779         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
780
781         faddd   $h1lo,$h1hi,$x1
782          ldd    [$ctx+8*12],$s1lo               ! reload constants
783         faddd   $h3lo,$h3hi,$x3
784          ldd    [$ctx+8*13],$s1hi
785         faddd   $h2lo,$h2hi,$x2
786          ldd    [$ctx+8*10],$r3lo
787         faddd   $h0lo,$h0hi,$x0
788          ldd    [$ctx+8*11],$r3hi
789
790 .Lentry_fma:
791         fmuld   $x1,$s3lo,$h0lo
792         fmuld   $x1,$s3hi,$h0hi
793         fmuld   $x1,$r1lo,$h2lo
794         fmuld   $x1,$r1hi,$h2hi
795         fmuld   $x1,$r0lo,$h1lo
796         fmuld   $x1,$r0hi,$h1hi
797         fmuld   $x1,$r2lo,$h3lo
798         fmuld   $x1,$r2hi,$h3hi
799
800         fmaddd  $x3,$s1lo,$h0lo,$h0lo
801         fmaddd  $x3,$s1hi,$h0hi,$h0hi
802         fmaddd  $x3,$s3lo,$h2lo,$h2lo
803         fmaddd  $x3,$s3hi,$h2hi,$h2hi
804         fmaddd  $x3,$s2lo,$h1lo,$h1lo
805         fmaddd  $x3,$s2hi,$h1hi,$h1hi
806         fmaddd  $x3,$r0lo,$h3lo,$h3lo
807         fmaddd  $x3,$r0hi,$h3hi,$h3hi
808
809         fmaddd  $x2,$s2lo,$h0lo,$h0lo
810         fmaddd  $x2,$s2hi,$h0hi,$h0hi
811         fmaddd  $x2,$r0lo,$h2lo,$h2lo
812         fmaddd  $x2,$r0hi,$h2hi,$h2hi
813         fmaddd  $x2,$s3lo,$h1lo,$h1lo
814          ldd    [%sp+LOCALS+8*0],$y0            ! load [biased] input
815         fmaddd  $x2,$s3hi,$h1hi,$h1hi
816          ldd    [%sp+LOCALS+8*1],$y1
817         fmaddd  $x2,$r1lo,$h3lo,$h3lo
818          ldd    [%sp+LOCALS+8*2],$y2
819         fmaddd  $x2,$r1hi,$h3hi,$h3hi
820          ldd    [%sp+LOCALS+8*3],$y3
821
822         fmaddd  $x0,$r0lo,$h0lo,$h0lo
823          fsubd  $y0,$two0, $y0                  ! de-bias input
824         fmaddd  $x0,$r0hi,$h0hi,$h0hi
825          fsubd  $y1,$two32,$y1
826         fmaddd  $x0,$r2lo,$h2lo,$h2lo
827          fsubd  $y2,$two64,$y2
828         fmaddd  $x0,$r2hi,$h2hi,$h2hi
829          fsubd  $y3,$two96,$y3
830         fmaddd  $x0,$r1lo,$h1lo,$h1lo
831         fmaddd  $x0,$r1hi,$h1hi,$h1hi
832         fmaddd  $x0,$r3lo,$h3lo,$h3lo
833         fmaddd  $x0,$r3hi,$h3hi,$h3hi
834
835         bcc     SIZE_T_CC,.Loop_fma
836         subcc   $len,1,$len
837
838         !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! base 2^48 -> base 2^32
839         faddd   $h0lo,$two32,$c0lo
840         faddd   $h0hi,$two32,$c0hi
841         faddd   $h2lo,$two96,$c2lo
842         faddd   $h2hi,$two96,$c2hi
843         faddd   $h1lo,$two64,$c1lo
844         faddd   $h1hi,$two64,$c1hi
845         faddd   $h3lo,$two130,$c3lo
846         faddd   $h3hi,$two130,$c3hi
847
848         fsubd   $c0lo,$two32,$c0lo
849         fsubd   $c0hi,$two32,$c0hi
850         fsubd   $c2lo,$two96,$c2lo
851         fsubd   $c2hi,$two96,$c2hi
852         fsubd   $c1lo,$two64,$c1lo
853         fsubd   $c1hi,$two64,$c1hi
854         fsubd   $c3lo,$two130,$c3lo
855         fsubd   $c3hi,$two130,$c3hi
856
857         fsubd   $h1lo,$c1lo,$h1lo
858         fsubd   $h1hi,$c1hi,$h1hi
859         fsubd   $h3lo,$c3lo,$h3lo
860         fsubd   $h3hi,$c3hi,$h3hi
861         fsubd   $h2lo,$c2lo,$h2lo
862         fsubd   $h2hi,$c2hi,$h2hi
863         fsubd   $h0lo,$c0lo,$h0lo
864         fsubd   $h0hi,$c0hi,$h0hi
865
866         faddd   $h1lo,$c0lo,$h1lo
867         faddd   $h1hi,$c0hi,$h1hi
868         faddd   $h3lo,$c2lo,$h3lo
869         faddd   $h3hi,$c2hi,$h3hi
870         faddd   $h2lo,$c1lo,$h2lo
871         faddd   $h2hi,$c1hi,$h2hi
872         fmaddd  $five_two130,$c3lo,$h0lo,$h0lo
873         fmaddd  $five_two130,$c3hi,$h0hi,$h0hi
874
875         faddd   $h1lo,$h1hi,$x1
876         faddd   $h3lo,$h3hi,$x3
877         faddd   $h2lo,$h2hi,$x2
878         faddd   $h0lo,$h0hi,$x0
879
880         faddd   $x1,$two32,$x1                  ! bias
881         faddd   $x3,$two96,$x3
882         faddd   $x2,$two64,$x2
883         faddd   $x0,$two0, $x0
884
885         ldx     [%sp+LOCALS+8*4],%fsr           ! restore saved %fsr
886
887         std     $x1,[$ctx+8*1]                  ! store [biased] hash value
888         std     $x3,[$ctx+8*3]
889         std     $x2,[$ctx+8*2]
890         std     $x0,[$ctx+8*0]
891
892 .Labort:
893         ret
894         restore
895 .size   poly1305_blocks_fma,.-poly1305_blocks_fma
896 ___
897 {
898 my ($mac,$nonce)=($inp,$len);
899
900 my ($h0,$h1,$h2,$h3,$h4, $d0,$d1,$d2,$d3, $mask
901    ) = (map("%l$_",(0..5)),map("%o$_",(0..4)));
902
903 $code.=<<___;
904 .align  32
905 poly1305_emit_fma:
906         save    %sp,-STACK_FRAME,%sp
907
908         ld      [$ctx+8*0+0],$d0                ! load hash
909         ld      [$ctx+8*0+4],$h0
910         ld      [$ctx+8*1+0],$d1
911         ld      [$ctx+8*1+4],$h1
912         ld      [$ctx+8*2+0],$d2
913         ld      [$ctx+8*2+4],$h2
914         ld      [$ctx+8*3+0],$d3
915         ld      [$ctx+8*3+4],$h3
916
917         sethi   %hi(0xfff00000),$mask
918         andn    $d0,$mask,$d0                   ! mask exponent
919         andn    $d1,$mask,$d1
920         andn    $d2,$mask,$d2
921         andn    $d3,$mask,$d3                   ! can be partially reduced...
922         mov     3,$mask
923
924         srl     $d3,2,$padbit                   ! ... so reduce
925         and     $d3,$mask,$h4
926         andn    $d3,$mask,$d3
927         add     $padbit,$d3,$d3
928
929         addcc   $d3,$h0,$h0
930         addccc  $d0,$h1,$h1
931         addccc  $d1,$h2,$h2
932         addccc  $d2,$h3,$h3
933         addc    %g0,$h4,$h4
934
935         addcc   $h0,5,$d0                       ! compare to modulus
936         addccc  $h1,0,$d1
937         addccc  $h2,0,$d2
938         addccc  $h3,0,$d3
939         addc    $h4,0,$mask
940
941         srl     $mask,2,$mask                   ! did it carry/borrow?
942         neg     $mask,$mask
943         sra     $mask,31,$mask                  ! mask
944
945         andn    $h0,$mask,$h0
946         and     $d0,$mask,$d0
947         andn    $h1,$mask,$h1
948         and     $d1,$mask,$d1
949         or      $d0,$h0,$h0
950         ld      [$nonce+0],$d0                  ! load nonce
951         andn    $h2,$mask,$h2
952         and     $d2,$mask,$d2
953         or      $d1,$h1,$h1
954         ld      [$nonce+4],$d1
955         andn    $h3,$mask,$h3
956         and     $d3,$mask,$d3
957         or      $d2,$h2,$h2
958         ld      [$nonce+8],$d2
959         or      $d3,$h3,$h3
960         ld      [$nonce+12],$d3
961
962         addcc   $d0,$h0,$h0                     ! accumulate nonce
963         addccc  $d1,$h1,$h1
964         addccc  $d2,$h2,$h2
965         addc    $d3,$h3,$h3
966
967         stb     $h0,[$mac+0]                    ! write little-endian result
968         srl     $h0,8,$h0
969         stb     $h1,[$mac+4]
970         srl     $h1,8,$h1
971         stb     $h2,[$mac+8]
972         srl     $h2,8,$h2
973         stb     $h3,[$mac+12]
974         srl     $h3,8,$h3
975
976         stb     $h0,[$mac+1]
977         srl     $h0,8,$h0
978         stb     $h1,[$mac+5]
979         srl     $h1,8,$h1
980         stb     $h2,[$mac+9]
981         srl     $h2,8,$h2
982         stb     $h3,[$mac+13]
983         srl     $h3,8,$h3
984
985         stb     $h0,[$mac+2]
986         srl     $h0,8,$h0
987         stb     $h1,[$mac+6]
988         srl     $h1,8,$h1
989         stb     $h2,[$mac+10]
990         srl     $h2,8,$h2
991         stb     $h3,[$mac+14]
992         srl     $h3,8,$h3
993
994         stb     $h0,[$mac+3]
995         stb     $h1,[$mac+7]
996         stb     $h2,[$mac+11]
997         stb     $h3,[$mac+15]
998
999         ret
1000         restore
1001 .size   poly1305_emit_fma,.-poly1305_emit_fma
1002 ___
1003 }
1004
1005 $code.=<<___;
1006 .align  64
1007 .Lconsts_fma:
1008 .word   0x43300000,0x00000000           ! 2^(52+0)
1009 .word   0x45300000,0x00000000           ! 2^(52+32)
1010 .word   0x47300000,0x00000000           ! 2^(52+64)
1011 .word   0x49300000,0x00000000           ! 2^(52+96)
1012 .word   0x4b500000,0x00000000           ! 2^(52+130)
1013
1014 .word   0x37f40000,0x00000000           ! 5/2^130
1015 .word   0,1<<30                         ! fsr: truncate, no exceptions
1016
1017 .word   0x44300000,0x00000000           ! 2^(52+16+0)
1018 .word   0x46300000,0x00000000           ! 2^(52+16+32)
1019 .word   0x48300000,0x00000000           ! 2^(52+16+64)
1020 .word   0x4a300000,0x00000000           ! 2^(52+16+96)
1021 .word   0x3e300000,0x00000000           ! 2^(52+16+0-96)
1022 .word   0x40300000,0x00000000           ! 2^(52+16+32-96)
1023 .word   0x42300000,0x00000000           ! 2^(52+16+64-96)
1024 .asciz  "Poly1305 for SPARCv9/VIS3/FMA, CRYPTOGAMS by <appro\@openssl.org>"
1025 .align  4
1026 ___
1027 }
1028 \f
1029 # The purpose of these subroutines is to explicitly encode VIS3 and FMA
1030 # instructions, so that the module can be compiled without specifying
1031 # the extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
1032 # The idea is to keep the option of producing a "universal" binary and let
1033 # the programmer detect at run-time whether the current CPU is VIS capable.
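# For instance (illustrative; register numbers follow the %bias table in
# unvis3() below), the VIS3 instruction
#
#       addxc   %o0,%o1,%o2
#
# has rd=10, rs1=8, rs2=9 and opf=0x011, so unvis3() rewrites it as
#
#       .word   0x95b20229      ! addxc %o0,%o1,%o2
#
# which any SPARCv9 assembler accepts, VIS3-aware or not.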
1034 sub unvis3 {
1035 my ($mnemonic,$rs1,$rs2,$rd)=@_;
1036 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
1037 my ($ref,$opf);
1038 my %visopf = (  "addxc"         => 0x011,
1039                 "addxccc"       => 0x013,
1040                 "umulxhi"       => 0x016        );
1041
1042     $ref = "$mnemonic\t$rs1,$rs2,$rd";
1043
1044     if ($opf=$visopf{$mnemonic}) {
1045         foreach ($rs1,$rs2,$rd) {
1046             return $ref if (!/%([goli])([0-9])/);
1047             $_=$bias{$1}+$2;
1048         }
1049
1050         return  sprintf ".word\t0x%08x !%s",
1051                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
1052                         $ref;
1053     } else {
1054         return $ref;
1055     }
1056 }
1057
1058 sub unfma {
1059 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
1060 my ($ref,$opf);
1061 my %fmaopf = (  "fmadds"        => 0x1,
1062                 "fmaddd"        => 0x2,
1063                 "fmsubs"        => 0x5,
1064                 "fmsubd"        => 0x6          );
1065
1066     $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
1067
1068     if ($opf=$fmaopf{$mnemonic}) {
1069         foreach ($rs1,$rs2,$rs3,$rd) {
1070             return $ref if (!/%f([0-9]{1,2})/);
1071             $_=$1;
1072             if ($1>=32) {
1073                 return $ref if ($1&1);
1074                 # re-encode for upper double register addressing
1075                 $_=($1|$1>>5)&31;
1076             }
1077         }
1078
1079         return  sprintf ".word\t0x%08x !%s",
1080                         0x81b80000|$rd<<25|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
1081                         $ref;
1082     } else {
1083         return $ref;
1084     }
1085 }
1086
1087 foreach (split("\n",$code)) {
1088         s/\`([^\`]*)\`/eval $1/ge;
1089
1090         s/\b(umulxhi|addxc[c]{0,2})\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
1091                 &unvis3($1,$2,$3,$4)
1092          /ge    or
1093         s/\b(fmadd[sd])\s+(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+),\s*(%f[0-9]+)/
1094                 &unfma($1,$2,$3,$4,$5)
1095          /ge;
1096
1097         print $_,"\n";
1098 }
1099
1100 close STDOUT;