crypto/perlasm/sparcv9_modes.pl

   1 #!/usr/bin/env perl
   2
# Mode-specific implementations for SPARC Architecture 2011. There is
# a T4 dependency, though: an ASI value that is not specified in the
# Architecture Manual. But as the SPARC universe is rather monocultural,
# we assume that a processor capable of executing crypto instructions
# can handle the ASI in question as well. This means that we ought to
# keep our eyes open when new processors emerge...
#
# As for the above-mentioned ASI: it is the so-called "block
# initializing store", which cancels the "read" in "read-update-write"
# on cache lines. This is a "cooperative" optimization, as it reduces
# overall pressure on the memory interface. Its benefits can't be
# observed/quantified with the usual benchmarks; on the contrary, you
# can notice that single-thread performance for parallelizable modes is
# ~1.5% worse for the largest block sizes [though a few percent better
# for not so long ones]. All of this is based on suggestions from
# David Miller.
  18
  19 my ($inp,$out,$len,$key,$ivec,$enc)=map("%i$_",(0..5));
  20 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init)=map("%l$_",(0..7));
  21
  22 sub alg_cbc_encrypt_implement {
  23 my ($alg,$bits) = @_;
  24
  25 $::code.=<<___;
  26 .globl  ${alg}${bits}_t4_cbc_encrypt
  27 .align  32
  28 ${alg}${bits}_t4_cbc_encrypt:
  29         save            %sp, -$::frame, %sp
  30         sub             $inp, $out, $blk_init   ! $inp!=$out
  31 ___
  32 $::code.=<<___ if (!$::evp);
  33         andcc           $ivec, 7, $ivoff
  34         alignaddr       $ivec, %g0, $ivec
  35
  36         ldd             [$ivec + 0], %f0        ! load ivec
  37         bz,pt           %icc, 1f
  38         ldd             [$ivec + 8], %f2
  39         ldd             [$ivec + 16], %f4
  40         faligndata      %f0, %f2, %f0
  41         faligndata      %f2, %f4, %f2
  42 1:
  43 ___
  44 $::code.=<<___ if ($::evp);
  45         ld              [$ivec + 0], %f0
  46         ld              [$ivec + 4], %f1
  47         ld              [$ivec + 8], %f2
  48         ld              [$ivec + 12], %f3
  49 ___
  50 $::code.=<<___;
  51         prefetch        [$inp], 20
  52         prefetch        [$inp + 63], 20
  53         call            _${alg}${bits}_load_enckey
  54         and             $inp, 7, $ileft
  55         andn            $inp, 7, $inp
  56         sll             $ileft, 3, $ileft
  57         mov             64, $iright
  58         mov             0xff, $omask
  59         sub             $iright, $ileft, $iright
  60         and             $out, 7, $ooff
  61         cmp             $len, 127
  62         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
  63         movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
  64         brnz,pn         $blk_init, .L${bits}cbc_enc_blk !       $inp==$out)
  65         srl             $omask, $ooff, $omask
  66
  67         alignaddrl      $out, %g0, $out
  68         srlx            $len, 4, $len
  69         prefetch        [$out], 22
  70
  71 .L${bits}_cbc_enc_loop:
  72         ldx             [$inp + 0], %o0
  73         brz,pt          $ileft, 4f
  74         ldx             [$inp + 8], %o1
  75
  76         ldx             [$inp + 16], %o2
  77         sllx            %o0, $ileft, %o0
  78         srlx            %o1, $iright, %g1
  79         sllx            %o1, $ileft, %o1
  80         or              %g1, %o0, %o0
  81         srlx            %o2, $iright, %o2
  82         or              %o2, %o1, %o1
  83 4:
  84         xor             %g4, %o0, %o0           ! ^= rk[0]
  85         xor             %g5, %o1, %o1
  86         movxtod         %o0, %f12
  87         movxtod         %o1, %f14
  88
  89         fxor            %f12, %f0, %f0          ! ^= ivec
  90         fxor            %f14, %f2, %f2
  91         prefetch        [$out + 63], 22
  92         prefetch        [$inp + 16+63], 20
  93         call            _${alg}${bits}_encrypt_1x
  94         add             $inp, 16, $inp
  95
  96         brnz,pn         $ooff, 2f
  97         sub             $len, 1, $len
  98
  99         std             %f0, [$out + 0]
 100         std             %f2, [$out + 8]
 101         brnz,pt         $len, .L${bits}_cbc_enc_loop
 102         add             $out, 16, $out
 103 ___
 104 $::code.=<<___ if ($::evp);
 105         st              %f0, [$ivec + 0]
 106         st              %f1, [$ivec + 4]
 107         st              %f2, [$ivec + 8]
 108         st              %f3, [$ivec + 12]
 109 ___
 110 $::code.=<<___ if (!$::evp);
 111         brnz,pn         $ivoff, 3f
 112         nop
 113
 114         std             %f0, [$ivec + 0]        ! write out ivec
 115         std             %f2, [$ivec + 8]
 116 ___
 117 $::code.=<<___;
 118         ret
 119         restore
 120
 121 .align  16
 122 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 123                                                 ! and ~3x deterioration
 124                                                 ! in inp==out case
 125         faligndata      %f0, %f0, %f4           ! handle unaligned output
 126         faligndata      %f0, %f2, %f6
 127         faligndata      %f2, %f2, %f8
 128
 129         stda            %f4, [$out + $omask]0xc0        ! partial store
 130         std             %f6, [$out + 8]
 131         add             $out, 16, $out
 132         orn             %g0, $omask, $omask
 133         stda            %f8, [$out + $omask]0xc0        ! partial store
 134
 135         brnz,pt         $len, .L${bits}_cbc_enc_loop+4
 136         orn             %g0, $omask, $omask
 137 ___
 138 $::code.=<<___ if ($::evp);
 139         st              %f0, [$ivec + 0]
 140         st              %f1, [$ivec + 4]
 141         st              %f2, [$ivec + 8]
 142         st              %f3, [$ivec + 12]
 143 ___
 144 $::code.=<<___ if (!$::evp);
 145         brnz,pn         $ivoff, 3f
 146         nop
 147
 148         std             %f0, [$ivec + 0]        ! write out ivec
 149         std             %f2, [$ivec + 8]
 150         ret
 151         restore
 152
 153 .align  16
 154 3:      alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
 155         mov             0xff, $omask
 156         srl             $omask, $ivoff, $omask
 157         faligndata      %f0, %f0, %f4
 158         faligndata      %f0, %f2, %f6
 159         faligndata      %f2, %f2, %f8
 160         stda            %f4, [$ivec + $omask]0xc0
 161         std             %f6, [$ivec + 8]
 162         add             $ivec, 16, $ivec
 163         orn             %g0, $omask, $omask
 164         stda            %f8, [$ivec + $omask]0xc0
 165 ___
 166 $::code.=<<___;
 167         ret
 168         restore
 169
 170 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 171 .align  32
 172 .L${bits}cbc_enc_blk:
 173         add     $out, $len, $blk_init
 174         and     $blk_init, 63, $blk_init        ! tail
 175         sub     $len, $blk_init, $len
 176         add     $blk_init, 15, $blk_init        ! round up to 16n
 177         srlx    $len, 4, $len
 178         srl     $blk_init, 4, $blk_init
 179
 180 .L${bits}_cbc_enc_blk_loop:
 181         ldx             [$inp + 0], %o0
 182         brz,pt          $ileft, 5f
 183         ldx             [$inp + 8], %o1
 184
 185         ldx             [$inp + 16], %o2
 186         sllx            %o0, $ileft, %o0
 187         srlx            %o1, $iright, %g1
 188         sllx            %o1, $ileft, %o1
 189         or              %g1, %o0, %o0
 190         srlx            %o2, $iright, %o2
 191         or              %o2, %o1, %o1
 192 5:
 193         xor             %g4, %o0, %o0           ! ^= rk[0]
 194         xor             %g5, %o1, %o1
 195         movxtod         %o0, %f12
 196         movxtod         %o1, %f14
 197
 198         fxor            %f12, %f0, %f0          ! ^= ivec
 199         fxor            %f14, %f2, %f2
 200         prefetch        [$inp + 16+63], 20
 201         call            _${alg}${bits}_encrypt_1x
 202         add             $inp, 16, $inp
 203         sub             $len, 1, $len
 204
 205         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 206         add             $out, 8, $out
 207         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 208         brnz,pt         $len, .L${bits}_cbc_enc_blk_loop
 209         add             $out, 8, $out
 210
 211         membar          #StoreLoad|#StoreStore
 212         brnz,pt         $blk_init, .L${bits}_cbc_enc_loop
 213         mov             $blk_init, $len
 214 ___
 215 $::code.=<<___ if ($::evp);
 216         st              %f0, [$ivec + 0]
 217         st              %f1, [$ivec + 4]
 218         st              %f2, [$ivec + 8]
 219         st              %f3, [$ivec + 12]
 220 ___
 221 $::code.=<<___ if (!$::evp);
 222         brnz,pn         $ivoff, 3b
 223         nop
 224
 225         std             %f0, [$ivec + 0]        ! write out ivec
 226         std             %f2, [$ivec + 8]
 227 ___
 228 $::code.=<<___;
 229         ret
 230         restore
 231 .type   ${alg}${bits}_t4_cbc_encrypt,#function
 232 .size   ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
 233 ___
 234 }
 235
 236 sub alg_cbc_decrypt_implement {
 237 my ($alg,$bits) = @_;
 238
 239 $::code.=<<___;
 240 .globl  ${alg}${bits}_t4_cbc_decrypt
 241 .align  32
 242 ${alg}${bits}_t4_cbc_decrypt:
 243         save            %sp, -$::frame, %sp
 244         sub             $inp, $out, $blk_init   ! $inp!=$out
 245 ___
 246 $::code.=<<___ if (!$::evp);
 247         andcc           $ivec, 7, $ivoff
 248         alignaddr       $ivec, %g0, $ivec
 249
 250         ldd             [$ivec + 0], %f12       ! load ivec
 251         bz,pt           %icc, 1f
 252         ldd             [$ivec + 8], %f14
 253         ldd             [$ivec + 16], %f0
 254         faligndata      %f12, %f14, %f12
 255         faligndata      %f14, %f0, %f14
 256 1:
 257 ___
 258 $::code.=<<___ if ($::evp);
 259         ld              [$ivec + 0], %f12       ! load ivec
 260         ld              [$ivec + 4], %f13
 261         ld              [$ivec + 8], %f14
 262         ld              [$ivec + 12], %f15
 263 ___
 264 $::code.=<<___;
 265         prefetch        [$inp], 20
 266         prefetch        [$inp + 63], 20
 267         call            _${alg}${bits}_load_deckey
 268         and             $inp, 7, $ileft
 269         andn            $inp, 7, $inp
 270         sll             $ileft, 3, $ileft
 271         mov             64, $iright
 272         mov             0xff, $omask
 273         sub             $iright, $ileft, $iright
 274         and             $out, 7, $ooff
 275         cmp             $len, 255
 276         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
 277         movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
 278         brnz,pn         $blk_init, .L${bits}cbc_dec_blk !       $inp==$out)
 279         srl             $omask, $ooff, $omask
 280
 281         andcc           $len, 16, %g0           ! is number of blocks even?
 282         srlx            $len, 4, $len
 283         alignaddrl      $out, %g0, $out
 284         bz              %icc, .L${bits}_cbc_dec_loop2x
 285         prefetch        [$out], 22
 286 .L${bits}_cbc_dec_loop:
 287         ldx             [$inp + 0], %o0
 288         brz,pt          $ileft, 4f
 289         ldx             [$inp + 8], %o1
 290
 291         ldx             [$inp + 16], %o2
 292         sllx            %o0, $ileft, %o0
 293         srlx            %o1, $iright, %g1
 294         sllx            %o1, $ileft, %o1
 295         or              %g1, %o0, %o0
 296         srlx            %o2, $iright, %o2
 297         or              %o2, %o1, %o1
 298 4:
 299         xor             %g4, %o0, %o2           ! ^= rk[0]
 300         xor             %g5, %o1, %o3
 301         movxtod         %o2, %f0
 302         movxtod         %o3, %f2
 303
 304         prefetch        [$out + 63], 22
 305         prefetch        [$inp + 16+63], 20
 306         call            _${alg}${bits}_decrypt_1x
 307         add             $inp, 16, $inp
 308
 309         fxor            %f12, %f0, %f0          ! ^= ivec
 310         fxor            %f14, %f2, %f2
 311         movxtod         %o0, %f12
 312         movxtod         %o1, %f14
 313
 314         brnz,pn         $ooff, 2f
 315         sub             $len, 1, $len
 316
 317         std             %f0, [$out + 0]
 318         std             %f2, [$out + 8]
 319         brnz,pt         $len, .L${bits}_cbc_dec_loop2x
 320         add             $out, 16, $out
 321 ___
 322 $::code.=<<___ if ($::evp);
 323         st              %f12, [$ivec + 0]
 324         st              %f13, [$ivec + 4]
 325         st              %f14, [$ivec + 8]
 326         st              %f15, [$ivec + 12]
 327 ___
 328 $::code.=<<___ if (!$::evp);
 329         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 330         nop
 331
 332         std             %f12, [$ivec + 0]       ! write out ivec
 333         std             %f14, [$ivec + 8]
 334 ___
 335 $::code.=<<___;
 336         ret
 337         restore
 338
 339 .align  16
 340 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 341                                                 ! and ~3x deterioration
 342                                                 ! in inp==out case
 343         faligndata      %f0, %f0, %f4           ! handle unaligned output
 344         faligndata      %f0, %f2, %f6
 345         faligndata      %f2, %f2, %f8
 346
 347         stda            %f4, [$out + $omask]0xc0        ! partial store
 348         std             %f6, [$out + 8]
 349         add             $out, 16, $out
 350         orn             %g0, $omask, $omask
 351         stda            %f8, [$out + $omask]0xc0        ! partial store
 352
 353         brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
 354         orn             %g0, $omask, $omask
 355 ___
 356 $::code.=<<___ if ($::evp);
 357         st              %f12, [$ivec + 0]
 358         st              %f13, [$ivec + 4]
 359         st              %f14, [$ivec + 8]
 360         st              %f15, [$ivec + 12]
 361 ___
 362 $::code.=<<___ if (!$::evp);
 363         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 364         nop
 365
 366         std             %f12, [$ivec + 0]       ! write out ivec
 367         std             %f14, [$ivec + 8]
 368 ___
 369 $::code.=<<___;
 370         ret
 371         restore
 372
 373 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 374 .align  32
 375 .L${bits}_cbc_dec_loop2x:
 376         ldx             [$inp + 0], %o0
 377         ldx             [$inp + 8], %o1
 378         ldx             [$inp + 16], %o2
 379         brz,pt          $ileft, 4f
 380         ldx             [$inp + 24], %o3
 381
 382         ldx             [$inp + 32], %o4
 383         sllx            %o0, $ileft, %o0
 384         srlx            %o1, $iright, %g1
 385         or              %g1, %o0, %o0
 386         sllx            %o1, $ileft, %o1
 387         srlx            %o2, $iright, %g1
 388         or              %g1, %o1, %o1
 389         sllx            %o2, $ileft, %o2
 390         srlx            %o3, $iright, %g1
 391         or              %g1, %o2, %o2
 392         sllx            %o3, $ileft, %o3
 393         srlx            %o4, $iright, %o4
 394         or              %o4, %o3, %o3
 395 4:
 396         xor             %g4, %o0, %o4           ! ^= rk[0]
 397         xor             %g5, %o1, %o5
 398         movxtod         %o4, %f0
 399         movxtod         %o5, %f2
 400         xor             %g4, %o2, %o4
 401         xor             %g5, %o3, %o5
 402         movxtod         %o4, %f4
 403         movxtod         %o5, %f6
 404
 405         prefetch        [$out + 63], 22
 406         prefetch        [$inp + 32+63], 20
 407         call            _${alg}${bits}_decrypt_2x
 408         add             $inp, 32, $inp
 409
 410         movxtod         %o0, %f8
 411         movxtod         %o1, %f10
 412         fxor            %f12, %f0, %f0          ! ^= ivec
 413         fxor            %f14, %f2, %f2
 414         movxtod         %o2, %f12
 415         movxtod         %o3, %f14
 416         fxor            %f8, %f4, %f4
 417         fxor            %f10, %f6, %f6
 418
 419         brnz,pn         $ooff, 2f
 420         sub             $len, 2, $len
 421
 422         std             %f0, [$out + 0]
 423         std             %f2, [$out + 8]
 424         std             %f4, [$out + 16]
 425         std             %f6, [$out + 24]
 426         brnz,pt         $len, .L${bits}_cbc_dec_loop2x
 427         add             $out, 32, $out
 428 ___
 429 $::code.=<<___ if ($::evp);
 430         st              %f12, [$ivec + 0]
 431         st              %f13, [$ivec + 4]
 432         st              %f14, [$ivec + 8]
 433         st              %f15, [$ivec + 12]
 434 ___
 435 $::code.=<<___ if (!$::evp);
 436         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 437         nop
 438
 439         std             %f12, [$ivec + 0]       ! write out ivec
 440         std             %f14, [$ivec + 8]
 441 ___
 442 $::code.=<<___;
 443         ret
 444         restore
 445
 446 .align  16
 447 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 448                                                 ! and ~3x deterioration
 449                                                 ! in inp==out case
 450         faligndata      %f0, %f0, %f8           ! handle unaligned output
 451         faligndata      %f0, %f2, %f0
 452         faligndata      %f2, %f4, %f2
 453         faligndata      %f4, %f6, %f4
 454         faligndata      %f6, %f6, %f6
 455         stda            %f8, [$out + $omask]0xc0        ! partial store
 456         std             %f0, [$out + 8]
 457         std             %f2, [$out + 16]
 458         std             %f4, [$out + 24]
 459         add             $out, 32, $out
 460         orn             %g0, $omask, $omask
 461         stda            %f6, [$out + $omask]0xc0        ! partial store
 462
 463         brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
 464         orn             %g0, $omask, $omask
 465 ___
 466 $::code.=<<___ if ($::evp);
 467         st              %f12, [$ivec + 0]
 468         st              %f13, [$ivec + 4]
 469         st              %f14, [$ivec + 8]
 470         st              %f15, [$ivec + 12]
 471 ___
 472 $::code.=<<___ if (!$::evp);
 473         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 474         nop
 475
 476         std             %f12, [$ivec + 0]       ! write out ivec
 477         std             %f14, [$ivec + 8]
 478         ret
 479         restore
 480
 481 .align  16
 482 .L${bits}_cbc_dec_unaligned_ivec:
 483         alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
 484         mov             0xff, $omask
 485         srl             $omask, $ivoff, $omask
 486         faligndata      %f12, %f12, %f0
 487         faligndata      %f12, %f14, %f2
 488         faligndata      %f14, %f14, %f4
 489         stda            %f0, [$ivec + $omask]0xc0
 490         std             %f2, [$ivec + 8]
 491         add             $ivec, 16, $ivec
 492         orn             %g0, $omask, $omask
 493         stda            %f4, [$ivec + $omask]0xc0
 494 ___
 495 $::code.=<<___;
 496         ret
 497         restore
 498
 499 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 500 .align  32
 501 .L${bits}cbc_dec_blk:
 502         add     $out, $len, $blk_init
 503         and     $blk_init, 63, $blk_init        ! tail
 504         sub     $len, $blk_init, $len
 505         add     $blk_init, 15, $blk_init        ! round up to 16n
 506         srlx    $len, 4, $len
 507         srl     $blk_init, 4, $blk_init
 508         sub     $len, 1, $len
 509         add     $blk_init, 1, $blk_init
 510
 511 .L${bits}_cbc_dec_blk_loop2x:
 512         ldx             [$inp + 0], %o0
 513         ldx             [$inp + 8], %o1
 514         ldx             [$inp + 16], %o2
 515         brz,pt          $ileft, 5f
 516         ldx             [$inp + 24], %o3
 517
 518         ldx             [$inp + 32], %o4
 519         sllx            %o0, $ileft, %o0
 520         srlx            %o1, $iright, %g1
 521         or              %g1, %o0, %o0
 522         sllx            %o1, $ileft, %o1
 523         srlx            %o2, $iright, %g1
 524         or              %g1, %o1, %o1
 525         sllx            %o2, $ileft, %o2
 526         srlx            %o3, $iright, %g1
 527         or              %g1, %o2, %o2
 528         sllx            %o3, $ileft, %o3
 529         srlx            %o4, $iright, %o4
 530         or              %o4, %o3, %o3
 531 5:
 532         xor             %g4, %o0, %o4           ! ^= rk[0]
 533         xor             %g5, %o1, %o5
 534         movxtod         %o4, %f0
 535         movxtod         %o5, %f2
 536         xor             %g4, %o2, %o4
 537         xor             %g5, %o3, %o5
 538         movxtod         %o4, %f4
 539         movxtod         %o5, %f6
 540
 541         prefetch        [$inp + 32+63], 20
 542         call            _${alg}${bits}_decrypt_2x
 543         add             $inp, 32, $inp
 544         subcc           $len, 2, $len
 545
 546         movxtod         %o0, %f8
 547         movxtod         %o1, %f10
 548         fxor            %f12, %f0, %f0          ! ^= ivec
 549         fxor            %f14, %f2, %f2
 550         movxtod         %o2, %f12
 551         movxtod         %o3, %f14
 552         fxor            %f8, %f4, %f4
 553         fxor            %f10, %f6, %f6
 554
 555         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 556         add             $out, 8, $out
 557         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 558         add             $out, 8, $out
 559         stda            %f4, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 560         add             $out, 8, $out
 561         stda            %f6, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 562         bgu,pt          $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
 563         add             $out, 8, $out
 564
 565         add             $blk_init, $len, $len
 566         andcc           $len, 1, %g0            ! is number of blocks even?
 567         membar          #StoreLoad|#StoreStore
 568         bnz,pt          %icc, .L${bits}_cbc_dec_loop
 569         srl             $len, 0, $len
 570         brnz,pn         $len, .L${bits}_cbc_dec_loop2x
 571         nop
 572 ___
 573 $::code.=<<___ if ($::evp);
 574         st              %f0, [$ivec + 0]
 575         st              %f1, [$ivec + 4]
 576         st              %f2, [$ivec + 8]
 577         st              %f3, [$ivec + 12]
 578 ___
 579 $::code.=<<___ if (!$::evp);
 580         brnz,pn         $ivoff, 3b
 581         nop
 582
 583         std             %f0, [$ivec + 0]        ! write out ivec
 584         std             %f2, [$ivec + 8]
 585 ___
 586 $::code.=<<___;
 587         ret
 588         restore
 589 .type   ${alg}${bits}_t4_cbc_decrypt,#function
 590 .size   ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
 591 ___
 592 }
 593
 594 sub alg_ctr32_implement {
 595 my ($alg,$bits) = @_;
 596
 597 $::code.=<<___;
 598 .globl  ${alg}${bits}_t4_ctr32_encrypt
 599 .align  32
 600 ${alg}${bits}_t4_ctr32_encrypt:
 601         save            %sp, -$::frame, %sp
 602
 603         prefetch        [$inp], 20
 604         prefetch        [$inp + 63], 20
 605         call            _${alg}${bits}_load_enckey
 606         sllx            $len, 4, $len
 607
 608         ld              [$ivec + 0], %l4        ! counter
 609         ld              [$ivec + 4], %l5
 610         ld              [$ivec + 8], %l6
 611         ld              [$ivec + 12], %l7
 612
 613         sllx            %l4, 32, %o5
 614         or              %l5, %o5, %o5
 615         sllx            %l6, 32, %g1
 616         xor             %o5, %g4, %g4           ! ^= rk[0]
 617         xor             %g1, %g5, %g5
 618         movxtod         %g4, %f14               ! most significant 64 bits
 619
 620         sub             $inp, $out, $blk_init   ! $inp!=$out
 621         and             $inp, 7, $ileft
 622         andn            $inp, 7, $inp
 623         sll             $ileft, 3, $ileft
 624         mov             64, $iright
 625         mov             0xff, $omask
 626         sub             $iright, $ileft, $iright
 627         and             $out, 7, $ooff
 628         cmp             $len, 255
 629         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
 630         movleu          $::size_t_cc, 0, $blk_init      !       $len<256 ||
 631         brnz,pn         $blk_init, .L${bits}_ctr32_blk  !       $inp==$out)
 632         srl             $omask, $ooff, $omask
 633
 634         andcc           $len, 16, %g0           ! is number of blocks even?
 635         alignaddrl      $out, %g0, $out
 636         bz              %icc, .L${bits}_ctr32_loop2x
 637         srlx            $len, 4, $len
 638 .L${bits}_ctr32_loop:
 639         ldx             [$inp + 0], %o0
 640         brz,pt          $ileft, 4f
 641         ldx             [$inp + 8], %o1
 642
 643         ldx             [$inp + 16], %o2
 644         sllx            %o0, $ileft, %o0
 645         srlx            %o1, $iright, %g1
 646         sllx            %o1, $ileft, %o1
 647         or              %g1, %o0, %o0
 648         srlx            %o2, $iright, %o2
 649         or              %o2, %o1, %o1
 650 4:
 651         xor             %g5, %l7, %g1           ! ^= rk[0]
 652         add             %l7, 1, %l7
 653         movxtod         %g1, %f2
 654         srl             %l7, 0, %l7             ! clruw
 655         prefetch        [$out + 63], 22
 656         prefetch        [$inp + 16+63], 20
 657 ___
 658 $::code.=<<___ if ($alg eq "aes");
 659         aes_eround01    %f16, %f14, %f2, %f4
 660         aes_eround23    %f18, %f14, %f2, %f2
 661 ___
 662 $::code.=<<___ if ($alg eq "cmll");
 663         camellia_f      %f16, %f2, %f14, %f2
 664         camellia_f      %f18, %f14, %f2, %f0
 665 ___
 666 $::code.=<<___;
 667         call            _${alg}${bits}_encrypt_1x+8
 668         add             $inp, 16, $inp
 669
 670         movxtod         %o0, %f10
 671         movxtod         %o1, %f12
 672         fxor            %f10, %f0, %f0          ! ^= inp
 673         fxor            %f12, %f2, %f2
 674
 675         brnz,pn         $ooff, 2f
 676         sub             $len, 1, $len
 677
 678         std             %f0, [$out + 0]
 679         std             %f2, [$out + 8]
 680         brnz,pt         $len, .L${bits}_ctr32_loop2x
 681         add             $out, 16, $out
 682
 683         ret
 684         restore
 685
 686 .align  16
 687 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 688                                                 ! and ~3x deterioration
 689                                                 ! in inp==out case
 690         faligndata      %f0, %f0, %f4           ! handle unaligned output
 691         faligndata      %f0, %f2, %f6
 692         faligndata      %f2, %f2, %f8
 693         stda            %f4, [$out + $omask]0xc0        ! partial store
 694         std             %f6, [$out + 8]
 695         add             $out, 16, $out
 696         orn             %g0, $omask, $omask
 697         stda            %f8, [$out + $omask]0xc0        ! partial store
 698
 699         brnz,pt         $len, .L${bits}_ctr32_loop2x+4
 700         orn             %g0, $omask, $omask
 701
 702         ret
 703         restore
 704
 705 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 706 .align  32
 707 .L${bits}_ctr32_loop2x:
 708         ldx             [$inp + 0], %o0
 709         ldx             [$inp + 8], %o1
 710         ldx             [$inp + 16], %o2
 711         brz,pt          $ileft, 4f
 712         ldx             [$inp + 24], %o3
 713
 714         ldx             [$inp + 32], %o4
 715         sllx            %o0, $ileft, %o0
 716         srlx            %o1, $iright, %g1
 717         or              %g1, %o0, %o0
 718         sllx            %o1, $ileft, %o1
 719         srlx            %o2, $iright, %g1
 720         or              %g1, %o1, %o1
 721         sllx            %o2, $ileft, %o2
 722         srlx            %o3, $iright, %g1
 723         or              %g1, %o2, %o2
 724         sllx            %o3, $ileft, %o3
 725         srlx            %o4, $iright, %o4
 726         or              %o4, %o3, %o3
 727 4:
 728         xor             %g5, %l7, %g1           ! ^= rk[0]
 729         add             %l7, 1, %l7
 730         movxtod         %g1, %f2
 731         srl             %l7, 0, %l7             ! clruw
 732         xor             %g5, %l7, %g1
 733         add             %l7, 1, %l7
 734         movxtod         %g1, %f6
 735         srl             %l7, 0, %l7             ! clruw
 736         prefetch        [$out + 63], 22
 737         prefetch        [$inp + 32+63], 20
 738 ___
 739 $::code.=<<___ if ($alg eq "aes");
 740         aes_eround01    %f16, %f14, %f2, %f8
 741         aes_eround23    %f18, %f14, %f2, %f2
 742         aes_eround01    %f16, %f14, %f6, %f10
 743         aes_eround23    %f18, %f14, %f6, %f6
 744 ___
 745 $::code.=<<___ if ($alg eq "cmll");
 746         camellia_f      %f16, %f2, %f14, %f2
 747         camellia_f      %f16, %f6, %f14, %f6
 748         camellia_f      %f18, %f14, %f2, %f0
 749         camellia_f      %f18, %f14, %f6, %f4
 750 ___
 751 $::code.=<<___;
 752         call            _${alg}${bits}_encrypt_2x+16
 753         add             $inp, 32, $inp
 754
 755         movxtod         %o0, %f8
 756         movxtod         %o1, %f10
 757         movxtod         %o2, %f12
 758         fxor            %f8, %f0, %f0           ! ^= inp
 759         movxtod         %o3, %f8
 760         fxor            %f10, %f2, %f2
 761         fxor            %f12, %f4, %f4
 762         fxor            %f8, %f6, %f6
 763
 764         brnz,pn         $ooff, 2f
 765         sub             $len, 2, $len
 766
 767         std             %f0, [$out + 0]
 768         std             %f2, [$out + 8]
 769         std             %f4, [$out + 16]
 770         std             %f6, [$out + 24]
 771         brnz,pt         $len, .L${bits}_ctr32_loop2x
 772         add             $out, 32, $out
 773
 774         ret
 775         restore
 776
 777 .align  16
 778 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 779                                                 ! and ~3x deterioration
 780                                                 ! in inp==out case
 781         faligndata      %f0, %f0, %f8           ! handle unaligned output
 782         faligndata      %f0, %f2, %f0
 783         faligndata      %f2, %f4, %f2
 784         faligndata      %f4, %f6, %f4
 785         faligndata      %f6, %f6, %f6
 786
 787         stda            %f8, [$out + $omask]0xc0        ! partial store
 788         std             %f0, [$out + 8]
 789         std             %f2, [$out + 16]
 790         std             %f4, [$out + 24]
 791         add             $out, 32, $out
 792         orn             %g0, $omask, $omask
 793         stda            %f6, [$out + $omask]0xc0        ! partial store
 794
 795         brnz,pt         $len, .L${bits}_ctr32_loop2x+4
 796         orn             %g0, $omask, $omask
 797
 798         ret
 799         restore
 800
 801 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 802 .align  32
 803 .L${bits}_ctr32_blk:
 804         add     $out, $len, $blk_init
 805         and     $blk_init, 63, $blk_init        ! tail
 806         sub     $len, $blk_init, $len
 807         add     $blk_init, 15, $blk_init        ! round up to 16n
 808         srlx    $len, 4, $len
 809         srl     $blk_init, 4, $blk_init
 810         sub     $len, 1, $len
 811         add     $blk_init, 1, $blk_init
 812
 813 .L${bits}_ctr32_blk_loop2x:
 814         ldx             [$inp + 0], %o0
 815         ldx             [$inp + 8], %o1
 816         ldx             [$inp + 16], %o2
 817         brz,pt          $ileft, 5f
 818         ldx             [$inp + 24], %o3
 819
 820         ldx             [$inp + 32], %o4
 821         sllx            %o0, $ileft, %o0
 822         srlx            %o1, $iright, %g1
 823         or              %g1, %o0, %o0
 824         sllx            %o1, $ileft, %o1
 825         srlx            %o2, $iright, %g1
 826         or              %g1, %o1, %o1
 827         sllx            %o2, $ileft, %o2
 828         srlx            %o3, $iright, %g1
 829         or              %g1, %o2, %o2
 830         sllx            %o3, $ileft, %o3
 831         srlx            %o4, $iright, %o4
 832         or              %o4, %o3, %o3
 833 5:
 834         xor             %g5, %l7, %g1           ! ^= rk[0]
 835         add             %l7, 1, %l7
 836         movxtod         %g1, %f2
 837         srl             %l7, 0, %l7             ! clruw
 838         xor             %g5, %l7, %g1
 839         add             %l7, 1, %l7
 840         movxtod         %g1, %f6
 841         srl             %l7, 0, %l7             ! clruw
 842         prefetch        [$inp + 32+63], 20
 843 ___
 844 $::code.=<<___ if ($alg eq "aes");
 845         aes_eround01    %f16, %f14, %f2, %f8
 846         aes_eround23    %f18, %f14, %f2, %f2
 847         aes_eround01    %f16, %f14, %f6, %f10
 848         aes_eround23    %f18, %f14, %f6, %f6
 849 ___
 850 $::code.=<<___ if ($alg eq "cmll");
 851         camellia_f      %f16, %f2, %f14, %f2
 852         camellia_f      %f16, %f6, %f14, %f6
 853         camellia_f      %f18, %f14, %f2, %f0
 854         camellia_f      %f18, %f14, %f6, %f4
 855 ___
 856 $::code.=<<___;
 857         call            _${alg}${bits}_encrypt_2x+16
 858         add             $inp, 32, $inp
 859         subcc           $len, 2, $len
 860
 861         movxtod         %o0, %f8
 862         movxtod         %o1, %f10
 863         movxtod         %o2, %f12
 864         fxor            %f8, %f0, %f0           ! ^= inp
 865         movxtod         %o3, %f8
 866         fxor            %f10, %f2, %f2
 867         fxor            %f12, %f4, %f4
 868         fxor            %f8, %f6, %f6
 869
 870         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 871         add             $out, 8, $out
 872         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 873         add             $out, 8, $out
 874         stda            %f4, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 875         add             $out, 8, $out
 876         stda            %f6, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 877         bgu,pt          $::size_t_cc, .L${bits}_ctr32_blk_loop2x
 878         add             $out, 8, $out
 879
 880         add             $blk_init, $len, $len
 881         andcc           $len, 1, %g0            ! is number of blocks even?
 882         membar          #StoreLoad|#StoreStore
 883         bnz,pt          %icc, .L${bits}_ctr32_loop
 884         srl             $len, 0, $len
 885         brnz,pn         $len, .L${bits}_ctr32_loop2x
 886         nop
 887
 888         ret
 889         restore
 890 .type   ${alg}${bits}_t4_ctr32_encrypt,#function
 891 .size   ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
 892 ___
 893 }
 894
 895 # Purpose of these subroutines is to explicitly encode VIS instructions,
 896 # so that one can compile the module without having to specify VIS
 897 # extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 898 # Idea is to reserve for option to produce "universal" binary and let
 899 # programmer detect if current CPU is VIS capable at run-time.
 900 sub unvis {
 901 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 902 my ($ref,$opf);
 903 my %visopf = (  "faligndata"    => 0x048,
 904                 "fnot2"         => 0x066,
 905                 "fxor"          => 0x06c,
 906                 "fsrc2"         => 0x078        );
 907
 908     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 909
 910     if ($opf=$visopf{$mnemonic}) {
 911         foreach ($rs1,$rs2,$rd) {
 912             return $ref if (!/%f([0-9]{1,2})/);
 913             $_=$1;
 914             if ($1>=32) {
 915                 return $ref if ($1&1);
 916                 # re-encode for upper double register addressing
 917                 $_=($1|$1>>5)&31;
 918             }
 919         }
 920
 921         return  sprintf ".word\t0x%08x !%s",
 922                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 923                         $ref;
 924     } else {
 925         return $ref;
 926     }
 927 }
 928 sub unalignaddr {
 929 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 930 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 931 my $ref = "$mnemonic\t$rs1,$rs2,$rd";
 932 my $opf = $mnemonic =~ /l$/ ? 0x01a :0x18;
 933
 934     foreach ($rs1,$rs2,$rd) {
 935         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
 936         else                    { return $ref; }
 937     }
 938     return  sprintf ".word\t0x%08x !%s",
 939                     0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 940                     $ref;
 941 }
 942
 943 sub unaes_round {       # 4-argument instructions
 944 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
 945 my ($ref,$opf);
 946 my %aesopf = (  "aes_eround01"  => 0,
 947                 "aes_eround23"  => 1,
 948                 "aes_dround01"  => 2,
 949                 "aes_dround23"  => 3,
 950                 "aes_eround01_l"=> 4,
 951                 "aes_eround23_l"=> 5,
 952                 "aes_dround01_l"=> 6,
 953                 "aes_dround23_l"=> 7,
 954                 "aes_kexpand1"  => 8    );
 955
 956     $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
 957
 958     if (defined($opf=$aesopf{$mnemonic})) {
 959         $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
 960         foreach ($rs1,$rs2,$rd) {
 961             return $ref if (!/%f([0-9]{1,2})/);
 962             $_=$1;
 963             if ($1>=32) {
 964                 return $ref if ($1&1);
 965                 # re-encode for upper double register addressing
 966                 $_=($1|$1>>5)&31;
 967             }
 968         }
 969
 970         return  sprintf ".word\t0x%08x !%s",
 971                         2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
 972                         $ref;
 973     } else {
 974         return $ref;
 975     }
 976 }
 977
 978 sub unaes_kexpand {     # 3-argument instructions
 979 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 980 my ($ref,$opf);
 981 my %aesopf = (  "aes_kexpand0"  => 0x130,
 982                 "aes_kexpand2"  => 0x131        );
 983
 984     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 985
 986     if (defined($opf=$aesopf{$mnemonic})) {
 987         foreach ($rs1,$rs2,$rd) {
 988             return $ref if (!/%f([0-9]{1,2})/);
 989             $_=$1;
 990             if ($1>=32) {
 991                 return $ref if ($1&1);
 992                 # re-encode for upper double register addressing
 993                 $_=($1|$1>>5)&31;
 994             }
 995         }
 996
 997         return  sprintf ".word\t0x%08x !%s",
 998                         2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
 999                         $ref;
1000     } else {
1001         return $ref;
1002     }
1003 }
1004
# Encode a 4-operand instruction (camellia_f, going by the sub's name)
# as a raw .word directive, keeping the readable form as a trailing "!"
# comment.
# The mnemonic is only used for the text; the opcode field is fixed.
#
#   $rs1,$rs2,$rd   %fN registers (numbers 32 and up must be even)
#   $rs3            an even-numbered %fN register, or a plain number in
#                   decimal or 0x-prefixed hex
#
# Returns the plain instruction text whenever an operand cannot be
# encoded, leaving it for the assembler to handle.
sub uncamellia_f {      # 4-argument instructions
    my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
    my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # The match is anchored on purpose: unanchored, "%f21" would be
    # taken for %f2, and other non-matching strings would numify to 0
    # in the shift below and be encoded as register 0.
    if ($rs3 =~ /^%f([0-6]*[02468])$/) {
        $rs3 = ($1|$1>>5)&31;
    } elsif ($rs3 =~ /^0x[0-9a-f]+$/i) {
        # a "0x..." string numifies to 0 in Perl, convert it explicitly
        $rs3 = hex($rs3);
    } elsif ($rs3 !~ /^[0-9]+$/) {
        return $ref;
    }

    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_=$1;
        if ($1>=32) {
            return $ref if ($1&1);
            # re-encode for upper double register addressing
            $_=($1|$1>>5)&31;
        }
    }

    return  sprintf ".word\t0x%08x !%s",
                    2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                    $ref;
}
1030
# Encode the 3-operand Camellia instructions listed in the table below
# as a .word directive followed by the original text as a comment.
# Unknown mnemonics and operands that cannot be encoded are returned as
# plain instruction text.
sub uncamellia3 {       # 3-argument instructions
    my ($mnemonic,$rs1,$rs2,$rd) = @_;
    my $text = "$mnemonic\t$rs1,$rs2,$rd";
    my %opf_for = (
        "camellia_fl"  => 0x13c,
        "camellia_fli" => 0x13d,
    );

    my $opf = $opf_for{$mnemonic};
    return $text unless defined $opf;

    my @field;
    for my $operand ($rs1,$rs2,$rd) {
        return $text unless $operand =~ /%f([0-9]{1,2})/;
        my $num = $1;
        if ($num >= 32) {
            return $text if $num & 1;
            # re-encode for upper double register addressing
            $num = ($num | $num>>5) & 31;
        }
        push @field, $num;
    }
    my ($src1,$src2,$dst) = @field;

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $dst<<25 | 0x36<<19 | $src1<<14 | $opf<<5 | $src2,
                   $text;
}
1057
# Encode the 2-operand register move instructions listed in the table
# below as a .word directive followed by the original text as a comment.
# Each operand may be a %fN register or an integer register (%g, %o, %l,
# %i).  Unknown mnemonics and operands that cannot be encoded are
# returned as plain instruction text.
sub unmovxtox {         # 2-argument instructions
    my ($mnemonic,$rs,$rd) = @_;
    my $text = "$mnemonic\t$rs,$rd";

    # offset of each register group within the 5-bit register number
    my %base_of = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
    my %opf_for = (
        "movdtox"  => 0x110,
        "movstouw" => 0x111,
        "movstosw" => 0x113,
        "movxtod"  => 0x118,
        "movwtos"  => 0x119,
    );

    my $opf = $opf_for{$mnemonic};
    return $text unless defined $opf;

    my @field;
    for my $operand ($rs,$rd) {
        return $text unless $operand =~ /%([fgoli])([0-9]{1,2})/;
        my ($group,$num) = ($1,$2);
        my $code = $base_of{$group} + $num;
        if ($num >= 32) {
            return $text if $num & 1;
            # re-encode for upper double register addressing
            $code = ($num | $num>>5) & 31;
        }
        push @field, $code;
    }
    my ($src,$dst) = @field;

    return sprintf ".word\t0x%08x !%s",
                   2<<30 | $dst<<25 | 0x36<<19 | $opf<<5 | $src,
                   $text;
}
1088
# Post-process the accumulated $::code line by line and print each
# resulting line.  For every line:
#   1. backtick-quoted expressions are replaced by their eval-ed value;
#   2. two-operand "f...2" forms at the end of a line get an explicit
#      %f0 inserted as first source operand;
#   3. the rules below are tried in order, and the first one that
#      substitutes anything ends the search for that line.
sub emit_assembler {
    # [ pattern, encoder ] pairs.  Order matters: several patterns can
    # match the same instruction (e.g. the generic aes_ pattern precedes
    # the aes_kexpand[02] one), and only the first hit is applied.
    my @rules = (
        [ qr/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/,
          \&unaes_round ],
        [ qr/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/,
          \&unaes_kexpand ],
        [ qr/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/,
          \&uncamellia_f ],
        [ qr/\b(camellia_[^s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/,
          \&uncamellia3 ],
        [ qr/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/,
          \&unmovxtox ],
        [ qr/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/,
          \&unmovxtox ],
        [ qr/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/,
          \&unvis ],
        [ qr/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/,
          \&unalignaddr ],
    );

    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/g;

        foreach my $rule (@rules) {
            my ($pattern,$encoder) = @$rule;
            # Encoders take only as many leading arguments as their
            # pattern has groups; surplus (undefined) ones are ignored.
            last if s/$pattern/$encoder->($1,$2,$3,$4,$5)/ge;
        }

        print $_,"\n";
    }
}
1123
1124 1;