crypto/perlasm/sparcv9_modes.pl

   1 #!/usr/bin/env perl
   2
   3 # Specific modes implementations for SPARC Architecture 2011. There
   4 # is T4 dependency though, an ASI value that is not specified in the
   5 # Architecture Manual. But as SPARC universe is rather monocultural,
   6 # we imply that processor capable of executing crypto instructions
   7 # can handle the ASI in question as well. This means that we ought to
   8 # keep eyes open when new processors emerge...
   9 #
  10 # As for above mentioned ASI. It's so called "block initializing
  11 # store" which cancels "read" in "read-update-write" on cache lines.
  12 # This is "cooperative" optimization, as it reduces overall pressure
  13 # on memory interface. Benefits can't be observed/quantified with
  14 # usual benchmarks, on the contrary you can notice that single-thread
  15 # performance for parallelizable modes is ~1.5% worse. Special note
  16 # about commented 'membar' instructions, otherwise recommended by
  17 # manual. The rationale is as follows. Memory view is consistent from
  18 # viewpoint of processor executing the code even when ASI in question
  19 # is used. If thread on another processor has to access the result,
  20 # its availability would have to be mediated and it can be done only
  21 # through a synchronization operation which would require ... 'membar'.
  22 # All this based on suggestions from David Miller.
  23
  24 my ($inp,$out,$len,$key,$ivec,$enc) = map { '%i' . $_ } (0 .. 5);   # names for %i0..%i5
  25 my ($ileft,$iright,$ooff,$omask,$ivoff,$blk_init) = map { '%l' . $_ } (0 .. 7);   # names for %l0..%l5; the %l6/%l7 strings are produced but not bound to a name
  26
  27 sub alg_cbc_encrypt_implement {
     # Append to $::code the assembly text of the routine
     # ${alg}${bits}_t4_cbc_encrypt (CBC-mode encryption, one block per
     # iteration because each block depends on the previous ciphertext).
     #
     #   $alg  - prefix spliced into the emitted symbol and into the helper
     #           names _${alg}${bits}_load_enckey / _${alg}${bits}_encrypt_1x
     #   $bits - key-size tag, spliced into the same symbols and local labels
     #
     # Reads the globals $::frame (stack frame size), $::evp (selects how the
     # ivec is loaded/stored) and $::size_t_cc (condition-code register used
     # for the length comparison).  There is no explicit return value; the
     # text appended to $::code is the result.
     #
     # Register usage in the emitted code: %f0/%f2 carry the chaining value
     # (ivec on entry, latest ciphertext block afterwards).
  28 my ($alg,$bits) = @_;
  29
     # Prologue: new register window; $blk_init = $inp - $out, i.e. non-zero
     # only when input and output buffers differ.
  30 $::code.=<<___;
  31 .globl  ${alg}${bits}_t4_cbc_encrypt
  32 .align  32
  33 ${alg}${bits}_t4_cbc_encrypt:
  34         save            %sp, -$::frame, %sp
  35         sub             $inp, $out, $blk_init   ! $inp!=$out
  36 ___
     # Non-EVP flavour: ivec may be unaligned.  $ivoff keeps ($ivec & 7) for
     # the write-back; when it is non-zero a third doubleword is loaded and
     # faligndata realigns the 16 bytes into %f0/%f2.
  37 $::code.=<<___ if (!$::evp);
  38         andcc           $ivec, 7, $ivoff
  39         alignaddr       $ivec, %g0, $ivec
  40
  41         ldd             [$ivec + 0], %f0        ! load ivec
  42         bz,pt           %icc, 1f
  43         ldd             [$ivec + 8], %f2
  44         ldd             [$ivec + 16], %f4
  45         faligndata      %f0, %f2, %f0
  46         faligndata      %f2, %f4, %f2
  47 1:
  48 ___
     # EVP flavour: ivec is read with four 32-bit loads into %f0-%f3.
  49 $::code.=<<___ if ($::evp);
  50         ld              [$ivec + 0], %f0
  51         ld              [$ivec + 4], %f1
  52         ld              [$ivec + 8], %f2
  53         ld              [$ivec + 12], %f3
  54 ___
     # Common setup and the ordinary loop:
     #  - load the encryption key schedule;
     #  - split $inp into an 8-byte-aligned pointer and a bit offset
     #    ($ileft, with $iright = 64 - $ileft) used to realign input by shifts;
     #  - $ooff = $out & 7 and $omask = 0xff >> $ooff drive the partial stores
     #    used for unaligned output;
     #  - $blk_init is forced to 0 when $out is unaligned or $len <= 127, so
     #    the block-initializing-store path (.L${bits}cbc_enc_blk) is taken
     #    only for aligned output, $len >= 128 and $inp != $out;
     #  - otherwise $len is converted to a block count and the loop encrypts
     #    (input ^ rk[0] ^ chaining value) one block at a time.
     # Note that on SPARC the instruction following a call/branch occupies its
     # delay slot, which is why useful work follows each of them.
  55 $::code.=<<___;
  56         prefetch        [$inp], 20
  57         prefetch        [$inp + 63], 20
  58         call            _${alg}${bits}_load_enckey
  59         and             $inp, 7, $ileft
  60         andn            $inp, 7, $inp
  61         sll             $ileft, 3, $ileft
  62         mov             64, $iright
  63         mov             0xff, $omask
  64         sub             $iright, $ileft, $iright
  65         and             $out, 7, $ooff
  66         cmp             $len, 127
  67         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
  68         movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
  69         brnz,pn         $blk_init, .L${bits}cbc_enc_blk !       $inp==$out)
  70         srl             $omask, $ooff, $omask
  71
  72         alignaddrl      $out, %g0, $out
  73         srlx            $len, 4, $len
  74         prefetch        [$out], 22
  75
  76 .L${bits}_cbc_enc_loop:
  77         ldx             [$inp + 0], %o0
  78         brz,pt          $ileft, 4f
  79         ldx             [$inp + 8], %o1
  80
  81         ldx             [$inp + 16], %o2
  82         sllx            %o0, $ileft, %o0
  83         srlx            %o1, $iright, %g1
  84         sllx            %o1, $ileft, %o1
  85         or              %g1, %o0, %o0
  86         srlx            %o2, $iright, %o2
  87         or              %o2, %o1, %o1
  88 4:
  89         xor             %g4, %o0, %o0           ! ^= rk[0]
  90         xor             %g5, %o1, %o1
  91         movxtod         %o0, %f12
  92         movxtod         %o1, %f14
  93
  94         fxor            %f12, %f0, %f0          ! ^= ivec
  95         fxor            %f14, %f2, %f2
  96         prefetch        [$out + 63], 22
  97         prefetch        [$inp + 16+63], 20
  98         call            _${alg}${bits}_encrypt_1x
  99         add             $inp, 16, $inp
 100
 101         brnz,pn         $ooff, 2f
 102         sub             $len, 1, $len
 103
 104         std             %f0, [$out + 0]
 105         std             %f2, [$out + 8]
 106         brnz,pt         $len, .L${bits}_cbc_enc_loop
 107         add             $out, 16, $out
 108 ___
     # Aligned-output exit, EVP flavour: write the chaining value back to ivec.
 109 $::code.=<<___ if ($::evp);
 110         st              %f0, [$ivec + 0]
 111         st              %f1, [$ivec + 4]
 112         st              %f2, [$ivec + 8]
 113         st              %f3, [$ivec + 12]
 114 ___
     # Aligned-output exit, non-EVP flavour: an unaligned ivec is handled at
     # local label 3 (emitted further down); otherwise store it directly.
 115 $::code.=<<___ if (!$::evp);
 116         brnz,pn         $ivoff, 3f
 117         nop
 118
 119         std             %f0, [$ivec + 0]        ! write out ivec
 120         std             %f2, [$ivec + 8]
 121 ___
     # Return, followed by the out-of-line store sequence (label 2) used when
     # $out is unaligned: the block is rotated with faligndata and written as
     # partial store / full store / partial store, flipping $omask between the
     # head and tail pieces.  The loop is re-entered at +4 because the first
     # ldx of the loop has already been replaced by the ldxa here.
 122 $::code.=<<___;
 123         ret
 124         restore
 125
 126 .align  16
 127 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 128                                                 ! and ~3x deterioration
 129                                                 ! in inp==out case
 130         faligndata      %f0, %f0, %f4           ! handle unaligned output
 131         faligndata      %f0, %f2, %f6
 132         faligndata      %f2, %f2, %f8
 133
 134         stda            %f4, [$out + $omask]0xc0        ! partial store
 135         std             %f6, [$out + 8]
 136         add             $out, 16, $out
 137         orn             %g0, $omask, $omask
 138         stda            %f8, [$out + $omask]0xc0        ! partial store
 139
 140         brnz,pt         $len, .L${bits}_cbc_enc_loop+4
 141         orn             %g0, $omask, $omask
 142 ___
     # Unaligned-output exit, EVP flavour: write the chaining value to ivec.
 143 $::code.=<<___ if ($::evp);
 144         st              %f0, [$ivec + 0]
 145         st              %f1, [$ivec + 4]
 146         st              %f2, [$ivec + 8]
 147         st              %f3, [$ivec + 12]
 148 ___
     # Unaligned-output exit, non-EVP flavour, plus the shared handler at
     # label 3 that writes %f0/%f2 to an unaligned ivec using the same
     # faligndata + partial-store technique ($omask is rebuilt from $ivoff).
 149 $::code.=<<___ if (!$::evp);
 150         brnz,pn         $ivoff, 3f
 151         nop
 152
 153         std             %f0, [$ivec + 0]        ! write out ivec
 154         std             %f2, [$ivec + 8]
 155         ret
 156         restore
 157
 158 .align  16
 159 3:      alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
 160         mov             0xff, $omask
 161         srl             $omask, $ivoff, $omask
 162         faligndata      %f0, %f0, %f4
 163         faligndata      %f0, %f2, %f6
 164         faligndata      %f2, %f2, %f8
 165         stda            %f4, [$ivec + $omask]0xc0
 166         std             %f6, [$ivec + 8]
 167         add             $ivec, 16, $ivec
 168         orn             %g0, $omask, $omask
 169         stda            %f8, [$ivec + $omask]0xc0
 170 ___
     # Return, then the block-initializing-store path.  $blk_init becomes the
     # number of 16-byte blocks in the tail (($out + $len) & 63, rounded up)
     # and $len the number of blocks before that tail.  The leading blocks are
     # written with ASI 0xf2 stores; any tail blocks are then handed to the
     # ordinary loop above with $len = $blk_init.
 171 $::code.=<<___;
 172         ret
 173         restore
 174
 175 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 176 .align  32
 177 .L${bits}cbc_enc_blk:
 178         add     $out, $len, $blk_init
 179         and     $blk_init, 63, $blk_init        ! tail
 180         sub     $len, $blk_init, $len
 181         add     $blk_init, 15, $blk_init        ! round up to 16n
 182         srlx    $len, 4, $len
 183         srl     $blk_init, 4, $blk_init
 184
 185 .L${bits}_cbc_enc_blk_loop:
 186         ldx             [$inp + 0], %o0
 187         brz,pt          $ileft, 5f
 188         ldx             [$inp + 8], %o1
 189
 190         ldx             [$inp + 16], %o2
 191         sllx            %o0, $ileft, %o0
 192         srlx            %o1, $iright, %g1
 193         sllx            %o1, $ileft, %o1
 194         or              %g1, %o0, %o0
 195         srlx            %o2, $iright, %o2
 196         or              %o2, %o1, %o1
 197 5:
 198         xor             %g4, %o0, %o0           ! ^= rk[0]
 199         xor             %g5, %o1, %o1
 200         movxtod         %o0, %f12
 201         movxtod         %o1, %f14
 202
 203         fxor            %f12, %f0, %f0          ! ^= ivec
 204         fxor            %f14, %f2, %f2
 205         prefetch        [$inp + 16+63], 20
 206         call            _${alg}${bits}_encrypt_1x
 207         add             $inp, 16, $inp
 208         sub             $len, 1, $len
 209
 210         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 211         add             $out, 8, $out
 212         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 213         brnz,pt         $len, .L${bits}_cbc_enc_blk_loop
 214         add             $out, 8, $out
 215
 216         !membar         0x0f
 217         brnz,pt         $blk_init, .L${bits}_cbc_enc_loop
 218         mov             $blk_init, $len
 219 ___
     # No tail left after the block-store loop, EVP flavour: write ivec.
 220 $::code.=<<___ if ($::evp);
 221         st              %f0, [$ivec + 0]
 222         st              %f1, [$ivec + 4]
 223         st              %f2, [$ivec + 8]
 224         st              %f3, [$ivec + 12]
 225 ___
     # Same exit, non-EVP flavour: "3b" is the unaligned-ivec handler emitted
     # earlier in this routine.
 226 $::code.=<<___ if (!$::evp);
 227         brnz,pn         $ivoff, 3b
 228         nop
 229
 230         std             %f0, [$ivec + 0]        ! write out ivec
 231         std             %f2, [$ivec + 8]
 232 ___
     # Final return and ELF symbol type/size directives.
 233 $::code.=<<___;
 234         ret
 235         restore
 236 .type   ${alg}${bits}_t4_cbc_encrypt,#function
 237 .size   ${alg}${bits}_t4_cbc_encrypt,.-${alg}${bits}_t4_cbc_encrypt
 238 ___
 239 }
 240
 241 sub alg_cbc_decrypt_implement {
     # Append to $::code the assembly text of the routine
     # ${alg}${bits}_t4_cbc_decrypt (CBC-mode decryption).
     #
     #   $alg  - prefix spliced into the emitted symbol and into the helper
     #           names _${alg}${bits}_load_deckey / _decrypt_1x / _decrypt_2x
     #   $bits - key-size tag, spliced into the same symbols and local labels
     #
     # Reads the globals $::frame, $::evp and $::size_t_cc.  There is no
     # explicit return value; the text appended to $::code is the result.
     #
     # Register usage in the emitted code: %f12/%f14 (%f12-%f15 as singles)
     # hold the chaining value, i.e. the ivec on entry and afterwards the most
     # recent ciphertext block; %f0-%f6 hold decrypted data.  Every exit must
     # therefore write %f12-%f15 back to ivec.
 242 my ($alg,$bits) = @_;
 243
     # Prologue: new register window; $blk_init = $inp - $out, i.e. non-zero
     # only when input and output buffers differ.
 244 $::code.=<<___;
 245 .globl  ${alg}${bits}_t4_cbc_decrypt
 246 .align  32
 247 ${alg}${bits}_t4_cbc_decrypt:
 248         save            %sp, -$::frame, %sp
 249         sub             $inp, $out, $blk_init   ! $inp!=$out
 250 ___
     # Non-EVP flavour: ivec may be unaligned.  $ivoff keeps ($ivec & 7) for
     # the write-back; when it is non-zero a third doubleword is loaded and
     # faligndata realigns the 16 bytes into %f12/%f14.
 251 $::code.=<<___ if (!$::evp);
 252         andcc           $ivec, 7, $ivoff
 253         alignaddr       $ivec, %g0, $ivec
 254
 255         ldd             [$ivec + 0], %f12       ! load ivec
 256         bz,pt           %icc, 1f
 257         ldd             [$ivec + 8], %f14
 258         ldd             [$ivec + 16], %f0
 259         faligndata      %f12, %f14, %f12
 260         faligndata      %f14, %f0, %f14
 261 1:
 262 ___
     # EVP flavour: ivec is read with four 32-bit loads into %f12-%f15.
 263 $::code.=<<___ if ($::evp);
 264         ld              [$ivec + 0], %f12       ! load ivec
 265         ld              [$ivec + 4], %f13
 266         ld              [$ivec + 8], %f14
 267         ld              [$ivec + 12], %f15
 268 ___
     # Common setup and the single-block loop:
     #  - load the decryption key schedule;
     #  - split $inp into an 8-byte-aligned pointer and a bit offset
     #    ($ileft, with $iright = 64 - $ileft) used to realign input by shifts;
     #  - $ooff = $out & 7 and $omask = 0xff >> $ooff drive the partial stores
     #    used for unaligned output;
     #  - $blk_init is forced to 0 when $out is unaligned or $len <= 127, so
     #    the block-initializing-store path (.L${bits}cbc_dec_blk) is taken
     #    only for aligned output, $len >= 128 and $inp != $out;
     #  - an even block count goes straight to the two-block loop; an odd
     #    count runs one pass of the single-block loop first and then
     #    continues in the two-block loop.
     # In the single-block loop the raw ciphertext stays in %o0/%o1 (the
     # rk[0]-xored copy goes to %o2/%o3) so that it can become the next
     # chaining value in %f12/%f14 after the plaintext has been produced.
 269 $::code.=<<___;
 270         prefetch        [$inp], 20
 271         prefetch        [$inp + 63], 20
 272         call            _${alg}${bits}_load_deckey
 273         and             $inp, 7, $ileft
 274         andn            $inp, 7, $inp
 275         sll             $ileft, 3, $ileft
 276         mov             64, $iright
 277         mov             0xff, $omask
 278         sub             $iright, $ileft, $iright
 279         and             $out, 7, $ooff
 280         cmp             $len, 127
 281         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
 282         movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
 283         brnz,pn         $blk_init, .L${bits}cbc_dec_blk !       $inp==$out)
 284         srl             $omask, $ooff, $omask
 285
 286         andcc           $len, 16, %g0           ! is number of blocks even?
 287         srlx            $len, 4, $len
 288         alignaddrl      $out, %g0, $out
 289         bz              %icc, .L${bits}_cbc_dec_loop2x
 290         prefetch        [$out], 22
 291 .L${bits}_cbc_dec_loop:
 292         ldx             [$inp + 0], %o0
 293         brz,pt          $ileft, 4f
 294         ldx             [$inp + 8], %o1
 295
 296         ldx             [$inp + 16], %o2
 297         sllx            %o0, $ileft, %o0
 298         srlx            %o1, $iright, %g1
 299         sllx            %o1, $ileft, %o1
 300         or              %g1, %o0, %o0
 301         srlx            %o2, $iright, %o2
 302         or              %o2, %o1, %o1
 303 4:
 304         xor             %g4, %o0, %o2           ! ^= rk[0]
 305         xor             %g5, %o1, %o3
 306         movxtod         %o2, %f0
 307         movxtod         %o3, %f2
 308
 309         prefetch        [$out + 63], 22
 310         prefetch        [$inp + 16+63], 20
 311         call            _${alg}${bits}_decrypt_1x
 312         add             $inp, 16, $inp
 313
 314         fxor            %f12, %f0, %f0          ! ^= ivec
 315         fxor            %f14, %f2, %f2
 316         movxtod         %o0, %f12
 317         movxtod         %o1, %f14
 318
 319         brnz,pn         $ooff, 2f
 320         sub             $len, 1, $len
 321
 322         std             %f0, [$out + 0]
 323         std             %f2, [$out + 8]
 324         brnz,pt         $len, .L${bits}_cbc_dec_loop2x
 325         add             $out, 16, $out
 326 ___
     # Single-block loop, aligned-output exit, EVP flavour: write ivec.
 327 $::code.=<<___ if ($::evp);
 328         st              %f12, [$ivec + 0]
 329         st              %f13, [$ivec + 4]
 330         st              %f14, [$ivec + 8]
 331         st              %f15, [$ivec + 12]
 332 ___
     # Same exit, non-EVP flavour: an unaligned ivec is handled out of line.
 333 $::code.=<<___ if (!$::evp);
 334         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 335         nop
 336
 337         std             %f12, [$ivec + 0]       ! write out ivec
 338         std             %f14, [$ivec + 8]
 339 ___
     # Return, followed by the out-of-line store sequence (label 2) for the
     # single-block loop when $out is unaligned: faligndata rotates the block,
     # which is written as partial store / full store / partial store while
     # $omask is flipped between head and tail.  The two-block loop is entered
     # at +4 because its first ldx is already covered by the ldxa here.
 340 $::code.=<<___;
 341         ret
 342         restore
 343
 344 .align  16
 345 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 346                                                 ! and ~3x deterioration
 347                                                 ! in inp==out case
 348         faligndata      %f0, %f0, %f4           ! handle unaligned output
 349         faligndata      %f0, %f2, %f6
 350         faligndata      %f2, %f2, %f8
 351
 352         stda            %f4, [$out + $omask]0xc0        ! partial store
 353         std             %f6, [$out + 8]
 354         add             $out, 16, $out
 355         orn             %g0, $omask, $omask
 356         stda            %f8, [$out + $omask]0xc0        ! partial store
 357
 358         brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
 359         orn             %g0, $omask, $omask
 360 ___
     # Single-block loop, unaligned-output exit, EVP flavour: write ivec.
 361 $::code.=<<___ if ($::evp);
 362         st              %f12, [$ivec + 0]
 363         st              %f13, [$ivec + 4]
 364         st              %f14, [$ivec + 8]
 365         st              %f15, [$ivec + 12]
 366 ___
     # Same exit, non-EVP flavour.
 367 $::code.=<<___ if (!$::evp);
 368         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 369         nop
 370
 371         std             %f12, [$ivec + 0]       ! write out ivec
 372         std             %f14, [$ivec + 8]
 373 ___
     # Return, then the two-block loop.  Four input doublewords (five when the
     # input needs realigning) are loaded; both ciphertext blocks are kept raw
     # in %o0-%o3.  After _decrypt_2x the first plaintext is xored with the old
     # chaining value, the second with the first ciphertext block (%f8/%f10),
     # and the second ciphertext block becomes the new chaining value.
 374 $::code.=<<___;
 375         ret
 376         restore
 377
 378 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 379 .align  32
 380 .L${bits}_cbc_dec_loop2x:
 381         ldx             [$inp + 0], %o0
 382         ldx             [$inp + 8], %o1
 383         ldx             [$inp + 16], %o2
 384         brz,pt          $ileft, 4f
 385         ldx             [$inp + 24], %o3
 386
 387         ldx             [$inp + 32], %o4
 388         sllx            %o0, $ileft, %o0
 389         srlx            %o1, $iright, %g1
 390         or              %g1, %o0, %o0
 391         sllx            %o1, $ileft, %o1
 392         srlx            %o2, $iright, %g1
 393         or              %g1, %o1, %o1
 394         sllx            %o2, $ileft, %o2
 395         srlx            %o3, $iright, %g1
 396         or              %g1, %o2, %o2
 397         sllx            %o3, $ileft, %o3
 398         srlx            %o4, $iright, %o4
 399         or              %o4, %o3, %o3
 400 4:
 401         xor             %g4, %o0, %o4           ! ^= rk[0]
 402         xor             %g5, %o1, %o5
 403         movxtod         %o4, %f0
 404         movxtod         %o5, %f2
 405         xor             %g4, %o2, %o4
 406         xor             %g5, %o3, %o5
 407         movxtod         %o4, %f4
 408         movxtod         %o5, %f6
 409
 410         prefetch        [$out + 63], 22
 411         prefetch        [$inp + 32+63], 20
 412         call            _${alg}${bits}_decrypt_2x
 413         add             $inp, 32, $inp
 414
 415         movxtod         %o0, %f8
 416         movxtod         %o1, %f10
 417         fxor            %f12, %f0, %f0          ! ^= ivec
 418         fxor            %f14, %f2, %f2
 419         movxtod         %o2, %f12
 420         movxtod         %o3, %f14
 421         fxor            %f8, %f4, %f4
 422         fxor            %f10, %f6, %f6
 423
 424         brnz,pn         $ooff, 2f
 425         sub             $len, 2, $len
 426
 427         std             %f0, [$out + 0]
 428         std             %f2, [$out + 8]
 429         std             %f4, [$out + 16]
 430         std             %f6, [$out + 24]
 431         brnz,pt         $len, .L${bits}_cbc_dec_loop2x
 432         add             $out, 32, $out
 433 ___
     # Two-block loop, aligned-output exit, EVP flavour: write ivec.
 434 $::code.=<<___ if ($::evp);
 435         st              %f12, [$ivec + 0]
 436         st              %f13, [$ivec + 4]
 437         st              %f14, [$ivec + 8]
 438         st              %f15, [$ivec + 12]
 439 ___
     # Same exit, non-EVP flavour.
 440 $::code.=<<___ if (!$::evp);
 441         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 442         nop
 443
 444         std             %f12, [$ivec + 0]       ! write out ivec
 445         std             %f14, [$ivec + 8]
 446 ___
     # Return, followed by the unaligned-output store sequence (label 2) of
     # the two-block loop: 32 bytes are rotated with faligndata and written as
     # partial store / three full stores / partial store.
 447 $::code.=<<___;
 448         ret
 449         restore
 450
 451 .align  16
 452 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 453                                                 ! and ~3x deterioration
 454                                                 ! in inp==out case
 455         faligndata      %f0, %f0, %f8           ! handle unaligned output
 456         faligndata      %f0, %f2, %f0
 457         faligndata      %f2, %f4, %f2
 458         faligndata      %f4, %f6, %f4
 459         faligndata      %f6, %f6, %f6
 460         stda            %f8, [$out + $omask]0xc0        ! partial store
 461         std             %f0, [$out + 8]
 462         std             %f2, [$out + 16]
 463         std             %f4, [$out + 24]
 464         add             $out, 32, $out
 465         orn             %g0, $omask, $omask
 466         stda            %f6, [$out + $omask]0xc0        ! partial store
 467
 468         brnz,pt         $len, .L${bits}_cbc_dec_loop2x+4
 469         orn             %g0, $omask, $omask
 470 ___
     # Two-block loop, unaligned-output exit, EVP flavour: write ivec.
 471 $::code.=<<___ if ($::evp);
 472         st              %f12, [$ivec + 0]
 473         st              %f13, [$ivec + 4]
 474         st              %f14, [$ivec + 8]
 475         st              %f15, [$ivec + 12]
 476 ___
     # Same exit, non-EVP flavour, plus the shared handler
     # .L${bits}_cbc_dec_unaligned_ivec, which writes %f12/%f14 to an unaligned
     # ivec with faligndata and partial stores ($omask is rebuilt from $ivoff).
 477 $::code.=<<___ if (!$::evp);
 478         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 479         nop
 480
 481         std             %f12, [$ivec + 0]       ! write out ivec
 482         std             %f14, [$ivec + 8]
 483         ret
 484         restore
 485
 486 .align  16
 487 .L${bits}_cbc_dec_unaligned_ivec:
 488         alignaddrl      $ivec, $ivoff, %g0      ! handle unaligned ivec
 489         mov             0xff, $omask
 490         srl             $omask, $ivoff, $omask
 491         faligndata      %f12, %f12, %f0
 492         faligndata      %f12, %f14, %f2
 493         faligndata      %f14, %f14, %f4
 494         stda            %f0, [$ivec + $omask]0xc0
 495         std             %f2, [$ivec + 8]
 496         add             $ivec, 16, $ivec
 497         orn             %g0, $omask, $omask
 498         stda            %f4, [$ivec + $omask]0xc0
 499 ___
     # Return, then the block-initializing-store path.  $blk_init becomes the
     # number of 16-byte blocks in the tail (($out + $len) & 63, rounded up)
     # and $len the number of blocks before it; both counters are then biased
     # by one ($len - 1, $blk_init + 1) for the subcc/bgu loop test and are
     # added back together after the loop.  Blocks are processed two at a time
     # and written with ASI 0xf2 stores.  Whatever remains is handed to the
     # ordinary loops: the single-block loop when the remainder is odd, the
     # two-block loop when it is even and non-zero.
 500 $::code.=<<___;
 501         ret
 502         restore
 503
 504 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 505 .align  32
 506 .L${bits}cbc_dec_blk:
 507         add     $out, $len, $blk_init
 508         and     $blk_init, 63, $blk_init        ! tail
 509         sub     $len, $blk_init, $len
 510         add     $blk_init, 15, $blk_init        ! round up to 16n
 511         srlx    $len, 4, $len
 512         srl     $blk_init, 4, $blk_init
 513         sub     $len, 1, $len
 514         add     $blk_init, 1, $blk_init
 515
 516 .L${bits}_cbc_dec_blk_loop2x:
 517         ldx             [$inp + 0], %o0
 518         ldx             [$inp + 8], %o1
 519         ldx             [$inp + 16], %o2
 520         brz,pt          $ileft, 5f
 521         ldx             [$inp + 24], %o3
 522
 523         ldx             [$inp + 32], %o4
 524         sllx            %o0, $ileft, %o0
 525         srlx            %o1, $iright, %g1
 526         or              %g1, %o0, %o0
 527         sllx            %o1, $ileft, %o1
 528         srlx            %o2, $iright, %g1
 529         or              %g1, %o1, %o1
 530         sllx            %o2, $ileft, %o2
 531         srlx            %o3, $iright, %g1
 532         or              %g1, %o2, %o2
 533         sllx            %o3, $ileft, %o3
 534         srlx            %o4, $iright, %o4
 535         or              %o4, %o3, %o3
 536 5:
 537         xor             %g4, %o0, %o4           ! ^= rk[0]
 538         xor             %g5, %o1, %o5
 539         movxtod         %o4, %f0
 540         movxtod         %o5, %f2
 541         xor             %g4, %o2, %o4
 542         xor             %g5, %o3, %o5
 543         movxtod         %o4, %f4
 544         movxtod         %o5, %f6
 545
 546         prefetch        [$inp + 32+63], 20
 547         call            _${alg}${bits}_decrypt_2x
 548         add             $inp, 32, $inp
 549         subcc           $len, 2, $len
 550
 551         movxtod         %o0, %f8
 552         movxtod         %o1, %f10
 553         fxor            %f12, %f0, %f0          ! ^= ivec
 554         fxor            %f14, %f2, %f2
 555         movxtod         %o2, %f12
 556         movxtod         %o3, %f14
 557         fxor            %f8, %f4, %f4
 558         fxor            %f10, %f6, %f6
 559
 560         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 561         add             $out, 8, $out
 562         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 563         add             $out, 8, $out
 564         stda            %f4, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 565         add             $out, 8, $out
 566         stda            %f6, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 567         bgu,pt          $::size_t_cc, .L${bits}_cbc_dec_blk_loop2x
 568         add             $out, 8, $out
 569
 570         add             $blk_init, $len, $len
 571         andcc           $len, 1, %g0            ! is number of blocks even?
 572         !membar         0x0f
 573         bnz,pt          %icc, .L${bits}_cbc_dec_loop
 574         srl             $len, 0, $len
 575         brnz,pn         $len, .L${bits}_cbc_dec_loop2x
 576         nop
 577 ___
     # Nothing left after the block-store loop, EVP flavour.  The chaining
     # value to save is the last ciphertext block in %f12-%f15; %f0-%f3 hold
     # decrypted plaintext at this point and must not be written to ivec.
 578 $::code.=<<___ if ($::evp);
 579         st              %f12, [$ivec + 0]
 580         st              %f13, [$ivec + 4]
 581         st              %f14, [$ivec + 8]
 582         st              %f15, [$ivec + 12]
 583 ___
     # Same exit, non-EVP flavour.  This routine defines no numeric label 3,
     # so the unaligned case must go to this routine's own handler, which
     # stores %f12/%f14.
 584 $::code.=<<___ if (!$::evp);
 585         brnz,pn         $ivoff, .L${bits}_cbc_dec_unaligned_ivec
 586         nop
 587
 588         std             %f12, [$ivec + 0]       ! write out ivec
 589         std             %f14, [$ivec + 8]
 590 ___
     # Final return and ELF symbol type/size directives.
 591 $::code.=<<___;
 592         ret
 593         restore
 594 .type   ${alg}${bits}_t4_cbc_decrypt,#function
 595 .size   ${alg}${bits}_t4_cbc_decrypt,.-${alg}${bits}_t4_cbc_decrypt
 596 ___
 597 }
 598
 599 sub alg_ctr32_implement {
 600 my ($alg,$bits) = @_;
 601
 602 $::code.=<<___;
 603 .globl  ${alg}${bits}_t4_ctr32_encrypt
 604 .align  32
 605 ${alg}${bits}_t4_ctr32_encrypt:
 606         save            %sp, -$::frame, %sp
 607
 608         prefetch        [$inp], 20
 609         prefetch        [$inp + 63], 20
 610         call            _${alg}${bits}_load_enckey
 611         sllx            $len, 4, $len
 612
 613         ld              [$ivec + 0], %l4        ! counter
 614         ld              [$ivec + 4], %l5
 615         ld              [$ivec + 8], %l6
 616         ld              [$ivec + 12], %l7
 617
 618         sllx            %l4, 32, %o5
 619         or              %l5, %o5, %o5
 620         sllx            %l6, 32, %g1
 621         xor             %o5, %g4, %g4           ! ^= rk[0]
 622         xor             %g1, %g5, %g5
 623         movxtod         %g4, %f14               ! most significant 64 bits
 624
 625         sub             $inp, $out, $blk_init   ! $inp!=$out
 626         and             $inp, 7, $ileft
 627         andn            $inp, 7, $inp
 628         sll             $ileft, 3, $ileft
 629         mov             64, $iright
 630         mov             0xff, $omask
 631         sub             $iright, $ileft, $iright
 632         and             $out, 7, $ooff
 633         cmp             $len, 127
 634         movrnz          $ooff, 0, $blk_init             ! if (  $out&7 ||
 635         movleu          $::size_t_cc, 0, $blk_init      !       $len<128 ||
 636         brnz,pn         $blk_init, .L${bits}_ctr32_blk  !       $inp==$out)
 637         srl             $omask, $ooff, $omask
 638
 639         andcc           $len, 16, %g0           ! is number of blocks even?
 640         alignaddrl      $out, %g0, $out
 641         bz              %icc, .L${bits}_ctr32_loop2x
 642         srlx            $len, 4, $len
 643 .L${bits}_ctr32_loop:
 644         ldx             [$inp + 0], %o0
 645         brz,pt          $ileft, 4f
 646         ldx             [$inp + 8], %o1
 647
 648         ldx             [$inp + 16], %o2
 649         sllx            %o0, $ileft, %o0
 650         srlx            %o1, $iright, %g1
 651         sllx            %o1, $ileft, %o1
 652         or              %g1, %o0, %o0
 653         srlx            %o2, $iright, %o2
 654         or              %o2, %o1, %o1
 655 4:
 656         xor             %g5, %l7, %g1           ! ^= rk[0]
 657         add             %l7, 1, %l7
 658         movxtod         %g1, %f2
 659         srl             %l7, 0, %l7             ! clruw
 660         prefetch        [$out + 63], 22
 661         prefetch        [$inp + 16+63], 20
 662 ___
 663 $::code.=<<___ if ($alg eq "aes");
 664         aes_eround01    %f16, %f14, %f2, %f4
 665         aes_eround23    %f18, %f14, %f2, %f2
 666 ___
 667 $::code.=<<___ if ($alg eq "cmll");
 668         camellia_f      %f16, %f2, %f14, %f2
 669         camellia_f      %f18, %f14, %f2, %f0
 670 ___
 671 $::code.=<<___;
 672         call            _${alg}${bits}_encrypt_1x+8
 673         add             $inp, 16, $inp
 674
 675         movxtod         %o0, %f10
 676         movxtod         %o1, %f12
 677         fxor            %f10, %f0, %f0          ! ^= inp
 678         fxor            %f12, %f2, %f2
 679
 680         brnz,pn         $ooff, 2f
 681         sub             $len, 1, $len
 682
 683         std             %f0, [$out + 0]
 684         std             %f2, [$out + 8]
 685         brnz,pt         $len, .L${bits}_ctr32_loop2x
 686         add             $out, 16, $out
 687
 688         ret
 689         restore
 690
 691 .align  16
 692 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 693                                                 ! and ~3x deterioration
 694                                                 ! in inp==out case
 695         faligndata      %f0, %f0, %f4           ! handle unaligned output
 696         faligndata      %f0, %f2, %f6
 697         faligndata      %f2, %f2, %f8
 698         stda            %f4, [$out + $omask]0xc0        ! partial store
 699         std             %f6, [$out + 8]
 700         add             $out, 16, $out
 701         orn             %g0, $omask, $omask
 702         stda            %f8, [$out + $omask]0xc0        ! partial store
 703
 704         brnz,pt         $len, .L${bits}_ctr32_loop2x+4
 705         orn             %g0, $omask, $omask
 706
 707         ret
 708         restore
 709
 710 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 711 .align  32
 712 .L${bits}_ctr32_loop2x:
 713         ldx             [$inp + 0], %o0
 714         ldx             [$inp + 8], %o1
 715         ldx             [$inp + 16], %o2
 716         brz,pt          $ileft, 4f
 717         ldx             [$inp + 24], %o3
 718
 719         ldx             [$inp + 32], %o4
 720         sllx            %o0, $ileft, %o0
 721         srlx            %o1, $iright, %g1
 722         or              %g1, %o0, %o0
 723         sllx            %o1, $ileft, %o1
 724         srlx            %o2, $iright, %g1
 725         or              %g1, %o1, %o1
 726         sllx            %o2, $ileft, %o2
 727         srlx            %o3, $iright, %g1
 728         or              %g1, %o2, %o2
 729         sllx            %o3, $ileft, %o3
 730         srlx            %o4, $iright, %o4
 731         or              %o4, %o3, %o3
 732 4:
 733         xor             %g5, %l7, %g1           ! ^= rk[0]
 734         add             %l7, 1, %l7
 735         movxtod         %g1, %f2
 736         srl             %l7, 0, %l7             ! clruw
 737         xor             %g5, %l7, %g1
 738         add             %l7, 1, %l7
 739         movxtod         %g1, %f6
 740         srl             %l7, 0, %l7             ! clruw
 741         prefetch        [$out + 63], 22
 742         prefetch        [$inp + 32+63], 20
 743 ___
 744 $::code.=<<___ if ($alg eq "aes");
 745         aes_eround01    %f16, %f14, %f2, %f8
 746         aes_eround23    %f18, %f14, %f2, %f2
 747         aes_eround01    %f16, %f14, %f6, %f10
 748         aes_eround23    %f18, %f14, %f6, %f6
 749 ___
 750 $::code.=<<___ if ($alg eq "cmll");
 751         camellia_f      %f16, %f2, %f14, %f2
 752         camellia_f      %f16, %f6, %f14, %f6
 753         camellia_f      %f18, %f14, %f2, %f0
 754         camellia_f      %f18, %f14, %f6, %f4
 755 ___
 756 $::code.=<<___;
 757         call            _${alg}${bits}_encrypt_2x+16
 758         add             $inp, 32, $inp
 759
 760         movxtod         %o0, %f8
 761         movxtod         %o1, %f10
 762         movxtod         %o2, %f12
 763         fxor            %f8, %f0, %f0           ! ^= inp
 764         movxtod         %o3, %f8
 765         fxor            %f10, %f2, %f2
 766         fxor            %f12, %f4, %f4
 767         fxor            %f8, %f6, %f6
 768
 769         brnz,pn         $ooff, 2f
 770         sub             $len, 2, $len
 771
 772         std             %f0, [$out + 0]
 773         std             %f2, [$out + 8]
 774         std             %f4, [$out + 16]
 775         std             %f6, [$out + 24]
 776         brnz,pt         $len, .L${bits}_ctr32_loop2x
 777         add             $out, 32, $out
 778
 779         ret
 780         restore
 781
 782 .align  16
 783 2:      ldxa            [$inp]0x82, %o0         ! avoid read-after-write hazard
 784                                                 ! and ~3x deterioration
 785                                                 ! in inp==out case
 786         faligndata      %f0, %f0, %f8           ! handle unaligned output
 787         faligndata      %f0, %f2, %f0
 788         faligndata      %f2, %f4, %f2
 789         faligndata      %f4, %f6, %f4
 790         faligndata      %f6, %f6, %f6
 791
 792         stda            %f8, [$out + $omask]0xc0        ! partial store
 793         std             %f0, [$out + 8]
 794         std             %f2, [$out + 16]
 795         std             %f4, [$out + 24]
 796         add             $out, 32, $out
 797         orn             %g0, $omask, $omask
 798         stda            %f6, [$out + $omask]0xc0        ! partial store
 799
 800         brnz,pt         $len, .L${bits}_ctr32_loop2x+4
 801         orn             %g0, $omask, $omask
 802
 803         ret
 804         restore
 805
 806 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 807 .align  32
 808 .L${bits}_ctr32_blk:
 809         add     $out, $len, $blk_init
 810         and     $blk_init, 63, $blk_init        ! tail
 811         sub     $len, $blk_init, $len
 812         add     $blk_init, 15, $blk_init        ! round up to 16n
 813         srlx    $len, 4, $len
 814         srl     $blk_init, 4, $blk_init
 815         sub     $len, 1, $len
 816         add     $blk_init, 1, $blk_init
 817
 818 .L${bits}_ctr32_blk_loop2x:
 819         ldx             [$inp + 0], %o0
 820         ldx             [$inp + 8], %o1
 821         ldx             [$inp + 16], %o2
 822         brz,pt          $ileft, 5f
 823         ldx             [$inp + 24], %o3
 824
 825         ldx             [$inp + 32], %o4
 826         sllx            %o0, $ileft, %o0
 827         srlx            %o1, $iright, %g1
 828         or              %g1, %o0, %o0
 829         sllx            %o1, $ileft, %o1
 830         srlx            %o2, $iright, %g1
 831         or              %g1, %o1, %o1
 832         sllx            %o2, $ileft, %o2
 833         srlx            %o3, $iright, %g1
 834         or              %g1, %o2, %o2
 835         sllx            %o3, $ileft, %o3
 836         srlx            %o4, $iright, %o4
 837         or              %o4, %o3, %o3
 838 5:
 839         xor             %g5, %l7, %g1           ! ^= rk[0]
 840         add             %l7, 1, %l7
 841         movxtod         %g1, %f2
 842         srl             %l7, 0, %l7             ! clruw
 843         xor             %g5, %l7, %g1
 844         add             %l7, 1, %l7
 845         movxtod         %g1, %f6
 846         srl             %l7, 0, %l7             ! clruw
 847         prefetch        [$inp + 32+63], 20
 848 ___
 849 $::code.=<<___ if ($alg eq "aes");
 850         aes_eround01    %f16, %f14, %f2, %f8
 851         aes_eround23    %f18, %f14, %f2, %f2
 852         aes_eround01    %f16, %f14, %f6, %f10
 853         aes_eround23    %f18, %f14, %f6, %f6
 854 ___
 855 $::code.=<<___ if ($alg eq "cmll");
 856         camellia_f      %f16, %f2, %f14, %f2
 857         camellia_f      %f16, %f6, %f14, %f6
 858         camellia_f      %f18, %f14, %f2, %f0
 859         camellia_f      %f18, %f14, %f6, %f4
 860 ___
 861 $::code.=<<___;
 862         call            _${alg}${bits}_encrypt_2x+16
 863         add             $inp, 32, $inp
 864         subcc           $len, 2, $len
 865
 866         movxtod         %o0, %f8
 867         movxtod         %o1, %f10
 868         movxtod         %o2, %f12
 869         fxor            %f8, %f0, %f0           ! ^= inp
 870         movxtod         %o3, %f8
 871         fxor            %f10, %f2, %f2
 872         fxor            %f12, %f4, %f4
 873         fxor            %f8, %f6, %f6
 874
 875         stda            %f0, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 876         add             $out, 8, $out
 877         stda            %f2, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 878         add             $out, 8, $out
 879         stda            %f4, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 880         add             $out, 8, $out
 881         stda            %f6, [$out]0xf2         ! ASI_BLK_INIT, T4-specific
 882         bgu,pt          $::size_t_cc, .L${bits}_ctr32_blk_loop2x
 883         add             $out, 8, $out
 884
 885         add             $blk_init, $len, $len
 886         andcc           $len, 1, %g0            ! is number of blocks even?
 887         !membar         0x0f
 888         bnz,pt          %icc, .L${bits}_ctr32_loop
 889         srl             $len, 0, $len
 890         brnz,pn         $len, .L${bits}_ctr32_loop2x
 891         nop
 892
 893         ret
 894         restore
 895 .type   ${alg}${bits}_t4_ctr32_encrypt,#function
 896 .size   ${alg}${bits}_t4_ctr32_encrypt,.-${alg}${bits}_t4_ctr32_encrypt
 897 ___
 898 }
 899
 900 # Purpose of these subroutines is to explicitly encode VIS instructions,
 901 # so that one can compile the module without having to specify VIS
 902 # extentions on compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
 903 # Idea is to reserve for option to produce "universal" binary and let
 904 # programmer detect if current CPU is VIS capable at run-time.
 905 sub unvis {
 906 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 907 my ($ref,$opf);
 908 my %visopf = (  "faligndata"    => 0x048,
 909                 "fnot2"         => 0x066,
 910                 "fxor"          => 0x06c,
 911                 "fsrc2"         => 0x078        );
 912
 913     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 914
 915     if ($opf=$visopf{$mnemonic}) {
 916         foreach ($rs1,$rs2,$rd) {
 917             return $ref if (!/%f([0-9]{1,2})/);
 918             $_=$1;
 919             if ($1>=32) {
 920                 return $ref if ($1&1);
 921                 # re-encode for upper double register addressing
 922                 $_=($1|$1>>5)&31;
 923             }
 924         }
 925
 926         return  sprintf ".word\t0x%08x !%s",
 927                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 928                         $ref;
 929     } else {
 930         return $ref;
 931     }
 932 }
 933 sub unalignaddr {
 934 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 935 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 936 my $ref = "$mnemonic\t$rs1,$rs2,$rd";
 937 my $opf = $mnemonic =~ /l$/ ? 0x01a :0x18;
 938
 939     foreach ($rs1,$rs2,$rd) {
 940         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
 941         else                    { return $ref; }
 942     }
 943     return  sprintf ".word\t0x%08x !%s",
 944                     0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 945                     $ref;
 946 }
 947
 948 sub unaes_round {       # 4-argument instructions
 949 my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
 950 my ($ref,$opf);
 951 my %aesopf = (  "aes_eround01"  => 0,
 952                 "aes_eround23"  => 1,
 953                 "aes_dround01"  => 2,
 954                 "aes_dround23"  => 3,
 955                 "aes_eround01_l"=> 4,
 956                 "aes_eround23_l"=> 5,
 957                 "aes_dround01_l"=> 6,
 958                 "aes_dround23_l"=> 7,
 959                 "aes_kexpand1"  => 8    );
 960
 961     $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";
 962
 963     if (defined($opf=$aesopf{$mnemonic})) {
 964         $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
 965         foreach ($rs1,$rs2,$rd) {
 966             return $ref if (!/%f([0-9]{1,2})/);
 967             $_=$1;
 968             if ($1>=32) {
 969                 return $ref if ($1&1);
 970                 # re-encode for upper double register addressing
 971                 $_=($1|$1>>5)&31;
 972             }
 973         }
 974
 975         return  sprintf ".word\t0x%08x !%s",
 976                         2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|$opf<<5|$rs2,
 977                         $ref;
 978     } else {
 979         return $ref;
 980     }
 981 }
 982
 983 sub unaes_kexpand {     # 3-argument instructions
 984 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 985 my ($ref,$opf);
 986 my %aesopf = (  "aes_kexpand0"  => 0x130,
 987                 "aes_kexpand2"  => 0x131        );
 988
 989     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 990
 991     if (defined($opf=$aesopf{$mnemonic})) {
 992         foreach ($rs1,$rs2,$rd) {
 993             return $ref if (!/%f([0-9]{1,2})/);
 994             $_=$1;
 995             if ($1>=32) {
 996                 return $ref if ($1&1);
 997                 # re-encode for upper double register addressing
 998                 $_=($1|$1>>5)&31;
 999             }
1000         }
1001
1002         return  sprintf ".word\t0x%08x !%s",
1003                         2<<30|$rd<<25|0x36<<19|$rs1<<14|$opf<<5|$rs2,
1004                         $ref;
1005     } else {
1006         return $ref;
1007     }
1008 }
1009
# uncamellia_f(mnemonic, rs1, rs2, rs3, rd) encodes the 4-operand
# camellia_f instruction as a raw .word directive, followed by the
# original instruction as an assembler comment. The mnemonic is used
# for the comment only; the opcode field is fixed at 0xc.
#
# The instruction text "mnemonic\trs1,rs2,rs3,rd" is returned instead
# when rs1, rs2 or rd is not an %f register or is an odd-numbered
# register >= 32.
sub uncamellia_f {      # 4-argument instructions
my ($mnemonic,$rs1,$rs2,$rs3,$rd)=@_;
my $ref = "$mnemonic\t$rs1,$rs2,$rs3,$rd";

    # rs3 is folded for upper double register addressing when it looks
    # like an even %f register, and passed through as is otherwise
    $rs3 = ($rs3 =~ /%f([0-6]*[02468])/) ? (($1|$1>>5)&31) : $rs3;
    foreach ($rs1,$rs2,$rd) {
        return $ref if (!/%f([0-9]{1,2})/);
        $_=$1;
        if ($1>=32) {
            return $ref if ($1&1);
            # re-encode for upper double register addressing
            $_=($1|$1>>5)&31;
        }
    }

    return  sprintf ".word\t0x%08x !%s",
                    2<<30|$rd<<25|0x19<<19|$rs1<<14|$rs3<<9|0xc<<5|$rs2,
                    $ref;
}
1035
# uncamellia3(mnemonic, rs1, rs2, rd) encodes camellia_fl and
# camellia_fli as a raw .word directive, followed by the original
# instruction as an assembler comment. The instruction text
# "mnemonic\trs1,rs2,rd" is returned instead when the mnemonic is not
# in %cmllopf, when an operand is not an %f register, or when an
# operand is an odd-numbered register >= 32.
sub uncamellia3 {       # 3-argument instructions
my ($mnemonic,$rs1,$rs2,$rd)=@_;
my %cmllopf = ( "camellia_fl"   => 0x13c,
                "camellia_fli"  => 0x13d        );
my $ref = "$mnemonic\t$rs1,$rs2,$rd";
my $opf = $cmllopf{$mnemonic};
my @field;

    return $ref unless (defined($opf));

    foreach my $operand ($rs1,$rs2,$rd) {
        return $ref unless ($operand =~ /%f([0-9]{1,2})/);
        my $regno = $1;
        if ($regno >= 32) {
            return $ref if ($regno & 1);
            # re-encode for upper double register addressing
            $regno = ($regno | $regno>>5) & 31;
        }
        push @field, $regno;
    }
    my ($src1,$src2,$dst) = @field;

    return sprintf(".word\t0x%08x !%s",
                   2<<30|$dst<<25|0x36<<19|$src1<<14|$opf<<5|$src2,
                   $ref);
}
1062
# unmovxtox(mnemonic, rs, rd) encodes the register move instructions
# listed in %movxopf as a raw .word directive, followed by the original
# instruction as an assembler comment. Operands may be integer
# (%g/%o/%l/%i) or %f registers. The instruction text "mnemonic\trs,rd"
# is returned instead when the mnemonic is not in the table, when an
# operand does not look like a register, or when its number is odd
# and >= 32.
sub unmovxtox {         # 2-argument instructions
my ($mnemonic,$rs,$rd)=@_;
my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24, "f" => 0 );
my %movxopf = ( "movdtox"       => 0x110,
                "movstouw"      => 0x111,
                "movstosw"      => 0x113,
                "movxtod"       => 0x118,
                "movwtos"       => 0x119        );
my $ref = "$mnemonic\t$rs,$rd";
my $opf = $movxopf{$mnemonic};
my @field;

    return $ref unless (defined($opf));

    foreach my $operand ($rs,$rd) {
        return $ref unless ($operand =~ /%([fgoli])([0-9]{1,2})/);
        my ($class,$num) = ($1,$2);
        if ($num >= 32) {
            return $ref if ($num & 1);
            # re-encode for upper double register addressing
            push @field, ($num | $num>>5) & 31;
        } else {
            push @field, $bias{$class}+$num;
        }
    }
    my ($src,$dst) = @field;

    return sprintf(".word\t0x%08x !%s",
                   2<<30|$dst<<25|0x36<<19|$opf<<5|$src,
                   $ref);
}
1093
# Post-process the accumulated $::code and print it, one line at a
# time, to the currently selected output handle:
#
#  1. every `...` span is replaced by the result of evaluating its
#     contents as Perl;
#  2. two-operand f<name>2[sd] instructions at the end of a line get
#     %f0 inserted as first source, which gives them the three-operand
#     shape the VIS pattern below expects;
#  3. the patterns of the "or" chain are tried in order and the first
#     one that matches hands its operands to the corresponding un*()
#     encoder, whose return value replaces the instruction text.
#
# Lines matching none of the patterns are printed unchanged.
sub emit_assembler {
    foreach (split("\n",$::code)) {
        s/\`([^\`]*)\`/eval $1/ge;

        s/\b(f[a-z]+2[sd]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})\s*$/$1\t%f0,$2,$3/g;

        s/\b(aes_[edk][^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &unaes_round($1,$2,$3,$4,$5)
         /ge or
        s/\b(aes_kexpand[02])\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unaes_kexpand($1,$2,$3,$4)
         /ge or
        s/\b(camellia_f)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*([%fx0-9]+),\s*(%f[0-9]{1,2})/
                &uncamellia_f($1,$2,$3,$4,$5)
         /ge or
        # The mnemonic is a run of non-whitespace characters. Excluding
        # only the letter "s" would let the greedy match run across the
        # operands and pick up registers from a trailing comment.
        s/\b(camellia_[^\s]+)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &uncamellia3($1,$2,$3,$4)
         /ge or
        s/\b(mov[ds]to\w+)\s+(%f[0-9]{1,2}),\s*(%[goli][0-7])/
                &unmovxtox($1,$2,$3)
         /ge or
        s/\b(mov[xw]to[ds])\s+(%[goli][0-7]),\s*(%f[0-9]{1,2})/
                &unmovxtox($1,$2,$3)
         /ge or
        s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
                &unvis($1,$2,$3,$4)
         /ge or
        s/\b(alignaddr[l]*)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
                &unalignaddr($1,$2,$3,$4)
         /ge;

        print $_,"\n";
    }
}

1;
1128
1129 1;