crypto/md5/asm/md5-sparcv9.pl

   1 #!/usr/bin/env perl
   2
   3 # ====================================================================
   4 # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
   5 # project. The module is, however, dual licensed under OpenSSL and
   6 # CRYPTOGAMS licenses depending on where you obtain it. For further
   7 # details see http://www.openssl.org/~appro/cryptogams/.
   8 #
   9 # Hardware SPARC T4 support by David S. Miller <davem@davemloft.net>.
  10 # ====================================================================
  11
  12 # MD5 for SPARCv9, 6.9 cycles per byte on UltraSPARC, >40% faster than
  13 # code generated by Sun C 5.2.
  14
  15 # SPARC T4 MD5 hardware achieves 3.20 cycles per byte, which is 2.1x
  16 # faster than software. Multi-process benchmark saturates at 12x
  17 # single-process result on 8-core processor, or ~11GBps per 2.85GHz
  18 # socket.
  19
  20 $output=shift;
  21 open STDOUT,">$output";
  22
  23 use integer;
  24
  25 ($ctx,$inp,$len)=("%i0","%i1","%i2");   # input arguments
  26
  27 # 64-bit values
  28 @X=("%o0","%o1","%o2","%o3","%o4","%o5","%o7","%g1","%g2");
  29 $tx="%g3";
  30 ($AB,$CD)=("%g4","%g5");
  31
  32 # 32-bit values
  33 @V=($A,$B,$C,$D)=map("%l$_",(0..3));
  34 ($t1,$t2,$t3,$saved_asi)=map("%l$_",(4..7));
  35 ($shr,$shl1,$shl2)=("%i3","%i4","%i5");
  36
  37 my @K=( 0xd76aa478,0xe8c7b756,0x242070db,0xc1bdceee,
  38         0xf57c0faf,0x4787c62a,0xa8304613,0xfd469501,
  39         0x698098d8,0x8b44f7af,0xffff5bb1,0x895cd7be,
  40         0x6b901122,0xfd987193,0xa679438e,0x49b40821,
  41
  42         0xf61e2562,0xc040b340,0x265e5a51,0xe9b6c7aa,
  43         0xd62f105d,0x02441453,0xd8a1e681,0xe7d3fbc8,
  44         0x21e1cde6,0xc33707d6,0xf4d50d87,0x455a14ed,
  45         0xa9e3e905,0xfcefa3f8,0x676f02d9,0x8d2a4c8a,
  46
  47         0xfffa3942,0x8771f681,0x6d9d6122,0xfde5380c,
  48         0xa4beea44,0x4bdecfa9,0xf6bb4b60,0xbebfbc70,
  49         0x289b7ec6,0xeaa127fa,0xd4ef3085,0x04881d05,
  50         0xd9d4d039,0xe6db99e5,0x1fa27cf8,0xc4ac5665,
  51
  52         0xf4292244,0x432aff97,0xab9423a7,0xfc93a039,
  53         0x655b59c3,0x8f0ccc92,0xffeff47d,0x85845dd1,
  54         0x6fa87e4f,0xfe2ce6e0,0xa3014314,0x4e0811a1,
  55         0xf7537e82,0xbd3af235,0x2ad7d2bb,0xeb86d391, 0  );
  56
  57 sub R0 {
  58   my ($i,$a,$b,$c,$d) = @_;
  59   my $rot = (7,12,17,22)[$i%4];
  60   my $j   = ($i+1)/2;
  61
  62   if ($i&1) {
  63     $code.=<<___;
  64          srlx   @X[$j],$shr,@X[$j]      ! align X[`$i+1`]
  65         and     $b,$t1,$t1              ! round $i
  66          sllx   @X[$j+1],$shl1,$tx
  67         add     $t2,$a,$a
  68          sllx   $tx,$shl2,$tx
  69         xor     $d,$t1,$t1
  70          or     $tx,@X[$j],@X[$j]
  71          sethi  %hi(@K[$i+1]),$t2
  72         add     $t1,$a,$a
  73          or     $t2,%lo(@K[$i+1]),$t2
  74         sll     $a,$rot,$t3
  75          add    @X[$j],$t2,$t2          ! X[`$i+1`]+K[`$i+1`]
  76         srl     $a,32-$rot,$a
  77         add     $b,$t3,$t3
  78          xor     $b,$c,$t1
  79         add     $t3,$a,$a
  80 ___
  81   } else {
  82     $code.=<<___;
  83          srlx   @X[$j],32,$tx           ! extract X[`2*$j+1`]
  84         and     $b,$t1,$t1              ! round $i
  85         add     $t2,$a,$a
  86         xor     $d,$t1,$t1
  87          sethi  %hi(@K[$i+1]),$t2
  88         add     $t1,$a,$a
  89          or     $t2,%lo(@K[$i+1]),$t2
  90         sll     $a,$rot,$t3
  91          add    $tx,$t2,$t2             ! X[`2*$j+1`]+K[`$i+1`]
  92         srl     $a,32-$rot,$a
  93         add     $b,$t3,$t3
  94          xor     $b,$c,$t1
  95         add     $t3,$a,$a
  96 ___
  97   }
  98 }
  99
 100 sub R0_1 {
 101   my ($i,$a,$b,$c,$d) = @_;
 102   my $rot = (7,12,17,22)[$i%4];
 103
 104 $code.=<<___;
 105          srlx   @X[0],32,$tx            ! extract X[1]
 106         and     $b,$t1,$t1              ! round $i
 107         add     $t2,$a,$a
 108         xor     $d,$t1,$t1
 109          sethi  %hi(@K[$i+1]),$t2
 110         add     $t1,$a,$a
 111          or     $t2,%lo(@K[$i+1]),$t2
 112         sll     $a,$rot,$t3
 113          add    $tx,$t2,$t2             ! X[1]+K[`$i+1`]
 114         srl     $a,32-$rot,$a
 115         add     $b,$t3,$t3
 116          andn    $b,$c,$t1
 117         add     $t3,$a,$a
 118 ___
 119 }
 120
 121 sub R1 {
 122   my ($i,$a,$b,$c,$d) = @_;
 123   my $rot = (5,9,14,20)[$i%4];
 124   my $j   = $i<31 ? (1+5*($i+1))%16 : (5+3*($i+1))%16;
 125   my $xi  = @X[$j/2];
 126
 127 $code.=<<___ if ($j&1 && ($xi=$tx));
 128          srlx   @X[$j/2],32,$xi         ! extract X[$j]
 129 ___
 130 $code.=<<___;
 131         and     $b,$d,$t3               ! round $i
 132         add     $t2,$a,$a
 133         or      $t3,$t1,$t1
 134          sethi  %hi(@K[$i+1]),$t2
 135         add     $t1,$a,$a
 136          or     $t2,%lo(@K[$i+1]),$t2
 137         sll     $a,$rot,$t3
 138          add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
 139         srl     $a,32-$rot,$a
 140         add     $b,$t3,$t3
 141          `$i<31?"andn":"xor"`    $b,$c,$t1
 142         add     $t3,$a,$a
 143 ___
 144 }
 145
 146 sub R2 {
 147   my ($i,$a,$b,$c,$d) = @_;
 148   my $rot = (4,11,16,23)[$i%4];
 149   my $j   = $i<47 ? (5+3*($i+1))%16 : (0+7*($i+1))%16;
 150   my $xi  = @X[$j/2];
 151
 152 $code.=<<___ if ($j&1 && ($xi=$tx));
 153          srlx   @X[$j/2],32,$xi         ! extract X[$j]
 154 ___
 155 $code.=<<___;
 156         add     $t2,$a,$a               ! round $i
 157         xor     $b,$t1,$t1
 158          sethi  %hi(@K[$i+1]),$t2
 159         add     $t1,$a,$a
 160          or     $t2,%lo(@K[$i+1]),$t2
 161         sll     $a,$rot,$t3
 162          add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
 163         srl     $a,32-$rot,$a
 164         add     $b,$t3,$t3
 165          xor     $b,$c,$t1
 166         add     $t3,$a,$a
 167 ___
 168 }
 169
 170 sub R3 {
 171   my ($i,$a,$b,$c,$d) = @_;
 172   my $rot = (6,10,15,21)[$i%4];
 173   my $j   = (0+7*($i+1))%16;
 174   my $xi  = @X[$j/2];
 175
 176 $code.=<<___;
 177         add     $t2,$a,$a               ! round $i
 178 ___
 179 $code.=<<___ if ($j&1 && ($xi=$tx));
 180          srlx   @X[$j/2],32,$xi         ! extract X[$j]
 181 ___
 182 $code.=<<___;
 183         orn     $b,$d,$t1
 184          sethi  %hi(@K[$i+1]),$t2
 185         xor     $c,$t1,$t1
 186          or     $t2,%lo(@K[$i+1]),$t2
 187         add     $t1,$a,$a
 188         sll     $a,$rot,$t3
 189          add    $xi,$t2,$t2             ! X[$j]+K[`$i+1`]
 190         srl     $a,32-$rot,$a
 191         add     $b,$t3,$t3
 192         add     $t3,$a,$a
 193 ___
 194 }
 195
 196 $code.=<<___;
 197 #include "sparc_arch.h"
 198
 199 #ifdef __arch64__
 200 .register       %g2,#scratch
 201 .register       %g3,#scratch
 202 #endif
 203
 204 .section        ".text",#alloc,#execinstr
 205
 206 #ifdef __PIC__
 207 SPARC_PIC_THUNK(%g1)
 208 #endif
 209
 210 .globl  md5_block_asm_data_order
 211 .align  32
 212 md5_block_asm_data_order:
 213         SPARC_LOAD_ADDRESS_LEAF(OPENSSL_sparcv9cap_P,%g1,%g5)
 214         ld      [%g1+4],%g1             ! OPENSSL_sparcv9cap_P[1]
 215
 216         andcc   %g1, CFR_MD5, %g0
 217         be      .Lsoftware
 218         nop
 219
 220         mov     4, %g1
 221         andcc   %o1, 0x7, %g0
 222         lda     [%o0 + %g0]0x88, %f0            ! load context
 223         lda     [%o0 + %g1]0x88, %f1
 224         add     %o0, 8, %o0
 225         lda     [%o0 + %g0]0x88, %f2
 226         lda     [%o0 + %g1]0x88, %f3
 227         bne,pn  %icc, .Lhwunaligned
 228         sub     %o0, 8, %o0
 229
 230 .Lhw_loop:
 231         ldd     [%o1 + 0x00], %f8
 232         ldd     [%o1 + 0x08], %f10
 233         ldd     [%o1 + 0x10], %f12
 234         ldd     [%o1 + 0x18], %f14
 235         ldd     [%o1 + 0x20], %f16
 236         ldd     [%o1 + 0x28], %f18
 237         ldd     [%o1 + 0x30], %f20
 238         subcc   %o2, 1, %o2             ! done yet?
 239         ldd     [%o1 + 0x38], %f22
 240         add     %o1, 0x40, %o1
 241         prefetch [%o1 + 63], 20
 242
 243         .word   0x81b02800              ! MD5
 244
 245         bne,pt  SIZE_T_CC, .Lhw_loop
 246         nop
 247
 248 .Lhwfinish:
 249         sta     %f0, [%o0 + %g0]0x88    ! store context
 250         sta     %f1, [%o0 + %g1]0x88
 251         add     %o0, 8, %o0
 252         sta     %f2, [%o0 + %g0]0x88
 253         sta     %f3, [%o0 + %g1]0x88
 254         retl
 255         nop
 256
 257 .align  8
 258 .Lhwunaligned:
 259         alignaddr %o1, %g0, %o1
 260
 261         ldd     [%o1 + 0x00], %f10
 262 .Lhwunaligned_loop:
 263         ldd     [%o1 + 0x08], %f12
 264         ldd     [%o1 + 0x10], %f14
 265         ldd     [%o1 + 0x18], %f16
 266         ldd     [%o1 + 0x20], %f18
 267         ldd     [%o1 + 0x28], %f20
 268         ldd     [%o1 + 0x30], %f22
 269         ldd     [%o1 + 0x38], %f24
 270         subcc   %o2, 1, %o2             ! done yet?
 271         ldd     [%o1 + 0x40], %f26
 272         add     %o1, 0x40, %o1
 273         prefetch [%o1 + 63], 20
 274
 275         faligndata %f10, %f12, %f8
 276         faligndata %f12, %f14, %f10
 277         faligndata %f14, %f16, %f12
 278         faligndata %f16, %f18, %f14
 279         faligndata %f18, %f20, %f16
 280         faligndata %f20, %f22, %f18
 281         faligndata %f22, %f24, %f20
 282         faligndata %f24, %f26, %f22
 283
 284         .word   0x81b02800              ! MD5
 285
 286         bne,pt  SIZE_T_CC, .Lhwunaligned_loop
 287         for     %f26, %f26, %f10        ! %f10=%f26
 288
 289         ba      .Lhwfinish
 290         nop
 291
 292 .align  16
 293 .Lsoftware:
 294         save    %sp,-STACK_FRAME,%sp
 295
 296         rd      %asi,$saved_asi
 297         wr      %g0,0x88,%asi           ! ASI_PRIMARY_LITTLE
 298         and     $inp,7,$shr
 299         andn    $inp,7,$inp
 300
 301         sll     $shr,3,$shr             ! *=8
 302         mov     56,$shl2
 303         ld      [$ctx+0],$A
 304         sub     $shl2,$shr,$shl2
 305         ld      [$ctx+4],$B
 306         and     $shl2,32,$shl1
 307         add     $shl2,8,$shl2
 308         ld      [$ctx+8],$C
 309         sub     $shl2,$shl1,$shl2       ! shr+shl1+shl2==64
 310         ld      [$ctx+12],$D
 311         nop
 312
 313 .Loop:
 314          cmp    $shr,0                  ! was inp aligned?
 315         ldxa    [$inp+0]%asi,@X[0]      ! load little-endian input
 316         ldxa    [$inp+8]%asi,@X[1]
 317         ldxa    [$inp+16]%asi,@X[2]
 318         ldxa    [$inp+24]%asi,@X[3]
 319         ldxa    [$inp+32]%asi,@X[4]
 320          sllx   $A,32,$AB               ! pack A,B
 321         ldxa    [$inp+40]%asi,@X[5]
 322          sllx   $C,32,$CD               ! pack C,D
 323         ldxa    [$inp+48]%asi,@X[6]
 324          or     $B,$AB,$AB
 325         ldxa    [$inp+56]%asi,@X[7]
 326          or     $D,$CD,$CD
 327         bnz,a,pn        %icc,.+8
 328         ldxa    [$inp+64]%asi,@X[8]
 329
 330         srlx    @X[0],$shr,@X[0]        ! align X[0]
 331         sllx    @X[1],$shl1,$tx
 332          sethi  %hi(@K[0]),$t2
 333         sllx    $tx,$shl2,$tx
 334          or     $t2,%lo(@K[0]),$t2
 335         or      $tx,@X[0],@X[0]
 336          xor    $C,$D,$t1
 337          add    @X[0],$t2,$t2           ! X[0]+K[0]
 338 ___
 339         for ($i=0;$i<15;$i++)   { &R0($i,@V);   unshift(@V,pop(@V)); }
 340         for (;$i<16;$i++)       { &R0_1($i,@V); unshift(@V,pop(@V)); }
 341         for (;$i<32;$i++)       { &R1($i,@V);   unshift(@V,pop(@V)); }
 342         for (;$i<48;$i++)       { &R2($i,@V);   unshift(@V,pop(@V)); }
 343         for (;$i<64;$i++)       { &R3($i,@V);   unshift(@V,pop(@V)); }
 344 $code.=<<___;
 345         srlx    $AB,32,$t1              ! unpack A,B,C,D and accumulate
 346         add     $inp,64,$inp            ! advance inp
 347         srlx    $CD,32,$t2
 348         add     $t1,$A,$A
 349         subcc   $len,1,$len             ! done yet?
 350         add     $AB,$B,$B
 351         add     $t2,$C,$C
 352         add     $CD,$D,$D
 353         srl     $B,0,$B                 ! clruw $B
 354         bne     SIZE_T_CC,.Loop
 355         srl     $D,0,$D                 ! clruw $D
 356
 357         st      $A,[$ctx+0]             ! write out ctx
 358         st      $B,[$ctx+4]
 359         st      $C,[$ctx+8]
 360         st      $D,[$ctx+12]
 361
 362         wr      %g0,$saved_asi,%asi
 363         ret
 364         restore
 365 .type   md5_block_asm_data_order,#function
 366 .size   md5_block_asm_data_order,(.-md5_block_asm_data_order)
 367
 368 .asciz  "MD5 block transform for SPARCv9, CRYPTOGAMS by <appro\@openssl.org>"
 369 .align  4
 370 ___
 371
# The purpose of these subroutines is to explicitly encode VIS instructions,
# so that one can compile the module without having to specify VIS
# extensions on the compiler command line, e.g. -xarch=v9 vs. -xarch=v9a.
# The idea is to keep open the option of producing a "universal" binary and
# let the programmer detect at run-time whether the current CPU is VIS capable.
 377 sub unvis {
 378 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 379 my $ref,$opf;
 380 my %visopf = (  "faligndata"    => 0x048,
 381                 "for"           => 0x07c        );
 382
 383     $ref = "$mnemonic\t$rs1,$rs2,$rd";
 384
 385     if ($opf=$visopf{$mnemonic}) {
 386         foreach ($rs1,$rs2,$rd) {
 387             return $ref if (!/%f([0-9]{1,2})/);
 388             $_=$1;
 389             if ($1>=32) {
 390                 return $ref if ($1&1);
 391                 # re-encode for upper double register addressing
 392                 $_=($1|$1>>5)&31;
 393             }
 394         }
 395
 396         return  sprintf ".word\t0x%08x !%s",
 397                         0x81b00000|$rd<<25|$rs1<<14|$opf<<5|$rs2,
 398                         $ref;
 399     } else {
 400         return $ref;
 401     }
 402 }
 403 sub unalignaddr {
 404 my ($mnemonic,$rs1,$rs2,$rd)=@_;
 405 my %bias = ( "g" => 0, "o" => 8, "l" => 16, "i" => 24 );
 406 my $ref="$mnemonic\t$rs1,$rs2,$rd";
 407
 408     foreach ($rs1,$rs2,$rd) {
 409         if (/%([goli])([0-7])/) { $_=$bias{$1}+$2; }
 410         else                    { return $ref; }
 411     }
 412     return  sprintf ".word\t0x%08x !%s",
 413                     0x81b00300|$rd<<25|$rs1<<14|$rs2,
 414                     $ref;
 415 }
 416
 417 foreach (split("\n",$code)) {
 418         s/\`([^\`]*)\`/eval $1/ge;
 419
 420         s/\b(f[^\s]*)\s+(%f[0-9]{1,2}),\s*(%f[0-9]{1,2}),\s*(%f[0-9]{1,2})/
 421                 &unvis($1,$2,$3,$4)
 422          /ge;
 423         s/\b(alignaddr)\s+(%[goli][0-7]),\s*(%[goli][0-7]),\s*(%[goli][0-7])/
 424                 &unalignaddr($1,$2,$3,$4)
 425          /ge;
 426
 427         print $_,"\n";
 428 }
 429
 430 close STDOUT;