crypto/rc4/asm/rc4-ia64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by David Mosberger <David.Mosberger@acm.org> based on the
   5 # Itanium optimized Crypto code which was released by HP Labs at
   6 # http://www.hpl.hp.com/research/linux/crypto/.
   7 #
   8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
   9 #
  10 # Permission is hereby granted, free of charge, to any person obtaining
  11 # a copy of this software and associated documentation files (the
  12 # "Software"), to deal in the Software without restriction, including
  13 # without limitation the rights to use, copy, modify, merge, publish,
  14 # distribute, sublicense, and/or sell copies of the Software, and to
  15 # permit persons to whom the Software is furnished to do so, subject to
  16 # the following conditions:
  17 #
  18 # The above copyright notice and this permission notice shall be
  19 # included in all copies or substantial portions of the Software.
  20
  21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  28
  29
  30
  31 # This is a little helper program which generates a software-pipelined
  32 # loop for RC4 encryption.  The basic algorithm looks like this:
  33 #
  34 #   for (counter = 0; counter < len; ++counter)
  35 #     {
  36 #       in = inp[counter];
  37 #       SI = S[I];
  38 #       J = (SI + J) & 0xff;
  39 #       SJ = S[J];
  40 #       T = (SI + SJ) & 0xff;
  41 #       S[I] = SJ, S[J] = SI;
  42 #       ST = S[T];
  43 #       outp[counter] = in ^ ST;
  44 #       I = (I + 1) & 0xff;
  45 #     }
  46 #
  47 # Pipelining this loop isn't easy, because the stores to the S[] array
  48 # need to be observed in the right order.  The loop generated by the
  49 # code below has the following pipeline diagram:
  50 #
  51 #      cycle
  52 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
  53 # iter
  54 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  55 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  56 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  57 #
  58 #   where:
  59 #       LDI = load of S[I]
  60 #       LDJ = load of S[J]
  61 #       SWP = swap of S[I] and S[J]
  62 #       LDT = load of S[T]
  63 #
  64 # Note that in the above diagram, the major trouble-spot is that LDI
  65 # of the 2nd iteration is performed BEFORE the SWP of the first
  66 # iteration.  Fortunately, this is easy to detect (I of the 1st
  67 # iteration will be equal to J of the 2nd iteration) and when this
  68 # happens, we simply forward the proper value from the 1st iteration
  69 # to the 2nd one.  The proper value in this case is simply the value
  70 # of S[I] from the first iteration (thanks to the fact that SWP
  71 # simply swaps the contents of S[I] and S[J]).
  72 #
  73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
  74 # iteration issues at the same time as the LDI of the 3rd iteration.
  75 # However, thanks to IA-64 execution semantics, this can be taken
  76 # care of simply by placing LDI later in the instruction-group than
  77 # SWP.  IA-64 CPUs will automatically forward the value if they
  78 # detect that the SWP and LDI are accessing the same memory-location.
  79
  80 # The core-loop that can be pipelined then looks like this (annotated
  81 # with McKinley/Madison issue port & latency numbers, assuming L1
  82 # cache hits for the most part):
  83
  84 # operation:        instruction:                    issue-ports:  latency
  85 # ------------------  -----------------------------   ------------- -------
  86
  87 # Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
  88 #                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
  89 # I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
  90 #                     ;;
  91 # SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
  92 #                     ;;
  93 #                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
  94 # J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
  95 #                     (pBypass) br.cond.spnt Bypass
  96 #                     ;;
  97 # ---------------------------------------------------------------------------------------
  98 # J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
  99 #                     ;;
 100 #                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
 101 #                     ;;
 102 # SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
 103 #                     ;;
 104 # ---------------------------------------------------------------------------------------
 105 # T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
 106 #                     ;;
 107 # T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
 108 # S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
 109 # S[J] = SI           st8 [Jptr] = SI                 M2-M3
 110 #                     ;;
 111 #                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
 112 #                     ;;
 113 # ---------------------------------------------------------------------------------------
 114 # T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
 115 #                     ;;
 116 # data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
 117 #                     ;;
 118 # *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
 119 #                     ;;
 120 # ---------------------------------------------------------------------------------------
 121
 122 # There are several points worth making here:
 123
 124 #   - Note that due to the bypass/forwarding-path, the first two
 125 #     phases of the loop are strangely mingled together.  In
 126 #     particular, note that the first stage of the pipeline is
 127 #     using the value of "J", as calculated by the second stage.
 128 #   - Each bundle-pair will have exactly 6 instructions.
 129 #   - Pipelined, the loop can execute in 3 cycles/iteration and
 130 #     4 stages.  However, McKinley/Madison can issue "st1" to
 131 #     the same bank at a rate of at most one per 4 cycles.  Thus,
 132 #     instead of storing each byte, we accumulate them in a word
 133 #     and then write them back at once with a single "st8" (this
 134 #     implies that the setup code needs to ensure that the output
 135 #     buffer is properly aligned, if need be, by encoding the
 136 #     first few bytes separately).
 137 #   - There is no space for a "br.ctop" instruction.  For this
 138 #     reason we can't use modulo-loop support in IA-64 and have
 139 #     to do a traditional, purely software-pipelined loop.
 140 #   - We can't replace any of the remaining "add/zxt1" pairs with
 141 #     "padd1" because the latency for that instruction is too high
 142 #     and would push the loop to the point where more bypasses
 143 #     would be needed, which we don't have space for.
 144 #   - The above loop runs at around 3.26 cycles/byte, or roughly
 145 #     440 MByte/sec on a 1.5GHz Madison.  This is well below the
 146 #     system bus bandwidth and hence with judicious use of
 147 #     "lfetch" this loop can run at (almost) peak speed even when
 148 #     the input and output data reside in memory.  The
 149 #     max. latency that can be tolerated is (PREFETCH_DISTANCE *
 150 #     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
 151 #     least) 1-ahead prefetching of 128 byte cache-lines.  Note
 152 #     that we do NOT prefetch into L1, since that would only
 153 #     interfere with the S[] table values stored there.  This is
 154 #     acceptable because there is a 10 cycle latency between
 155 #     load and first use of the input data.
 156 #   - We use a branch to out-of-line bypass-code because of cycle-pressure:
 157 #     we calculate the next J, check for the need to activate the
 158 #     bypass path, and activate the bypass path ALL IN THE SAME
 159 #     CYCLE.  If we didn't have these constraints, we could do
 160 #     the bypass with a simple conditional move instruction.
 161 #     Fortunately, the bypass paths get activated relatively
 162 #     infrequently, so the extra branches don't cost all that much
 163 #     (about 0.04 cycles/byte, measured on a 16396 byte file with
 164 #     random input data).
 165 #
 166
 167 $phases = 4;            # number of stages/phases in the pipelined-loop
 168 $unroll_count = 6;      # number of times we unrolled it
     # Bit flags of the stage mask ($p) handed to emit_body(); each bit enables
     # the instructions that belong to one pipeline stage.
 169 $pComI = (1 << 0);      # load input byte, advance I, load S[I], J += SI
 170 $pComJ = (1 << 1);      # J &= 0xff, compute &S[J], load S[J]
 171 $pComT = (1 << 2);      # T = SI + SJ, store swapped S[I]/S[J], compute &S[T]
 172 $pOut  = (1 << 3);      # load S[T], xor with data, deposit byte into OutWord
 173
     # Sizes of the rotating-register arrays.  emit_body() reduces every register
     # index modulo these values; they match the sizes written literally in the
     # ".rotr" directive of the assembly preamble further down.
 174 $NData = 4;             # Data[]
 175 $NIP = 3;               # IPr[]
 176 $NJP = 2;               # JP[]
 177 $NI = 2;                # I[]
 178 $NSI = 3;               # SI[]
 179 $NSJ = 2;               # SJ[]
 180 $NT = 2;                # T[]
 181 $NOutWord = 2;          # OutWord[]
 182
 183 #
 184 # $threshold is the minimum length before we attempt to use the
 185 # big software-pipelined loop.  It MUST be greater-or-equal
 186 # to:
 187 #               PHASES * (UNROLL_COUNT + 1) + 7
 188 #
 189 # The "+ 7" comes from the fact we may have to encode up to
 190 #   7 bytes separately before the output pointer is aligned.
 191 #
     # NOTE(review): the value used is three times that minimum loop length
     # (plus 7), i.e. 91 with the settings above; the reason for the factor 3
     # is not stated in this file.
 192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
 193
 194 sub I {
         # Append one instruction line to an output buffer.
         #   arg 0: reference to the buffer scalar
         #   arg 1: sprintf format of the instruction
         #   rest : arguments for the format
         # The formatted text is prefixed with two tabs and ends with a newline.
         #
         # Assigning the reference to the typeglob makes the package variable
         # $code an alias of the caller's buffer for the duration of the call.
 195     local *code = shift;
 196     local $format = shift;
 197     $code .= sprintf ("\t\t".$format."\n", @_);
 198 }
 199
 200 sub P {
         # Append one formatted line to an output buffer WITHOUT the leading
         # tabs that I() adds; emit_body() uses it for labels and for the
         # predicated bypass branch.
         #   arg 0: reference to the buffer scalar (aliased as $code below)
         #   arg 1: sprintf format
         #   rest : arguments for the format
 201     local *code = shift;
 202     local $format = shift;
 203     $code .= sprintf ($format."\n", @_);
 204 }
 205
 206 sub STOP {
         # Append an instruction-group stop (a line holding only ";;") to the
         # buffer whose reference is passed as the single argument.
 207     local *code = shift;
 208     $code .=<<___;
 209                 ;;
 210 ___
 211 }
 212
 213 sub emit_body {
         # Emit the assembly for one slot of the unrolled, software-pipelined
         # loop: three instruction groups ("Cycle 0..2"), each made of two
         # bundles, with the individual instructions selected by a stage mask.
         #
         #   arg 0: reference to the main code buffer (aliased as $c)
         #   arg 1: reference to the buffer collecting the out-of-line bypass
         #          code (aliased as $bypass)
         #   arg 2: $iteration - running slot number across prologue, body
         #          and epilogue
         #   arg 3: $p - bit mask built from $pComI/$pComJ/$pComT/$pOut
 214     local *c = shift;
 215     local *bypass = shift;
 216     local ($iteration, $p) = @_;
 217
         # $i0..$i3 name the iteration whose data each stage works on: the
         # current one and the three before it.  They are only ever used
         # modulo one of the $N* register-array sizes.
 218     local $i0 = $iteration;
 219     local $i1 = $iteration - 1;
 220     local $i2 = $iteration - 2;
 221     local $i3 = $iteration - 3;
         # Output-word bookkeeping for the byte leaving the pipeline in this
         # slot (the one loaded three slots earlier).  "/" is ordinary Perl
         # division, so these may be fractional; they are only used as
         # operands of "%", which works on their integer parts.
         #   $iw0      - 8-byte word index of that byte
         #   $iw1      - word index of the byte before it (1 for slots 0..3)
         #   $byte_num - position of the byte inside its word (for BYTE_POS)
 222     local $iw0 = ($iteration - 3) / 8;
 223     local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
 224     local $byte_num = ($iteration - 3) % 8;
         # Numeric suffix shared by the .rc4Bypass/.rc4Resume label pair.
 225     local $label = $iteration + 1;
         # Despite its name, $pAny is true only when ALL four stage bits are
         # set; the explicit "{ .mmi" / "}" bundle markers are emitted only then.
 226     local $pAny = ($p & 0xf) == 0xf;
         # The I == J bypass check is emitted whenever stage 0 is active, except
         # in the very first slot.
 227     local $pByp = (($p & $pComI) && ($iteration > 0));
 228
 229     $c.=<<___;
 230 //////////////////////////////////////////////////
 231 ___
 232
         # No stage bit set: nothing of the pipeline is left to run.  Only a
         # 4-byte store of OutWord is emitted (preceded, under RC4_BIG_ENDIAN,
         # by a 32-bit right shift) and the sub returns.
 233     if (($p & 0xf) == 0) {
 234         $c.="#ifdef RC4_BIG_ENDIAN\n";
 235         &I(\$c,"shr.u   OutWord[%u] = OutWord[%u], 32;;",
 236                                 $iw1 % $NOutWord, $iw1 % $NOutWord);
 237         $c.="#endif\n";
 238         &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
 239         return;
 240     }
 241
 242     # Cycle 0
         # Load the next input byte and advance I (stage 0), truncate J
         # (stage 1), load S[T] of the previous iteration (stage 3), form
         # T = SI + SJ (stage 2) and compute the address of S[I] (stage 0).
 243     &I(\$c, "{ .mmi")                                         if ($pAny);
 244     &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
 245     &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
 246     &I(\$c, "zxt1   J = J")                                   if ($p & $pComJ);
 247     &I(\$c, "}")                                              if ($pAny);
 248     &I(\$c, "{ .mmi")                                         if ($pAny);
 249     &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
 250     &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
 251        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                     if ($p & $pComT);
 252     &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
 253     &I(\$c, "}")                                              if ($pAny);
 254     &STOP(\$c);
 255
 256     # Cycle 1
         # Store the swapped S[I]/S[J] and truncate T (stage 2), load S[I]
         # (stage 0), compute the address of S[J] (stage 1) and xor the data
         # byte with the key-stream byte (stage 3).  The two stores are emitted
         # ahead of the S[I] load within the same instruction group.
 257     &I(\$c, "{ .mmi")                                         if ($pAny);
 258     &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
 259     &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
 260     &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
 261     &I(\$c, "}")                                              if ($pAny);
 262     &I(\$c, "{ .mmi")                                         if ($pAny);
 263     &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
 264     &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                 if ($p & $pComJ);
 265     &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
 266        $i3 % $NData, $i3 % $NData, $i1 % $NT)                 if ($p & $pOut);
 267     &I(\$c, "}")                                              if ($pAny);
 268     &STOP(\$c);
 269
 270     # Cycle 2
         # Load S[J] (stage 1), compare the previous I with J to detect the
         # bypass case, deposit the finished byte into its output word
         # (stage 3), add SI to J (stage 0), compute the address of S[T]
         # (stage 2) and branch to the out-of-line bypass code if required.
 271     &I(\$c, "{ .mmi")                                         if ($pAny);
 272     &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
 273     &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)       if ($pByp);
 274     &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
 275        $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
 276     &I(\$c, "}")                                              if ($pAny);
 277     &I(\$c, "{ .mmb")                                         if ($pAny);
 278     &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)               if ($p & $pComI);
 279     &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
 280     &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
 281     &I(\$c, "}") if ($pAny);
 282     &STOP(\$c);
 283
         # The bypass code jumps back to this label.
 284     &P(\$c, ".rc4Resume%u:", $label)                          if ($pByp);
         # A byte has just been deposited at position 0 of a new word, so the
         # previous word is written out with one st8 (only while the output
         # stage is active).
 285     if ($byte_num == 0 && $iteration >= $phases) {
 286         &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
 287            $iw1 % $NOutWord)                                  if ($p & $pOut);
             # Last slot of the unrolled body: copy the output word between the
             # two OutWord registers (done only for $unroll_count == 6), issue
             # the prefetches for the next chunk and close the loop.
 288         if ($iteration == (1 + $unroll_count) * $phases - 1) {
 289             if ($unroll_count == 6) {
 290                 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
 291                    $iw1 % $NOutWord, $iw0 % $NOutWord);
 292             }
 293             &I(\$c, "lfetch.nt1 [InPrefetch], %u",
 294                $unroll_count * $phases);
 295             &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
 296                $unroll_count * $phases);
 297             &I(\$c, "br.cloop.sptk.few .rc4Loop");
 298         }
 299     }
 300
         # Out-of-line bypass code, collected in the separate buffer: take the
         # SI that was just added back out of J, add the previous iteration's
         # SI instead, copy that SI into the current SI register and jump back
         # to the resume label.  The "\n" inside the branch format leaves an
         # empty line before the closing ";;" in the output.
 301     if ($pByp) {
 302         &P(\$bypass, ".rc4Bypass%u:", $label);
 303         &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
 304         &I(\$bypass, "nop 0");
 305         &I(\$bypass, "nop 0");
 306         &I(\$bypass, ";;");
 307         &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
 308         &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
 309         &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
 310         &I(\$bypass, ";;");
 311     }
 312 }
 313
     # Fixed preamble of the generated assembly, up to the ".rc4Prologue" label
     # after which the unrolled loop produced by emit_body() is appended.  It
     # holds the register/predicate #defines, the SZ-dependent
     # LKEY/SKEY/KEYADDR macros, the compact MODSCHED_RC4_PROLOGUE and
     # MODSCHED_RC4_LOOP macros, and the RC4 entry code (argument checks,
     # state-table prefetch, output alignment and the loop-count computation
     # for the unrolled loop).
     #
     # The heredoc is interpolating: $threshold, $unroll_count and $phases are
     # substituted by Perl, while \\ , \" and \@ are escapes that come out as a
     # single backslash, a double quote and an at-sign in the generated text.
     #
     # NOTE(review): the 0xaaaaaaaaaaaaaaab multiply followed by "shr.u ..., 4",
     # "_NROTATE 24" and the ".rotr" sizes are literal numbers here rather than
     # being derived from $unroll_count/$phases and the $N* constants -- confirm
     # they are kept in sync before changing those Perl values.
 314 $code=<<___;
 315 .ident \"rc4-ia64.s, version 3.0\"
 316 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
 317
 318 #define LCSave          r8
 319 #define PRSave          r9
 320
 321 /* Inputs become invalid once rotation begins!  */
 322
 323 #define StateTable      in0
 324 #define DataLen         in1
 325 #define InputBuffer     in2
 326 #define OutputBuffer    in3
 327
 328 #define KTable          r14
 329 #define J               r15
 330 #define InPtr           r16
 331 #define OutPtr          r17
 332 #define InPrefetch      r18
 333 #define OutPrefetch     r19
 334 #define One             r20
 335 #define LoopCount       r21
 336 #define Remainder       r22
 337 #define IFinal          r23
 338 #define EndPtr          r24
 339
 340 #define tmp0            r25
 341 #define tmp1            r26
 342
 343 #define pBypass         p6
 344 #define pDone           p7
 345 #define pSmall          p8
 346 #define pAligned        p9
 347 #define pUnaligned      p10
 348
 349 #define pComputeI       pPhase[0]
 350 #define pComputeJ       pPhase[1]
 351 #define pComputeT       pPhase[2]
 352 #define pOutput         pPhase[3]
 353
 354 #define RetVal          r8
 355 #define L_OK            p7
 356 #define L_NOK           p8
 357
 358 #define _NINPUTS        4
 359 #define _NOUTPUT        0
 360
 361 #define _NROTATE        24
 362 #define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)
 363
 364 #ifndef SZ
 365 # define SZ     4       // this must be set to sizeof(RC4_INT)
 366 #endif
 367
 368 #if SZ == 1
 369 # define LKEY                   ld1
 370 # define SKEY                   st1
 371 # define KEYADDR(dst, i)        add dst = i, KTable
 372 #elif SZ == 2
 373 # define LKEY                   ld2
 374 # define SKEY                   st2
 375 # define KEYADDR(dst, i)        shladd dst = i, 1, KTable
 376 #elif SZ == 4
 377 # define LKEY                   ld4
 378 # define SKEY                   st4
 379 # define KEYADDR(dst, i)        shladd dst = i, 2, KTable
 380 #else
 381 # define LKEY                   ld8
 382 # define SKEY                   st8
 383 # define KEYADDR(dst, i)        shladd dst = i, 3, KTable
 384 #endif
 385
 386 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 387 # define ADDP   addp4
 388 #else
 389 # define ADDP   add
 390 #endif
 391
 392 /* Define a macro for the bit number of the n-th byte: */
 393
 394 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
 395 # define RC4_BIG_ENDIAN
 396 # define BYTE_POS(n)    (56 - (8 * (n)))
 397 #else
 398 # define BYTE_POS(n)    (8 * (n))
 399 #endif
 400
 401 /*
 402    We must perform the first phase of the pipeline explicitly since
 403    we will always load from the stable the first time. The br.cexit
 404    will never be taken since regardless of the number of bytes because
 405    the epilogue count is 4.
 406 */
 407 /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
 408    assembler failed on original macro with syntax error. <appro> */
 409 #define MODSCHED_RC4_PROLOGUE                                              \\
 410         {                                                                  \\
 411                                 ld1             Data[0] = [InPtr], 1;      \\
 412                                 add             IFinal = 1, I[1];          \\
 413                                 KEYADDR(IPr[0], I[1]);                     \\
 414         } ;;                                                               \\
 415         {                                                                  \\
 416                                 LKEY            SI[0] = [IPr[0]];          \\
 417                                 mov             pr.rot = 0x10000;          \\
 418                                 mov             ar.ec = 4;                 \\
 419         } ;;                                                               \\
 420         {                                                                  \\
 421                                 add             J = J, SI[0];              \\
 422                                 zxt1            I[0] = IFinal;             \\
 423                                 br.cexit.spnt.few .+16; /* never taken */  \\
 424         } ;;
 425 #define MODSCHED_RC4_LOOP(label)                                           \\
 426 label:                                                                     \\
 427         {       .mmi;                                                      \\
 428                 (pComputeI)     ld1             Data[0] = [InPtr], 1;      \\
 429                 (pComputeI)     add             IFinal = 1, I[1];          \\
 430                 (pComputeJ)     zxt1            J = J;                     \\
 431         }{      .mmi;                                                      \\
 432                 (pOutput)       LKEY            T[1] = [T[1]];             \\
 433                 (pComputeT)     add             T[0] = SI[2], SJ[1];       \\
 434                 (pComputeI)     KEYADDR(IPr[0], I[1]);                     \\
 435         } ;;                                                               \\
 436         {       .mmi;                                                      \\
 437                 (pComputeT)     SKEY            [IPr[2]] = SJ[1];          \\
 438                 (pComputeT)     SKEY            [JP[1]] = SI[2];           \\
 439                 (pComputeT)     zxt1            T[0] = T[0];               \\
 440         }{      .mmi;                                                      \\
 441                 (pComputeI)     LKEY            SI[0] = [IPr[0]];          \\
 442                 (pComputeJ)     KEYADDR(JP[0], J);                         \\
 443                 (pComputeI)     cmp.eq.unc      pBypass, p0 = I[1], J;     \\
 444         } ;;                                                               \\
 445         {       .mmi;                                                      \\
 446                 (pComputeJ)     LKEY            SJ[0] = [JP[0]];           \\
 447                 (pOutput)       xor             Data[3] = Data[3], T[1];   \\
 448                                 nop             0x0;                       \\
 449         }{      .mmi;                                                      \\
 450                 (pComputeT)     KEYADDR(T[0], T[0]);                       \\
 451                 (pBypass)       mov             SI[0] = SI[1];             \\
 452                 (pComputeI)     zxt1            I[0] = IFinal;             \\
 453         } ;;                                                               \\
 454         {       .mmb;                                                      \\
 455                 (pOutput)       st1             [OutPtr] = Data[3], 1;     \\
 456                 (pComputeI)     add             J = J, SI[0];              \\
 457                                 br.ctop.sptk.few label;                    \\
 458         } ;;
 459
 460         .text
 461
 462         .align  32
 463
 464         .type   RC4, \@function
 465         .global RC4
 466
 467         .proc   RC4
 468         .prologue
 469
 470 RC4:
 471         {
 472                 .mmi
 473                 alloc   r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
 474
 475                 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
 476                       OutWord[2]
 477                 .rotp pPhase[4]
 478
 479                 ADDP            InPrefetch = 0, InputBuffer
 480                 ADDP            KTable = 0, StateTable
 481         }
 482         {
 483                 .mmi
 484                 ADDP            InPtr = 0, InputBuffer
 485                 ADDP            OutPtr = 0, OutputBuffer
 486                 mov             RetVal = r0
 487         }
 488         ;;
 489         {
 490                 .mmi
 491                 lfetch.nt1      [InPrefetch], 0x80
 492                 ADDP            OutPrefetch = 0, OutputBuffer
 493         }
 494         {               // Return 0 if the input length is nonsensical
 495                 .mib
 496                 ADDP            StateTable = 0, StateTable
 497                 cmp.ge.unc      L_NOK, L_OK = r0, DataLen
 498         (L_NOK) br.ret.sptk.few rp
 499         }
 500         ;;
 501         {
 502                 .mib
 503                 cmp.eq.or       L_NOK, L_OK = r0, InPtr
 504                 cmp.eq.or       L_NOK, L_OK = r0, OutPtr
 505                 nop             0x0
 506         }
 507         {
 508                 .mib
 509                 cmp.eq.or       L_NOK, L_OK = r0, StateTable
 510                 nop             0x0
 511         (L_NOK) br.ret.sptk.few rp
 512         }
 513         ;;
 514                 LKEY            I[1] = [KTable], SZ
 515 /* Prefetch the state-table. It contains 256 elements of size SZ */
 516
 517 #if SZ == 1
 518                 ADDP            tmp0 = 1*128, StateTable
 519 #elif SZ == 2
 520                 ADDP            tmp0 = 3*128, StateTable
 521                 ADDP            tmp1 = 2*128, StateTable
 522 #elif SZ == 4
 523                 ADDP            tmp0 = 7*128, StateTable
 524                 ADDP            tmp1 = 6*128, StateTable
 525 #elif SZ == 8
 526                 ADDP            tmp0 = 15*128, StateTable
 527                 ADDP            tmp1 = 14*128, StateTable
 528 #endif
 529                 ;;
 530 #if SZ >= 8
 531                 lfetch.fault.nt1                [tmp0], -256    // 15
 532                 lfetch.fault.nt1                [tmp1], -256;;
 533                 lfetch.fault.nt1                [tmp0], -256    // 13
 534                 lfetch.fault.nt1                [tmp1], -256;;
 535                 lfetch.fault.nt1                [tmp0], -256    // 11
 536                 lfetch.fault.nt1                [tmp1], -256;;
 537                 lfetch.fault.nt1                [tmp0], -256    //  9
 538                 lfetch.fault.nt1                [tmp1], -256;;
 539 #endif
 540 #if SZ >= 4
 541                 lfetch.fault.nt1                [tmp0], -256    //  7
 542                 lfetch.fault.nt1                [tmp1], -256;;
 543                 lfetch.fault.nt1                [tmp0], -256    //  5
 544                 lfetch.fault.nt1                [tmp1], -256;;
 545 #endif
 546 #if SZ >= 2
 547                 lfetch.fault.nt1                [tmp0], -256    //  3
 548                 lfetch.fault.nt1                [tmp1], -256;;
 549 #endif
 550         {
 551                 .mii
 552                 lfetch.fault.nt1                [tmp0]          //  1
 553                 add             I[1]=1,I[1];;
 554                 zxt1            I[1]=I[1]
 555         }
 556         {
 557                 .mmi
 558                 lfetch.nt1      [InPrefetch], 0x80
 559                 lfetch.excl.nt1 [OutPrefetch], 0x80
 560                 .save           pr, PRSave
 561                 mov             PRSave = pr
 562         } ;;
 563         {
 564                 .mmi
 565                 lfetch.excl.nt1 [OutPrefetch], 0x80
 566                 LKEY            J = [KTable], SZ
 567                 ADDP            EndPtr = DataLen, InPtr
 568         }  ;;
 569         {
 570                 .mmi
 571                 ADDP            EndPtr = -1, EndPtr     // Make it point to
 572                                                         // last data byte.
 573                 mov             One = 1
 574                 .save           ar.lc, LCSave
 575                 mov             LCSave = ar.lc
 576                 .body
 577         } ;;
 578         {
 579                 .mmb
 580                 sub             Remainder = 0, OutPtr
 581                 cmp.gtu         pSmall, p0 = $threshold, DataLen
 582 (pSmall)        br.cond.dpnt    .rc4Remainder           // Data too small for
 583                                                         // big loop.
 584         } ;;
 585         {
 586                 .mmi
 587                 and             Remainder = 0x7, Remainder
 588                 ;;
 589                 cmp.eq          pAligned, pUnaligned = Remainder, r0
 590                 nop             0x0
 591         } ;;
 592         {
 593                 .mmb
 594 .pred.rel       "mutex",pUnaligned,pAligned
 595 (pUnaligned)    add             Remainder = -1, Remainder
 596 (pAligned)      sub             Remainder = EndPtr, InPtr
 597 (pAligned)      br.cond.dptk.many .rc4Aligned
 598         } ;;
 599         {
 600                 .mmi
 601                 nop             0x0
 602                 nop             0x0
 603                 mov.i           ar.lc = Remainder
 604         }
 605
 606 /* Do the initial few bytes via the compact, modulo-scheduled loop
 607    until the output pointer is 8-byte-aligned.  */
 608
 609                 MODSCHED_RC4_PROLOGUE
 610                 MODSCHED_RC4_LOOP(.RC4AlignLoop)
 611
 612         {
 613                 .mib
 614                 sub             Remainder = EndPtr, InPtr
 615                 zxt1            IFinal = IFinal
 616                 clrrrb                          // Clear CFM.rrb.pr so
 617                 ;;                              // next "mov pr.rot = N"
 618                                                 // does the right thing.
 619         }
 620         {
 621                 .mmi
 622                 mov             I[1] = IFinal
 623                 nop             0x0
 624                 nop             0x0
 625         } ;;
 626
 627
 628 .rc4Aligned:
 629
 630 /*
 631    Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
 632  */
 633
 634         {
 635                 .mlx
 636                 add     LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
 637                 movl            Remainder = 0xaaaaaaaaaaaaaaab
 638         } ;;
 639         {
 640                 .mmi
 641                 setf.sig        f6 = LoopCount          // M2, M3       6 cyc
 642                 setf.sig        f7 = Remainder          // M2, M3       6 cyc
 643                 nop             0x0
 644         } ;;
 645         {
 646                 .mfb
 647                 nop             0x0
 648                 xmpy.hu         f6 = f6, f7
 649                 nop             0x0
 650         } ;;
 651         {
 652                 .mmi
 653                 getf.sig        LoopCount = f6;;        // M2           5 cyc
 654                 nop             0x0
 655                 shr.u           LoopCount = LoopCount, 4
 656         } ;;
 657         {
 658                 .mmi
 659                 nop             0x0
 660                 nop             0x0
 661                 mov.i           ar.lc = LoopCount
 662         } ;;
 663
 664 /* Now comes the unrolled loop: */
 665
 666 .rc4Prologue:
 667 ___
 668
 669 $iteration = 0;
     # $iteration keeps counting across the prologue, body and epilogue below;
     # every emit_body() call receives the next value.
 670
 671 # Generate the prologue:
     # The stage mask grows by one bit per call (1, 3, 7, 0xf), so each slot
     # starts one more pipeline stage.  After the loop $predicates holds one
     # further shifted value whose low four bits are still all set.
 672 $predicates = 1;
 673 for ($i = 0; $i < $phases; ++$i) {
 674     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 675     $predicates = ($predicates << 1) | 1;
 676 }
 677
 678 $code.=<<___;
 679 .rc4Loop:
 680 ___
 681
 682 # Generate the body:
     # $unroll_count * $phases slots with every stage enabled; emit_body() ends
     # the last of them with the branch back to .rc4Loop.
 683 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
 684     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 685 }
 686
 687 $code.=<<___;
 688 .rc4Epilogue:
 689 ___
 690
 691 # Generate the epilogue:
     # Shifting the mask left before each call clears the lowest stage bit, so
     # the pipeline drains one stage per slot.  In the last call the low four
     # bits are all zero and emit_body() emits only its final 4-byte store.
 692 for ($i = 0; $i < $phases; ++$i) {
 693     $predicates <<= 1;
 694     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 695 }
 696
     # Fixed tail of the generated assembly, placed after the unrolled loop's
     # epilogue: it copies I[1] into IFinal, runs the compact modulo-scheduled
     # loop over whatever bytes remain (.rc4Remainder, also the entry point for
     # inputs shorter than $threshold), and finishes at .rc4Complete.  There J
     # and then IFinal (decremented and truncated to 8 bits) are stored back
     # through KTable, ar.lc and the predicates are restored, and 1 is returned.
 697 $code.=<<___;
 698         {
 699                 .mmi
 700                 lfetch.nt1      [EndPtr]        // fetch line with last byte
 701                 mov             IFinal = I[1]
 702                 nop             0x0
 703         }
 704
 705 .rc4Remainder:
 706         {
 707                 .mmi
 708                 sub             Remainder = EndPtr, InPtr       // Calculate
 709                                                                 // # of bytes
 710                                                                 // left - 1
 711                 nop             0x0
 712                 nop             0x0
 713         } ;;
 714         {
 715                 .mib
 716                 cmp.eq          pDone, p0 = -1, Remainder // done already?
 717                 mov.i           ar.lc = Remainder
 718 (pDone)         br.cond.dptk.few .rc4Complete
 719         }
 720
 721 /* Do the remaining bytes via the compact, modulo-scheduled loop */
 722
 723                 MODSCHED_RC4_PROLOGUE
 724                 MODSCHED_RC4_LOOP(.RC4RestLoop)
 725
 726 .rc4Complete:
 727         {
 728                 .mmi
 729                 add             KTable = -SZ, KTable
 730                 add             IFinal = -1, IFinal
 731                 mov             ar.lc = LCSave
 732         } ;;
 733         {
 734                 .mii
 735                 SKEY            [KTable] = J,-SZ
 736                 zxt1            IFinal = IFinal
 737                 mov             pr = PRSave, 0x1FFFF
 738         } ;;
 739         {
 740                 .mib
 741                 SKEY            [KTable] = IFinal
 742                 add             RetVal = 1, r0
 743                 br.ret.sptk.few rp
 744         } ;;
 745 ___
 746
 747 # Last but not least, emit the code for the bypass-code of the unrolled loop:
     # $bypass was filled by emit_body() with one .rc4BypassN stub per slot that
     # performs the I == J check; appending it here places those stubs after
     # the function's return and before the closing .endp directive.
 748
 749 $code.=$bypass;
 750
 751 $code.=<<___;
 752         .endp RC4
 753 ___
 754
     # The complete assembly source is written to standard output.
 755 print $code;