crypto/rc4/asm/rc4-ia64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by David Mosberger <David.Mosberger@acm.org> based on the
   5 # Itanium optimized Crypto code which was released by HP Labs at
   6 # http://www.hpl.hp.com/research/linux/crypto/.
   7 #
   8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
   9 #
  10 # Permission is hereby granted, free of charge, to any person obtaining
  11 # a copy of this software and associated documentation files (the
  12 # "Software"), to deal in the Software without restriction, including
  13 # without limitation the rights to use, copy, modify, merge, publish,
  14 # distribute, sublicense, and/or sell copies of the Software, and to
  15 # permit persons to whom the Software is furnished to do so, subject to
  16 # the following conditions:
  17 #
  18 # The above copyright notice and this permission notice shall be
  19 # included in all copies or substantial portions of the Software.
  20
  21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  28
  29
  30
  31 # This is a little helper program which generates a software-pipelined
  32 # for RC4 encryption.  The basic algorithm looks like this:
  33 #
  34 #   for (counter = 0; counter < len; ++counter)
  35 #     {
  36 #       in = inp[counter];
  37 #       SI = S[I];
  38 #       J = (SI + J) & 0xff;
  39 #       SJ = S[J];
  40 #       T = (SI + SJ) & 0xff;
  41 #       S[I] = SJ, S[J] = SI;
  42 #       ST = S[T];
  43 #       outp[counter] = in ^ ST;
  44 #       I = (I + 1) & 0xff;
  45 #     }
  46 #
  47 # Pipelining this loop isn't easy, because the stores to the S[] array
  48 # need to be observed in the right order.  The loop generated by the
  49 # code below has the following pipeline diagram:
  50 #
  51 #      cycle
  52 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
  53 # iter
  54 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  55 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  56 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  57 #
  58 #   where:
  59 #       LDI = load of S[I]
  60 #       LDJ = load of S[J]
  61 #       SWP = swap of S[I] and S[J]
  62 #       LDT = load of S[T]
  63 #
  64 # Note that in the above diagram, the major trouble-spot is that LDI
  65 # of the 2nd iteration is performed BEFORE the SWP of the first
  66 # iteration.  Fortunately, this is easy to detect (I of the 1st
  67 # iteration will be equal to J of the 2nd iteration) and when this
  68 # happens, we simply forward the proper value from the 1st iteration
  69 # to the 2nd one.  The proper value in this case is simply the value
  70 # of S[I] from the first iteration (thanks to the fact that SWP
  71 # simply swaps the contents of S[I] and S[J]).
  72 #
  73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
  74 # iteration issues at the same time as the LDI of the 3rd iteration.
  75 # However, thanks to IA-64 execution semantics, this can be taken
  76 # care of simply by placing LDI later in the instruction-group than
  77 # SWP.  IA-64 CPUs will automatically forward the value if they
  78 # detect that the SWP and LDI are accessing the same memory-location.
  79
  80 # The core-loop that can be pipelined then looks like this (annotated
  81 # with McKinley/Madison issue port & latency numbers, assuming L1
  82 # cache hits for the most part):
  83
  84 # operation:        instruction:                    issue-ports:  latency
  85 # ------------------  -----------------------------   ------------- -------
  86
  87 # Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
  88 #                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
  89 # I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
  90 #                     ;;
  91 # SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
  92 #                     ;;
  93 #                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
  94 # J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
  95 #                     (pBypass) br.cond.spnt Bypass
  96 #                     ;;
  97 # ---------------------------------------------------------------------------------------
  98 # J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
  99 #                     ;;
 100 #                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
 101 #                     ;;
 102 # SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
 103 #                     ;;
 104 # ---------------------------------------------------------------------------------------
 105 # T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
 106 #                     ;;
 107 # T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
 108 # S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
 109 # S[J] = SI           st8 [Jptr] = SI                 M2-M3
 110 #                     ;;
 111 #                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
 112 #                     ;;
 113 # ---------------------------------------------------------------------------------------
 114 # T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
 115 #                     ;;
 116 # data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
 117 #                     ;;
 118 # *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
 119 #                     ;;
 120 # ---------------------------------------------------------------------------------------
 121
 122 # There are several points worth making here:
 123
 124 #   - Note that due to the bypass/forwarding-path, the first two
 125 #     phases of the loop are strangly mingled together.  In
 126 #     particular, note that the first stage of the pipeline is
 127 #     using the value of "J", as calculated by the second stage.
 128 #   - Each bundle-pair will have exactly 6 instructions.
 129 #   - Pipelined, the loop can execute in 3 cycles/iteration and
 130 #     4 stages.  However, McKinley/Madison can issue "st1" to
 131 #     the same bank at a rate of at most one per 4 cycles.  Thus,
 132 #     instead of storing each byte, we accumulate them in a word
 133 #     and then write them back at once with a single "st8" (this
 134 #     implies that the setup code needs to ensure that the output
 135 #     buffer is properly aligned, if need be, by encoding the
 136 #     first few bytes separately).
 137 #   - There is no space for a "br.ctop" instruction.  For this
 138 #     reason we can't use module-loop support in IA-64 and have
 139 #     to do a traditional, purely software-pipelined loop.
 140 #   - We can't replace any of the remaining "add/zxt1" pairs with
 141 #     "padd1" because the latency for that instruction is too high
 142 #     and would push the loop to the point where more bypasses
 143 #     would be needed, which we don't have space for.
 144 #   - The above loop runs at around 3.26 cycles/byte, or roughly
 145 #     440 MByte/sec on a 1.5GHz Madison.  This is well below the
 146 #     system bus bandwidth and hence with judicious use of
 147 #     "lfetch" this loop can run at (almost) peak speed even when
 148 #     the input and output data reside in memory.  The
 149 #     max. latency that can be tolerated is (PREFETCH_DISTANCE *
 150 #     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
 151 #     least) 1-ahead prefetching of 128 byte cache-lines.  Note
 152 #     that we do NOT prefetch into L1, since that would only
 153 #     interfere with the S[] table values stored there.  This is
 154 #     acceptable because there is a 10 cycle latency between
 155 #     load and first use of the input data.
 156 #   - We use a branch to out-of-line bypass-code of cycle-pressure:
 157 #     we calculate the next J, check for the need to activate the
 158 #     bypass path, and activate the bypass path ALL IN THE SAME
 159 #     CYCLE.  If we didn't have these constraints, we could do
 160 #     the bypass with a simple conditional move instruction.
 161 #     Fortunately, the bypass paths get activated relatively
 162 #     infrequently, so the extra branches don't cost all that much
 163 #     (about 0.04 cycles/byte, measured on a 16396 byte file with
 164 #     random input data).
 165 #
 166
 167 $phases = 4;            # number of stages/phases in the pipelined-loop
 168 $unroll_count = 6;      # number of times we unrolled it
 169 $pComI = (1 << 0);
 170 $pComJ = (1 << 1);
 171 $pComT = (1 << 2);
 172 $pOut  = (1 << 3);
 173
 174 $NData = 4;
 175 $NIP = 3;
 176 $NJP = 2;
 177 $NI = 2;
 178 $NSI = 3;
 179 $NSJ = 2;
 180 $NT = 2;
 181 $NOutWord = 2;
 182
 183 #
 184 # $threshold is the minimum length before we attempt to use the
 185 # big software-pipelined loop.  It MUST be greater-or-equal
 186 # to:
 187 #               PHASES * (UNROLL_COUNT + 1) + 7
 188 #
 189 # The "+ 7" comes from the fact we may have to encode up to
 190 #   7 bytes separately before the output pointer is aligned.
 191 #
 192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
 193
 194 sub I {
 195     local *code = shift;
 196     local $format = shift;
 197     local $a0 = shift;
 198     local $a1 = shift;
 199     local $a2 = shift;
 200     local $a3 = shift;
 201     $code .= sprintf ("\t\t".$format."\n", $a0, $a1, $a2, $a3);
 202 }
 203
 204 sub P {
 205     local *code = shift;
 206     local $format = shift;
 207     local $a0 = shift;
 208     local $a1 = shift;
 209     local $a2 = shift;
 210     local $a3 = shift;
 211     $code .= sprintf ($format."\n", $a0, $a1, $a2, $a3);
 212 }
 213
 214 sub STOP {
 215     local *code = shift;
 216     $code .=<<___;
 217                 ;;
 218 ___
 219 }
 220
 221 sub emit_body {
 222     local *c = shift;
 223     local *bypass = shift;
 224     local ($iteration, $p) = @_;
 225
 226     local $i0 = $iteration;
 227     local $i1 = $iteration - 1;
 228     local $i2 = $iteration - 2;
 229     local $i3 = $iteration - 3;
 230     local $iw0 = ($iteration - 3) / 8;
 231     local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
 232     local $byte_num = ($iteration - 3) % 8;
 233     local $label = $iteration + 1;
 234     local $pAny = ($p & 0xf) == 0xf;
 235     local $pByp = (($p & $pComI) && ($iteration > 0));
 236
 237     $c.=<<___;
 238 //////////////////////////////////////////////////
 239 ___
 240
 241     if (($p & 0xf) == 0) {
 242         &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
 243         return;
 244     }
 245
 246     # Cycle 0
 247     &I(\$c, "{ .mmi")                                         if ($pAny);
 248     &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
 249     &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
 250     &I(\$c, "zxt1   J = J")                                   if ($p & $pComJ);
 251     &I(\$c, "}")                                              if ($pAny);
 252     &I(\$c, "{ .mmi")                                         if ($pAny);
 253     &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
 254     &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
 255        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                     if ($p & $pComT);
 256     &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
 257     &I(\$c, "}")                                              if ($pAny);
 258     &STOP(\$c);
 259
 260     # Cycle 1
 261     &I(\$c, "{ .mmi")                                         if ($pAny);
 262     &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
 263     &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
 264     &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
 265     &I(\$c, "}")                                              if ($pAny);
 266     &I(\$c, "{ .mmi")                                         if ($pAny);
 267     &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
 268     &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                 if ($p & $pComJ);
 269     &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
 270        $i3 % $NData, $i3 % $NData, $i1 % $NT)                 if ($p & $pOut);
 271     &I(\$c, "}")                                              if ($pAny);
 272     &STOP(\$c);
 273
 274     # Cycle 2
 275     &I(\$c, "{ .mmi")                                         if ($pAny);
 276     &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
 277     &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)       if ($pByp);
 278     &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
 279        $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
 280     &I(\$c, "}")                                              if ($pAny);
 281     &I(\$c, "{ .mmb")                                         if ($pAny);
 282     &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)               if ($p & $pComI);
 283     &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
 284     &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
 285     &I(\$c, "}") if ($pAny);
 286     &STOP(\$c);
 287
 288     &P(\$c, ".rc4Resume%u:", $label)                          if ($pByp);
 289     if ($byte_num == 0 && $iteration >= $phases) {
 290         &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
 291            $iw1 % $NOutWord)                                  if ($p & $pOut);
 292         if ($iteration == (1 + $unroll_count) * $phases - 1) {
 293             if ($unroll_count == 6) {
 294                 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
 295                    $iw1 % $NOutWord, $iw0 % $NOutWord);
 296             }
 297             &I(\$c, "lfetch.nt1 [InPrefetch], %u",
 298                $unroll_count * $phases);
 299             &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
 300                $unroll_count * $phases);
 301             &I(\$c, "br.cloop.sptk.few .rc4Loop");
 302         }
 303     }
 304
 305     if ($pByp) {
 306         &P(\$bypass, ".rc4Bypass%u:", $label);
 307         &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
 308         &I(\$bypass, "nop 0");
 309         &I(\$bypass, "nop 0");
 310         &I(\$bypass, ";;");
 311         &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
 312         &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
 313         &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
 314     }
 315 }
 316
 317 $code=<<___;
 318 .ident \"rc4-ia64.s, version 3.0\"
 319 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
 320
 321 #define LCSave          r8
 322 #define PRSave          r9
 323
 324 /* Inputs become invalid once rotation begins!  */
 325
 326 #define StateTable      in0
 327 #define DataLen         in1
 328 #define InputBuffer     in2
 329 #define OutputBuffer    in3
 330
 331 #define KTable          r14
 332 #define J               r15
 333 #define InPtr           r16
 334 #define OutPtr          r17
 335 #define InPrefetch      r18
 336 #define OutPrefetch     r19
 337 #define One             r20
 338 #define LoopCount       r21
 339 #define Remainder       r22
 340 #define IFinal          r23
 341 #define EndPtr          r24
 342
 343 #define tmp0            r25
 344 #define tmp1            r26
 345
 346 #define pBypass         p6
 347 #define pDone           p7
 348 #define pSmall          p8
 349 #define pAligned        p9
 350 #define pUnaligned      p10
 351
 352 #define pComputeI       pPhase[0]
 353 #define pComputeJ       pPhase[1]
 354 #define pComputeT       pPhase[2]
 355 #define pOutput         pPhase[3]
 356
 357 #define RetVal          r8
 358 #define L_OK            p7
 359 #define L_NOK           p8
 360
 361 #define _NINPUTS        4
 362 #define _NOUTPUT        0
 363
 364 #define _NROTATE        24
 365 #define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)
 366
 367 #ifndef SZ
 368 # define SZ     4       // this must be set to sizeof(RC4_INT)
 369 #endif
 370
 371 #if SZ == 1
 372 # define LKEY                   ld1
 373 # define SKEY                   st1
 374 # define KEYADDR(dst, i)        add dst = i, KTable
 375 #elif SZ == 2
 376 # define LKEY                   ld2
 377 # define SKEY                   st2
 378 # define KEYADDR(dst, i)        shladd dst = i, 1, KTable
 379 #elif SZ == 4
 380 # define LKEY                   ld4
 381 # define SKEY                   st4
 382 # define KEYADDR(dst, i)        shladd dst = i, 2, KTable
 383 #else
 384 # define LKEY                   ld8
 385 # define SKEY                   st8
 386 # define KEYADDR(dst, i)        shladd dst = i, 3, KTable
 387 #endif
 388
 389 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 390 # define ADDP   addp4
 391 #else
 392 # define ADDP   add
 393 #endif
 394
 395 /* Define a macro for the bit number of the n-th byte: */
 396
 397 #ifdef L_ENDIAN
 398 # define BYTE_POS(n)    (8 * (n))
 399 #else
 400 # define BYTE_POS(n)    (56 - (8 * (n)))
 401 #endif
 402
 403 /*
 404    We must perform the first phase of the pipeline explicitly since
 405    we will always load from the stable the first time. The br.cexit
 406    will never be taken since regardless of the number of bytes because
 407    the epilogue count is 4.
 408 */
 409
 410 #define MODSCHED_RC4(label)                                                \\
 411         {                                                                  \\
 412                                 ld1             Data[0] = [InPtr], 1;      \\
 413                                 add             IFinal = 1, I[1];          \\
 414                                 KEYADDR(IPr[0], I[1]);                     \\
 415         } ;;                                                               \\
 416         {                                                                  \\
 417                                 LKEY            SI[0] = [IPr[0]];          \\
 418                                 mov             pr.rot = 0x10000;          \\
 419                                 mov             ar.ec = 4;                 \\
 420         } ;;                                                               \\
 421         {                                                                  \\
 422                                 add             J = J, SI[0];              \\
 423                                 zxt1            I[0] = IFinal;             \\
 424                                 br.cexit.spnt.few label; /* never taken */ \\
 425         } ;;                                                               \\
 426 label:                                                                     \\
 427         {       .mmi;                                                      \\
 428                 (pComputeI)     ld1             Data[0] = [InPtr], 1;      \\
 429                 (pComputeI)     add             IFinal = 1, I[1];          \\
 430                 (pComputeJ)     zxt1            J = J;                     \\
 431         }{      .mmi;                                                      \\
 432                 (pOutput)       LKEY            T[1] = [T[1]];             \\
 433                 (pComputeT)     add             T[0] = SI[2], SJ[1];       \\
 434                 (pComputeI)     KEYADDR(IPr[0], I[1]);                     \\
 435         } ;;                                                               \\
 436         {       .mmi;                                                      \\
 437                 (pComputeT)     SKEY            [IPr[2]] = SJ[1];          \\
 438                 (pComputeT)     SKEY            [JP[1]] = SI[2];           \\
 439                 (pComputeT)     zxt1            T[0] = T[0];               \\
 440         }{      .mmi;                                                      \\
 441                 (pComputeI)     LKEY            SI[0] = [IPr[0]];          \\
 442                 (pComputeJ)     KEYADDR(JP[0], J);                         \\
 443                 (pComputeI)     cmp.eq.unc      pBypass, p0 = I[1], J;     \\
 444         } ;;                                                               \\
 445         {       .mmi;                                                      \\
 446                 (pComputeJ)     LKEY            SJ[0] = [JP[0]];           \\
 447                 (pOutput)       xor             Data[3] = Data[3], T[1];   \\
 448                                 nop             0x0;                       \\
 449         }{      .mmi;                                                      \\
 450                 (pComputeT)     KEYADDR(T[0], T[0]);                       \\
 451                 (pBypass)       mov             SI[0] = SI[1];             \\
 452                 (pComputeI)     zxt1            I[0] = IFinal;             \\
 453         } ;;                                                               \\
 454         {       .mmb;                                                      \\
 455                 (pOutput)       st1             [OutPtr] = Data[3], 1;     \\
 456                 (pComputeI)     add             J = J, SI[0];              \\
 457                                 br.ctop.sptk.few label;                    \\
 458         } ;;
 459
 460         .text
 461
 462         .align  32
 463
 464         .type   RC4, \@function
 465         .global RC4
 466
 467         .proc   RC4
 468         .prologue
 469
 470 RC4:
 471         {
 472                 .mmi
 473                 alloc   r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
 474
 475                 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
 476                       OutWord[2]
 477                 .rotp pPhase[4]
 478
 479 #ifdef _LP64
 480                 add             InPrefetch = 0, InputBuffer
 481                 nop             0x0
 482         }
 483 #else
 484                 ADDP            InputBuffer = 0, InputBuffer
 485                 ADDP            StateTable = 0, StateTable
 486         }
 487         ;;
 488         {
 489                 ADDP            InPrefetch = 0, InputBuffer
 490                 ADDP            OutputBuffer = 0, OutputBuffer
 491                 nop             0x0
 492         }
 493 #endif
 494         ;;
 495         {
 496                 .mmi
 497                 lfetch.nt1      [InPrefetch], 0x80
 498                 LKEY            I[1] = [StateTable], SZ
 499                 mov             OutPrefetch = OutputBuffer
 500         } ;;
 501         {
 502                 .mii
 503                 nop             0x0
 504                 nop             0x0
 505                 mov             RetVal = r0
 506         }
 507         {               // Return 0 if the input length is nonsensical
 508                 .mib
 509                 nop             0x0
 510                 cmp.ge          L_NOK, L_OK = r0, DataLen
 511         (L_NOK) br.ret.sptk.few rp
 512         }
 513         ;;
 514         {
 515                 .mib
 516                 nop             0x0
 517                 cmp.eq          L_NOK, L_OK = r0, InputBuffer
 518         (L_NOK) br.ret.sptk.few rp
 519         }
 520         ;;
 521         {
 522                 .mib
 523                 nop             0x0
 524                 cmp.eq          L_NOK, L_OK = r0, OutputBuffer
 525         (L_NOK) br.ret.sptk.few rp
 526         }
 527         ;;
 528         {
 529                 .mib
 530                 nop             0x0
 531                 cmp.eq          L_NOK, L_OK = r0, StateTable
 532         (L_NOK) br.ret.sptk.few rp
 533         }
 534
 535
 536 /* Prefetch the state-table. It contains 256 elements of size SZ */
 537
 538 #if SZ == 1
 539                 ADDP            tmp0 = 1*128, StateTable
 540 #elif SZ == 2
 541                 ADDP            tmp0 = 3*128, StateTable
 542                 ADDP            tmp1 = 2*128, StateTable
 543 #elif SZ == 4
 544                 ADDP            tmp0 = 7*128, StateTable
 545                 ADDP            tmp1 = 6*128, StateTable
 546 #elif SZ == 8
 547                 ADDP            tmp0 = 15*128, StateTable
 548                 ADDP            tmp1 = 14*128, StateTable
 549 #endif
 550                 ;;
 551 #if SZ >= 8
 552                 lfetch.fault.nt1                [tmp0], -256    // 15
 553                 lfetch.fault.nt1                [tmp1], -256;;
 554                 lfetch.fault.nt1                [tmp0], -256    // 13
 555                 lfetch.fault.nt1                [tmp1], -256;;
 556                 lfetch.fault.nt1                [tmp0], -256    // 11
 557                 lfetch.fault.nt1                [tmp1], -256;;
 558                 lfetch.fault.nt1                [tmp0], -256    //  9
 559                 lfetch.fault.nt1                [tmp1], -256;;
 560 #endif
 561 #if SZ >= 4
 562                 lfetch.fault.nt1                [tmp0], -256    //  7
 563                 lfetch.fault.nt1                [tmp1], -256;;
 564                 lfetch.fault.nt1                [tmp0], -256    //  5
 565                 lfetch.fault.nt1                [tmp1], -256;;
 566 #endif
 567 #if SZ >= 2
 568                 lfetch.fault.nt1                [tmp0], -256    //  3
 569                 lfetch.fault.nt1                [tmp1], -256;;
 570 #endif
 571                 lfetch.fault.nt1                [tmp0]          //  1
 572
 573         {
 574                 .mmi
 575                 lfetch.nt1      [InPrefetch], 0x80
 576                 lfetch.excl.nt1 [OutPrefetch], 0x80
 577                 .save           pr, PRSave
 578                 mov             PRSave = pr
 579         } ;;
 580         {
 581                 .mmi
 582                 lfetch.excl.nt1 [OutPrefetch], 0x80
 583                 LKEY            J = [StateTable], SZ
 584                 ADDP            EndPtr = DataLen, InputBuffer
 585         }  ;;
 586         {
 587                 .mmi
 588                 mov             InPtr = InputBuffer
 589                 mov             OutPtr = OutputBuffer
 590                 ADDP            EndPtr = -1, EndPtr     // Make it point to
 591                                                         // last data byte.
 592         } ;;
 593         {
 594                 .mii
 595                 mov             KTable = StateTable
 596                 mov             One = 1
 597                 .save           ar.lc, LCSave
 598                 mov             LCSave = ar.lc
 599                 .body
 600         } ;;
 601         {
 602                 .mmb
 603                 sub             Remainder = 0, OutPtr
 604                 cmp.gtu         pSmall, p0 = $threshold, DataLen
 605 (pSmall)        br.cond.dpnt    .rc4Remainder           // Data too small for
 606                                                         // big loop.
 607         } ;;
 608         {
 609                 .mmi
 610                 and             Remainder = 0x7, Remainder
 611                 ;;
 612                 cmp.eq          pAligned, pUnaligned = Remainder, r0
 613                 nop             0x0
 614         } ;;
 615         {
 616                 .mmb
 617 (pUnaligned)    add             Remainder = -1, Remainder
 618 (pAligned)      sub             Remainder = EndPtr, InPtr
 619 (pAligned)      br.cond.dptk.many .rc4Aligned
 620         } ;;
 621         {
 622                 .mmi
 623                 nop             0x0
 624                 nop             0x0
 625                 mov.i           ar.lc = Remainder
 626         }
 627
 628 /* Do the initial few bytes via the compact, modulo-scheduled loop
 629    until the output pointer is 8-byte-aligned.  */
 630
 631                 MODSCHED_RC4(.RC4AlignLoop)
 632
 633         {
 634                 .mib
 635                 sub             Remainder = EndPtr, InPtr
 636                 zxt1            IFinal = IFinal
 637                 clrrrb                          // Clear CFM.rrb.pr so
 638                 ;;                              // next "mov pr.rot = N"
 639                                                 // does the right thing.
 640         }
 641         {
 642                 .mmi
 643                 mov             I[1] = IFinal
 644                 nop             0x0
 645                 nop             0x0
 646         } ;;
 647
 648
 649 .rc4Aligned:
 650
 651 /*
 652    Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
 653  */
 654
 655         {
 656                 .mlx
 657                 add     LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
 658                 movl            Remainder = 0xaaaaaaaaaaaaaaab
 659         } ;;
 660         {
 661                 .mmi
 662                 setf.sig        f6 = LoopCount          // M2, M3       6 cyc
 663                 setf.sig        f7 = Remainder          // M2, M3       6 cyc
 664                 nop             0x0
 665         } ;;
 666         {
 667                 .mfb
 668                 nop             0x0
 669                 xmpy.hu         f6 = f6, f7
 670                 nop             0x0
 671         } ;;
 672         {
 673                 .mmi
 674                 getf.sig        LoopCount = f6          // M2           5 cyc
 675                 nop             0x0
 676                 nop             0x0
 677         } ;;
 678         {
 679                 .mmi
 680                 nop             0x0
 681                 nop             0x0
 682                 shr.u           LoopCount = LoopCount, 4
 683         } ;;
 684         {
 685                 .mmi
 686                 nop             0x0
 687                 nop             0x0
 688                 mov.i           ar.lc = LoopCount
 689         } ;;
 690
 691 /* Now comes the unrolled loop: */
 692
 693 .rc4Prologue:
 694 ___
 695
 696 $iteration = 0;
 697
 698 # Generate the prologue:
 699 $predicates = 1;
 700 for ($i = 0; $i < $phases; ++$i) {
 701     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 702     $predicates = ($predicates << 1) | 1;
 703 }
 704
 705 $code.=<<___;
 706 .rc4Loop:
 707 ___
 708
 709 # Generate the body:
 710 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
 711     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 712 }
 713
 714 $code.=<<___;
 715 .rc4Epilogue:
 716 ___
 717
 718 # Generate the epilogue:
 719 for ($i = 0; $i < $phases; ++$i) {
 720     $predicates <<= 1;
 721     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 722 }
 723
 724 $code.=<<___;
 725         {
 726                 .mmi
 727                 lfetch.nt1      [EndPtr]        // fetch line with last byte
 728                 mov             IFinal = I[1]
 729                 nop             0x0
 730         }
 731
 732 .rc4Remainder:
 733         {
 734                 .mmi
 735                 sub             Remainder = EndPtr, InPtr       // Calculate
 736                                                                 // # of bytes
 737                                                                 // left - 1
 738                 nop             0x0
 739                 nop             0x0
 740         } ;;
 741         {
 742                 .mib
 743                 cmp.eq          pDone, p0 = -1, Remainder // done already?
 744                 mov.i           ar.lc = Remainder
 745 (pDone)         br.cond.dptk.few .rc4Complete
 746         }
 747
 748 /* Do the remaining bytes via the compact, modulo-scheduled loop */
 749
 750                 MODSCHED_RC4(.RC4RestLoop)
 751
 752         {
 753                 .mmi
 754                 nop             0x0
 755                 nop             0x0
 756                 zxt1            IFinal = IFinal
 757         } ;;
 758
 759 .rc4Complete:
 760         {
 761                 .mmi
 762                 ADDP            KTable = -2*SZ, KTable ;;
 763                 SKEY            [KTable] = IFinal, SZ
 764                 mov             ar.lc = LCSave
 765         } ;;
 766         {
 767                 .mii
 768                 nop             0x0
 769                 nop             0x0
 770                 add             RetVal = 1, r0
 771         }
 772         {
 773                 .mib
 774                 SKEY            [KTable] = J
 775                 mov             pr = PRSave, 0x1FFFF
 776                 br.ret.sptk.few rp
 777         } ;;
 778 ___
 779
 780 # Last but not least, emit the code for the bypass-code of the unrolled loop:
 781
 782 $code.=$bypass;
 783
 784 $code.=<<___;
 785         .endp RC4
 786 ___
 787
 788 print $code;