crypto/rc4/asm/rc4-ia64.pl

   1 #!/usr/bin/env perl
   2 #
   3 # ====================================================================
   4 # Written by David Mosberger <David.Mosberger@acm.org> based on the
   5 # Itanium optimized Crypto code which was released by HP Labs at
   6 # http://www.hpl.hp.com/research/linux/crypto/.
   7 #
   8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
   9 #
  10 # Permission is hereby granted, free of charge, to any person obtaining
  11 # a copy of this software and associated documentation files (the
  12 # "Software"), to deal in the Software without restriction, including
  13 # without limitation the rights to use, copy, modify, merge, publish,
  14 # distribute, sublicense, and/or sell copies of the Software, and to
  15 # permit persons to whom the Software is furnished to do so, subject to
  16 # the following conditions:
  17 #
  18 # The above copyright notice and this permission notice shall be
  19 # included in all copies or substantial portions of the Software.
  20
  21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
  22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
  24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
  25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
  26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
  27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.  */
  28
  29
  30
  31 # This is a little helper program which generates a software-pipelined
  32 # loop for RC4 encryption.  The basic algorithm looks like this:
  33 #
  34 #   for (counter = 0; counter < len; ++counter)
  35 #     {
  36 #       in = inp[counter];
  37 #       SI = S[I];
  38 #       J = (SI + J) & 0xff;
  39 #       SJ = S[J];
  40 #       T = (SI + SJ) & 0xff;
  41 #       S[I] = SJ, S[J] = SI;
  42 #       ST = S[T];
  43 #       outp[counter] = in ^ ST;
  44 #       I = (I + 1) & 0xff;
  45 #     }
  46 #
  47 # Pipelining this loop isn't easy, because the stores to the S[] array
  48 # need to be observed in the right order.  The loop generated by the
  49 # code below has the following pipeline diagram:
  50 #
  51 #      cycle
  52 #     | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
  53 # iter
  54 #   1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  55 #   2:             xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  56 #   3:                         xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
  57 #
  58 #   where:
  59 #       LDI = load of S[I]
  60 #       LDJ = load of S[J]
  61 #       SWP = swap of S[I] and S[J]
  62 #       LDT = load of S[T]
  63 #
  64 # Note that in the above diagram, the major trouble-spot is that LDI
  65 # of the 2nd iteration is performed BEFORE the SWP of the first
  66 # iteration.  Fortunately, this is easy to detect (I of the 1st
  67 # iteration will be equal to J of the 2nd iteration) and when this
  68 # happens, we simply forward the proper value from the 1st iteration
  69 # to the 2nd one.  The proper value in this case is simply the value
  70 # of S[I] from the first iteration (thanks to the fact that SWP
  71 # simply swaps the contents of S[I] and S[J]).
  72 #
  73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
  74 # iteration issues at the same time as the LDI of the 3rd iteration.
  75 # However, thanks to IA-64 execution semantics, this can be taken
  76 # care of simply by placing LDI later in the instruction-group than
  77 # SWP.  IA-64 CPUs will automatically forward the value if they
  78 # detect that the SWP and LDI are accessing the same memory-location.
  79
  80 # The core-loop that can be pipelined then looks like this (annotated
  81 # with McKinley/Madison issue port & latency numbers, assuming L1
  82 # cache hits for the most part):
  83
  84 # operation:        instruction:                    issue-ports:  latency
  85 # ------------------  -----------------------------   ------------- -------
  86
  87 # Data = *inp++       ld1 data = [inp], 1             M0-M1         1 cyc     c0
  88 #                     shladd Iptr = I, KeyTable, 3    M0-M3, I0, I1 1 cyc
  89 # I = (I + 1) & 0xff  padd1 nextI = I, one            M0-M3, I0, I1 3 cyc
  90 #                     ;;
  91 # SI = S[I]           ld8 SI = [Iptr]                 M0-M1         1 cyc     c1 * after SWAP!
  92 #                     ;;
  93 #                     cmp.eq.unc pBypass = I, J                                  * after J is valid!
  94 # J = SI + J          add J = J, SI                   M0-M3, I0, I1 1 cyc     c2
  95 #                     (pBypass) br.cond.spnt Bypass
  96 #                     ;;
  97 # ---------------------------------------------------------------------------------------
  98 # J = J & 0xff        zxt1 J = J                      I0, I1, 1 cyc           c3
  99 #                     ;;
 100 #                     shladd Jptr = J, KeyTable, 3    M0-M3, I0, I1 1 cyc     c4
 101 #                     ;;
 102 # SJ = S[J]           ld8 SJ = [Jptr]                 M0-M1         1 cyc     c5
 103 #                     ;;
 104 # ---------------------------------------------------------------------------------------
 105 # T = (SI + SJ)       add T = SI, SJ                  M0-M3, I0, I1 1 cyc     c6
 106 #                     ;;
 107 # T = T & 0xff        zxt1 T = T                      I0, I1        1 cyc
 108 # S[I] = SJ           st8 [Iptr] = SJ                 M2-M3                   c7
 109 # S[J] = SI           st8 [Jptr] = SI                 M2-M3
 110 #                     ;;
 111 #                     shladd Tptr = T, KeyTable, 3    M0-M3, I0, I1 1 cyc     c8
 112 #                     ;;
 113 # ---------------------------------------------------------------------------------------
 114 # T = S[T]            ld8 T = [Tptr]                  M0-M1         1 cyc     c9
 115 #                     ;;
 116 # data ^= T           xor data = data, T              M0-M3, I0, I1 1 cyc     c10
 117 #                     ;;
 118 # *out++ = Data ^ T   dep word = word, data, 8, POS   I0, I1        1 cyc     c11
 119 #                     ;;
 120 # ---------------------------------------------------------------------------------------
 121
 122 # There are several points worth making here:
 123
 124 #   - Note that due to the bypass/forwarding-path, the first two
 125 #     phases of the loop are strangely mingled together.  In
 126 #     particular, note that the first stage of the pipeline is
 127 #     using the value of "J", as calculated by the second stage.
 128 #   - Each bundle-pair will have exactly 6 instructions.
 129 #   - Pipelined, the loop can execute in 3 cycles/iteration and
 130 #     4 stages.  However, McKinley/Madison can issue "st1" to
 131 #     the same bank at a rate of at most one per 4 cycles.  Thus,
 132 #     instead of storing each byte, we accumulate them in a word
 133 #     and then write them back at once with a single "st8" (this
 134 #     implies that the setup code needs to ensure that the output
 135 #     buffer is properly aligned, if need be, by encoding the
 136 #     first few bytes separately).
 137 #   - There is no space for a "br.ctop" instruction.  For this
 138 #     reason we can't use modulo-loop support in IA-64 and have
 139 #     to do a traditional, purely software-pipelined loop.
 140 #   - We can't replace any of the remaining "add/zxt1" pairs with
 141 #     "padd1" because the latency for that instruction is too high
 142 #     and would push the loop to the point where more bypasses
 143 #     would be needed, which we don't have space for.
 144 #   - The above loop runs at around 3.26 cycles/byte, or roughly
 145 #     440 MByte/sec on a 1.5GHz Madison.  This is well below the
 146 #     system bus bandwidth and hence with judicious use of
 147 #     "lfetch" this loop can run at (almost) peak speed even when
 148 #     the input and output data reside in memory.  The
 149 #     max. latency that can be tolerated is (PREFETCH_DISTANCE *
 150 #     L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
 151 #     least) 1-ahead prefetching of 128 byte cache-lines.  Note
 152 #     that we do NOT prefetch into L1, since that would only
 153 #     interfere with the S[] table values stored there.  This is
 154 #     acceptable because there is a 10 cycle latency between
 155 #     load and first use of the input data.
 156 #   - We use a branch to out-of-line bypass-code because of cycle-pressure:
 157 #     we calculate the next J, check for the need to activate the
 158 #     bypass path, and activate the bypass path ALL IN THE SAME
 159 #     CYCLE.  If we didn't have these constraints, we could do
 160 #     the bypass with a simple conditional move instruction.
 161 #     Fortunately, the bypass paths get activated relatively
 162 #     infrequently, so the extra branches don't cost all that much
 163 #     (about 0.04 cycles/byte, measured on a 16396 byte file with
 164 #     random input data).
 165 #
 166
 167 $output = pop;
 168 open STDOUT,">$output";
 169
 170 $phases = 4;            # number of stages/phases in the pipelined-loop
 171 $unroll_count = 6;      # number of times we unrolled it
 172 $pComI = (1 << 0);
 173 $pComJ = (1 << 1);
 174 $pComT = (1 << 2);
 175 $pOut  = (1 << 3);
 176
 177 $NData = 4;
 178 $NIP = 3;
 179 $NJP = 2;
 180 $NI = 2;
 181 $NSI = 3;
 182 $NSJ = 2;
 183 $NT = 2;
 184 $NOutWord = 2;
 185
 186 #
 187 # $threshold is the minimum length before we attempt to use the
 188 # big software-pipelined loop.  It MUST be greater-or-equal
 189 # to:
 190 #               PHASES * (UNROLL_COUNT + 1) + 7
 191 #
 192 # The "+ 7" comes from the fact we may have to encode up to
 193 #   7 bytes separately before the output pointer is aligned.
 194 #
 195 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
 196
 197 sub I {
 198     local *code = shift;
 199     local $format = shift;
 200     $code .= sprintf ("\t\t".$format."\n", @_);
 201 }
 202
 203 sub P {
 204     local *code = shift;
 205     local $format = shift;
 206     $code .= sprintf ($format."\n", @_);
 207 }
 208
 209 sub STOP {
 210     local *code = shift;
 211     $code .=<<___;
 212                 ;;
 213 ___
 214 }
 215
     # emit_body(\$code_buf, \$bypass_buf, $iteration, $p)
     #
     # Appends the three-cycle instruction schedule for one step of the
     # unrolled, software-pipelined loop to the first buffer.  When a bypass
     # check is emitted, the matching out-of-line ".rc4Bypass<N>" stub is
     # appended to the second buffer.
     #
     #   $iteration - zero-based step number.  It selects the element of each
     #                rotating register array (index modulo the array's $N*
     #                size) and, plus one, numbers the bypass/resume labels.
     #   $p         - bitmask of active phases ($pComI, $pComJ, $pComT,
     #                $pOut); only the low four bits are examined.
 216 sub emit_body {
 217     local *c = shift;
 218     local *bypass = shift;
 219     local ($iteration, $p) = @_;
 220
         # Step numbers used for register selection: this step and the three
         # steps before it.
 221     local $i0 = $iteration;
 222     local $i1 = $iteration - 1;
 223     local $i2 = $iteration - 2;
 224     local $i3 = $iteration - 3;
         # Output-word numbers for byte ($iteration - 3) and for the byte
         # before it.  The quotients may be fractional; every use applies
         # "% $NOutWord", which works on the integer part.
 225     local $iw0 = ($iteration - 3) / 8;
 226     local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
         # Position of the output byte inside its 8-byte word.
 227     local $byte_num = ($iteration - 3) % 8;
 228     local $label = $iteration + 1;
         # NOTE: despite its name, $pAny is true only when ALL four phase
         # bits are set; it gates the explicit bundle braces and templates.
 229     local $pAny = ($p & 0xf) == 0xf;
         # The bypass compare/branch is emitted whenever the I-phase is
         # active, except on the very first step.
 230     local $pByp = (($p & $pComI) && ($iteration > 0));
 231
 232     $c.=<<___;
 233 //////////////////////////////////////////////////
 234 ___
 235
         # No phase bit is set: only the pending output word is written,
         # using a 4-byte store (after a right shift by 32 on big-endian
         # hosts), and nothing else is emitted for this step.
 236     if (($p & 0xf) == 0) {
 237         $c.="#ifdef HOST_IS_BIG_ENDIAN\n";
 238         &I(\$c,"shr.u   OutWord[%u] = OutWord[%u], 32;;",
 239                                 $iw1 % $NOutWord, $iw1 % $NOutWord);
 240         $c.="#endif\n";
 241         &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
 242         return;
 243     }
 244
 245     # Cycle 0
 246     &I(\$c, "{ .mmi")                                         if ($pAny);
 247     &I(\$c, "ld1    Data[%u] = [InPtr], 1", $i0 % $NData)     if ($p & $pComI);
 248     &I(\$c, "padd1  I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
 249     &I(\$c, "zxt1   J = J")                                   if ($p & $pComJ);
 250     &I(\$c, "}")                                              if ($pAny);
 251     &I(\$c, "{ .mmi")                                         if ($pAny);
 252     &I(\$c, "LKEY   T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT)   if ($p & $pOut);
 253     &I(\$c, "add    T[%u] = SI[%u], SJ[%u]",
 254        $i0 % $NT, $i2 % $NSI, $i1 % $NSJ)                     if ($p & $pComT);
 255     &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
 256     &I(\$c, "}")                                              if ($pAny);
 257     &STOP(\$c);
 258
 259     # Cycle 1
 260     &I(\$c, "{ .mmi")                                         if ($pAny);
 261     &I(\$c, "SKEY   [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
 262     &I(\$c, "SKEY   [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
 263     &I(\$c, "zxt1   T[%u] = T[%u]", $i0 % $NT, $i0 % $NT)     if ($p & $pComT);
 264     &I(\$c, "}")                                              if ($pAny);
 265     &I(\$c, "{ .mmi")                                         if ($pAny);
 266     &I(\$c, "LKEY   SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
 267     &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP)                 if ($p & $pComJ);
 268     &I(\$c, "xor    Data[%u] = Data[%u], T[%u]",
 269        $i3 % $NData, $i3 % $NData, $i1 % $NT)                 if ($p & $pOut);
 270     &I(\$c, "}")                                              if ($pAny);
 271     &STOP(\$c);
 272
 273     # Cycle 2
 274     &I(\$c, "{ .mmi")                                         if ($pAny);
 275     &I(\$c, "LKEY   SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
 276     &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI)       if ($pByp);
 277     &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
 278        $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
 279     &I(\$c, "}")                                              if ($pAny);
 280     &I(\$c, "{ .mmb")                                         if ($pAny);
 281     &I(\$c, "add    J = J, SI[%u]", $i0 % $NSI)               if ($p & $pComI);
 282     &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT)    if ($p & $pComT);
 283     &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
 284     &I(\$c, "}") if ($pAny);
 285     &STOP(\$c);
 286
         # Label that the out-of-line bypass stub branches back to.
 287     &P(\$c, ".rc4Resume%u:", $label)                          if ($pByp);
         # The byte deposited in this step was byte 0 of a word, so
         # OutWord[$iw1] (the word before it) is stored with one st8.
 288     if ($byte_num == 0 && $iteration >= $phases) {
 289         &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
 290            $iw1 % $NOutWord)                                  if ($p & $pOut);
             # Last step of the unrolled loop body: advance both prefetch
             # pointers by the number of bytes handled per pass and branch
             # back to .rc4Loop while ar.lc is non-zero.
 291         if ($iteration == (1 + $unroll_count) * $phases - 1) {
                 # NOTE(review): presumably this copy keeps the OutWord
                 # register roles consistent across the loop-back when the
                 # unroll count is 6 -- confirm before changing $unroll_count.
 292             if ($unroll_count == 6) {
 293                 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
 294                    $iw1 % $NOutWord, $iw0 % $NOutWord);
 295             }
 296             &I(\$c, "lfetch.nt1 [InPrefetch], %u",
 297                $unroll_count * $phases);
 298             &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
 299                $unroll_count * $phases);
 300             &I(\$c, "br.cloop.sptk.few .rc4Loop");
 301         }
 302     }
 303
         # Out-of-line bypass stub: take back the J update made with this
         # step's SI, redo it with the previous step's SI, copy that SI into
         # this step's SI register, and branch back to the resume label.
 304     if ($pByp) {
 305         &P(\$bypass, ".rc4Bypass%u:", $label);
 306         &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
 307         &I(\$bypass, "nop 0");
 308         &I(\$bypass, "nop 0");
 309         &I(\$bypass, ";;");
 310         &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
 311         &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
 312         &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
 313         &I(\$bypass, ";;");
 314     }
 315 }
 316
     # First part of the generated assembly: register and predicate aliases,
     # the SZ-dependent LKEY/SKEY/KEYADDR macros, the compact modulo-scheduled
     # loop macros (MODSCHED_RC4_PROLOGUE / MODSCHED_RC4_LOOP), and the RC4
     # entry code up to the .rc4Prologue label.  The heredoc is interpolated
     # by Perl: $threshold, $unroll_count and $phases are substituted, and
     # the escapes \\, \@ and \" yield a literal backslash, at-sign and
     # double quote in the output.
 317 $code=<<___;
 318 .ident \"rc4-ia64.s, version 3.0\"
 319 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
 320
 321 #define LCSave          r8
 322 #define PRSave          r9
 323
 324 /* Inputs become invalid once rotation begins!  */
 325
 326 #define StateTable      in0
 327 #define DataLen         in1
 328 #define InputBuffer     in2
 329 #define OutputBuffer    in3
 330
 331 #define KTable          r14
 332 #define J               r15
 333 #define InPtr           r16
 334 #define OutPtr          r17
 335 #define InPrefetch      r18
 336 #define OutPrefetch     r19
 337 #define One             r20
 338 #define LoopCount       r21
 339 #define Remainder       r22
 340 #define IFinal          r23
 341 #define EndPtr          r24
 342
 343 #define tmp0            r25
 344 #define tmp1            r26
 345
 346 #define pBypass         p6
 347 #define pDone           p7
 348 #define pSmall          p8
 349 #define pAligned        p9
 350 #define pUnaligned      p10
 351
 352 #define pComputeI       pPhase[0]
 353 #define pComputeJ       pPhase[1]
 354 #define pComputeT       pPhase[2]
 355 #define pOutput         pPhase[3]
 356
 357 #define RetVal          r8
 358 #define L_OK            p7
 359 #define L_NOK           p8
 360
 361 #define _NINPUTS        4
 362 #define _NOUTPUT        0
 363
 364 #define _NROTATE        24
 365 #define _NLOCALS        (_NROTATE - _NINPUTS - _NOUTPUT)
 366
 367 #ifndef SZ
 368 # define SZ     4       // this must be set to sizeof(RC4_INT)
 369 #endif
 370
 371 #if SZ == 1
 372 # define LKEY                   ld1
 373 # define SKEY                   st1
 374 # define KEYADDR(dst, i)        add dst = i, KTable
 375 #elif SZ == 2
 376 # define LKEY                   ld2
 377 # define SKEY                   st2
 378 # define KEYADDR(dst, i)        shladd dst = i, 1, KTable
 379 #elif SZ == 4
 380 # define LKEY                   ld4
 381 # define SKEY                   st4
 382 # define KEYADDR(dst, i)        shladd dst = i, 2, KTable
 383 #else
 384 # define LKEY                   ld8
 385 # define SKEY                   st8
 386 # define KEYADDR(dst, i)        shladd dst = i, 3, KTable
 387 #endif
 388
 389 #if defined(_HPUX_SOURCE) && !defined(_LP64)
 390 # define ADDP   addp4
 391 #else
 392 # define ADDP   add
 393 #endif
 394
 395 /* Define a macro for the bit number of the n-th byte: */
 396
 397 #if defined(_HPUX_SOURCE) || defined(B_ENDIAN)
 398 # define HOST_IS_BIG_ENDIAN
 399 # define BYTE_POS(n)    (56 - (8 * (n)))
 400 #else
 401 # define BYTE_POS(n)    (8 * (n))
 402 #endif
 403
 404 /*
 405    We must perform the first phase of the pipeline explicitly since
 406    we will always load from the stable the first time. The br.cexit
 407    will never be taken since regardless of the number of bytes because
 408    the epilogue count is 4.
 409 */
 410 /* MODSCHED_RC4 macro was split to _PROLOGUE and _LOOP, because HP-UX
 411    assembler failed on original macro with syntax error. <appro> */
 412 #define MODSCHED_RC4_PROLOGUE                                              \\
 413         {                                                                  \\
 414                                 ld1             Data[0] = [InPtr], 1;      \\
 415                                 add             IFinal = 1, I[1];          \\
 416                                 KEYADDR(IPr[0], I[1]);                     \\
 417         } ;;                                                               \\
 418         {                                                                  \\
 419                                 LKEY            SI[0] = [IPr[0]];          \\
 420                                 mov             pr.rot = 0x10000;          \\
 421                                 mov             ar.ec = 4;                 \\
 422         } ;;                                                               \\
 423         {                                                                  \\
 424                                 add             J = J, SI[0];              \\
 425                                 zxt1            I[0] = IFinal;             \\
 426                                 br.cexit.spnt.few .+16; /* never taken */  \\
 427         } ;;
 428 #define MODSCHED_RC4_LOOP(label)                                           \\
 429 label:                                                                     \\
 430         {       .mmi;                                                      \\
 431                 (pComputeI)     ld1             Data[0] = [InPtr], 1;      \\
 432                 (pComputeI)     add             IFinal = 1, I[1];          \\
 433                 (pComputeJ)     zxt1            J = J;                     \\
 434         }{      .mmi;                                                      \\
 435                 (pOutput)       LKEY            T[1] = [T[1]];             \\
 436                 (pComputeT)     add             T[0] = SI[2], SJ[1];       \\
 437                 (pComputeI)     KEYADDR(IPr[0], I[1]);                     \\
 438         } ;;                                                               \\
 439         {       .mmi;                                                      \\
 440                 (pComputeT)     SKEY            [IPr[2]] = SJ[1];          \\
 441                 (pComputeT)     SKEY            [JP[1]] = SI[2];           \\
 442                 (pComputeT)     zxt1            T[0] = T[0];               \\
 443         }{      .mmi;                                                      \\
 444                 (pComputeI)     LKEY            SI[0] = [IPr[0]];          \\
 445                 (pComputeJ)     KEYADDR(JP[0], J);                         \\
 446                 (pComputeI)     cmp.eq.unc      pBypass, p0 = I[1], J;     \\
 447         } ;;                                                               \\
 448         {       .mmi;                                                      \\
 449                 (pComputeJ)     LKEY            SJ[0] = [JP[0]];           \\
 450                 (pOutput)       xor             Data[3] = Data[3], T[1];   \\
 451                                 nop             0x0;                       \\
 452         }{      .mmi;                                                      \\
 453                 (pComputeT)     KEYADDR(T[0], T[0]);                       \\
 454                 (pBypass)       mov             SI[0] = SI[1];             \\
 455                 (pComputeI)     zxt1            I[0] = IFinal;             \\
 456         } ;;                                                               \\
 457         {       .mmb;                                                      \\
 458                 (pOutput)       st1             [OutPtr] = Data[3], 1;     \\
 459                 (pComputeI)     add             J = J, SI[0];              \\
 460                                 br.ctop.sptk.few label;                    \\
 461         } ;;
 462
 463         .text
 464
 465         .align  32
 466
 467         .type   RC4, \@function
 468         .global RC4
 469
 470         .proc   RC4
 471         .prologue
 472
 473 RC4:
 474         {
 475                 .mmi
 476                 alloc   r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
 477
 478                 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
 479                       OutWord[2]
 480                 .rotp pPhase[4]
 481
 482                 ADDP            InPrefetch = 0, InputBuffer
 483                 ADDP            KTable = 0, StateTable
 484         }
 485         {
 486                 .mmi
 487                 ADDP            InPtr = 0, InputBuffer
 488                 ADDP            OutPtr = 0, OutputBuffer
 489                 mov             RetVal = r0
 490         }
 491         ;;
 492         {
 493                 .mmi
 494                 lfetch.nt1      [InPrefetch], 0x80
 495                 ADDP            OutPrefetch = 0, OutputBuffer
 496         }
 497         {               // Return 0 if the input length is nonsensical
 498                 .mib
 499                 ADDP            StateTable = 0, StateTable
 500                 cmp.ge.unc      L_NOK, L_OK = r0, DataLen
 501         (L_NOK) br.ret.sptk.few rp
 502         }
 503         ;;
 504         {
 505                 .mib
 506                 cmp.eq.or       L_NOK, L_OK = r0, InPtr
 507                 cmp.eq.or       L_NOK, L_OK = r0, OutPtr
 508                 nop             0x0
 509         }
 510         {
 511                 .mib
 512                 cmp.eq.or       L_NOK, L_OK = r0, StateTable
 513                 nop             0x0
 514         (L_NOK) br.ret.sptk.few rp
 515         }
 516         ;;
 517                 LKEY            I[1] = [KTable], SZ
 518 /* Prefetch the state-table. It contains 256 elements of size SZ */
 519
 520 #if SZ == 1
 521                 ADDP            tmp0 = 1*128, StateTable
 522 #elif SZ == 2
 523                 ADDP            tmp0 = 3*128, StateTable
 524                 ADDP            tmp1 = 2*128, StateTable
 525 #elif SZ == 4
 526                 ADDP            tmp0 = 7*128, StateTable
 527                 ADDP            tmp1 = 6*128, StateTable
 528 #elif SZ == 8
 529                 ADDP            tmp0 = 15*128, StateTable
 530                 ADDP            tmp1 = 14*128, StateTable
 531 #endif
 532                 ;;
 533 #if SZ >= 8
 534                 lfetch.fault.nt1                [tmp0], -256    // 15
 535                 lfetch.fault.nt1                [tmp1], -256;;
 536                 lfetch.fault.nt1                [tmp0], -256    // 13
 537                 lfetch.fault.nt1                [tmp1], -256;;
 538                 lfetch.fault.nt1                [tmp0], -256    // 11
 539                 lfetch.fault.nt1                [tmp1], -256;;
 540                 lfetch.fault.nt1                [tmp0], -256    //  9
 541                 lfetch.fault.nt1                [tmp1], -256;;
 542 #endif
 543 #if SZ >= 4
 544                 lfetch.fault.nt1                [tmp0], -256    //  7
 545                 lfetch.fault.nt1                [tmp1], -256;;
 546                 lfetch.fault.nt1                [tmp0], -256    //  5
 547                 lfetch.fault.nt1                [tmp1], -256;;
 548 #endif
 549 #if SZ >= 2
 550                 lfetch.fault.nt1                [tmp0], -256    //  3
 551                 lfetch.fault.nt1                [tmp1], -256;;
 552 #endif
 553         {
 554                 .mii
 555                 lfetch.fault.nt1                [tmp0]          //  1
 556                 add             I[1]=1,I[1];;
 557                 zxt1            I[1]=I[1]
 558         }
 559         {
 560                 .mmi
 561                 lfetch.nt1      [InPrefetch], 0x80
 562                 lfetch.excl.nt1 [OutPrefetch], 0x80
 563                 .save           pr, PRSave
 564                 mov             PRSave = pr
 565         } ;;
 566         {
 567                 .mmi
 568                 lfetch.excl.nt1 [OutPrefetch], 0x80
 569                 LKEY            J = [KTable], SZ
 570                 ADDP            EndPtr = DataLen, InPtr
 571         }  ;;
 572         {
 573                 .mmi
 574                 ADDP            EndPtr = -1, EndPtr     // Make it point to
 575                                                         // last data byte.
 576                 mov             One = 1
 577                 .save           ar.lc, LCSave
 578                 mov             LCSave = ar.lc
 579                 .body
 580         } ;;
 581         {
 582                 .mmb
 583                 sub             Remainder = 0, OutPtr
 584                 cmp.gtu         pSmall, p0 = $threshold, DataLen
 585 (pSmall)        br.cond.dpnt    .rc4Remainder           // Data too small for
 586                                                         // big loop.
 587         } ;;
 588         {
 589                 .mmi
 590                 and             Remainder = 0x7, Remainder
 591                 ;;
 592                 cmp.eq          pAligned, pUnaligned = Remainder, r0
 593                 nop             0x0
 594         } ;;
 595         {
 596                 .mmb
 597 .pred.rel       "mutex",pUnaligned,pAligned
 598 (pUnaligned)    add             Remainder = -1, Remainder
 599 (pAligned)      sub             Remainder = EndPtr, InPtr
 600 (pAligned)      br.cond.dptk.many .rc4Aligned
 601         } ;;
 602         {
 603                 .mmi
 604                 nop             0x0
 605                 nop             0x0
 606                 mov.i           ar.lc = Remainder
 607         }
 608
 609 /* Do the initial few bytes via the compact, modulo-scheduled loop
 610    until the output pointer is 8-byte-aligned.  */
 611
 612                 MODSCHED_RC4_PROLOGUE
 613                 MODSCHED_RC4_LOOP(.RC4AlignLoop)
 614
 615         {
 616                 .mib
 617                 sub             Remainder = EndPtr, InPtr
 618                 zxt1            IFinal = IFinal
 619                 clrrrb                          // Clear CFM.rrb.pr so
 620                 ;;                              // next "mov pr.rot = N"
 621                                                 // does the right thing.
 622         }
 623         {
 624                 .mmi
 625                 mov             I[1] = IFinal
 626                 nop             0x0
 627                 nop             0x0
 628         } ;;
 629
 630
 631 .rc4Aligned:
 632
 633 /*
 634    Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
 635  */
 636
 637         {
 638                 .mlx
 639                 add     LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
 640                 movl            Remainder = 0xaaaaaaaaaaaaaaab
 641         } ;;
 642         {
 643                 .mmi
 644                 setf.sig        f6 = LoopCount          // M2, M3       6 cyc
 645                 setf.sig        f7 = Remainder          // M2, M3       6 cyc
 646                 nop             0x0
 647         } ;;
 648         {
 649                 .mfb
 650                 nop             0x0
 651                 xmpy.hu         f6 = f6, f7
 652                 nop             0x0
 653         } ;;
 654         {
 655                 .mmi
 656                 getf.sig        LoopCount = f6;;        // M2           5 cyc
 657                 nop             0x0
 658                 shr.u           LoopCount = LoopCount, 4
 659         } ;;
 660         {
 661                 .mmi
 662                 nop             0x0
 663                 nop             0x0
 664                 mov.i           ar.lc = LoopCount
 665         } ;;
 666
 667 /* Now comes the unrolled loop: */
 668
 669 .rc4Prologue:
 670 ___
 671
 672 $iteration = 0;
 673
 674 # Generate the prologue:
 675 $predicates = 1;
 676 for ($i = 0; $i < $phases; ++$i) {
 677     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 678     $predicates = ($predicates << 1) | 1;
 679 }
 680
 681 $code.=<<___;
 682 .rc4Loop:
 683 ___
 684
 685 # Generate the body:
 686 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
 687     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 688 }
 689
 690 $code.=<<___;
 691 .rc4Epilogue:
 692 ___
 693
 694 # Generate the epilogue:
 695 for ($i = 0; $i < $phases; ++$i) {
 696     $predicates <<= 1;
 697     &emit_body (\$code, \$bypass, $iteration++, $predicates);
 698 }
 699
     # Tail of the RC4 routine, emitted after the unrolled loop's epilogue.
     # .rc4Remainder runs the bytes that are left through the compact
     # modulo-scheduled loop (or skips it when none remain).  .rc4Complete
     # stores J and the adjusted IFinal back through KTable, restores ar.lc
     # and the predicate registers, sets the return value to 1 and returns.
 700 $code.=<<___;
 701         {
 702                 .mmi
 703                 lfetch.nt1      [EndPtr]        // fetch line with last byte
 704                 mov             IFinal = I[1]
 705                 nop             0x0
 706         }
 707
 708 .rc4Remainder:
 709         {
 710                 .mmi
 711                 sub             Remainder = EndPtr, InPtr       // Calculate
 712                                                                 // # of bytes
 713                                                                 // left - 1
 714                 nop             0x0
 715                 nop             0x0
 716         } ;;
 717         {
 718                 .mib
 719                 cmp.eq          pDone, p0 = -1, Remainder // done already?
 720                 mov.i           ar.lc = Remainder
 721 (pDone)         br.cond.dptk.few .rc4Complete
 722         }
 723
 724 /* Do the remaining bytes via the compact, modulo-scheduled loop */
 725
 726                 MODSCHED_RC4_PROLOGUE
 727                 MODSCHED_RC4_LOOP(.RC4RestLoop)
 728
 729 .rc4Complete:
 730         {
 731                 .mmi
 732                 add             KTable = -SZ, KTable
 733                 add             IFinal = -1, IFinal
 734                 mov             ar.lc = LCSave
 735         } ;;
 736         {
 737                 .mii
 738                 SKEY            [KTable] = J,-SZ
 739                 zxt1            IFinal = IFinal
 740                 mov             pr = PRSave, 0x1FFFF
 741         } ;;
 742         {
 743                 .mib
 744                 SKEY            [KTable] = IFinal
 745                 add             RetVal = 1, r0
 746                 br.ret.sptk.few rp
 747         } ;;
 748 ___
 749
 750 # Last but not least, emit the code for the bypass-code of the unrolled loop:
 751
 752 $code.=$bypass;
 753
 754 $code.=<<___;
 755         .endp RC4
 756 ___
 757
 758 print $code;
 759
 760 close STDOUT;