3 # ====================================================================
4 # Written by David Mosberger <David.Mosberger@acm.org> based on the
5 # Itanium optimized Crypto code which was released by HP Labs at
6 # http://www.hpl.hp.com/research/linux/crypto/.
8 # Copyright (c) 2005 Hewlett-Packard Development Company, L.P.
10 # Permission is hereby granted, free of charge, to any person obtaining
11 # a copy of this software and associated documentation files (the
12 # "Software"), to deal in the Software without restriction, including
13 # without limitation the rights to use, copy, modify, merge, publish,
14 # distribute, sublicense, and/or sell copies of the Software, and to
15 # permit persons to whom the Software is furnished to do so, subject to
16 # the following conditions:
18 # The above copyright notice and this permission notice shall be
19 # included in all copies or substantial portions of the Software.
21 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
22 # EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
23 # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
24 # NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
25 # LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
26 # OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
27 # WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
31 # This is a little helper program which generates a software-pipelined
32 # for RC4 encryption. The basic algorithm looks like this:
34 # for (counter = 0; counter < len; ++counter)
38 # J = (SI + J) & 0xff;
40 # T = (SI + SJ) & 0xff;
41 # S[I] = SJ, S[J] = SI;
43 # outp[counter] = in ^ ST;
47 # Pipelining this loop isn't easy, because the stores to the S[] array
48 # need to be observed in the right order. The loop generated by the
49 # code below has the following pipeline diagram:
52 # | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |10 |11 |12 |13 |14 |15 |16 |17 |
54 # 1: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
55 # 2: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
56 # 3: xxx LDI xxx xxx xxx LDJ xxx SWP xxx LDT xxx xxx
61 # SWP = swap of S[I] and S[J]
64 # Note that in the above diagram, the major trouble-spot is that LDI
65 # of the 2nd iteration is performed BEFORE the SWP of the first
66 # iteration. Fortunately, this is easy to detect (I of the 1st
67 # iteration will be equal to J of the 2nd iteration) and when this
68 # happens, we simply forward the proper value from the 1st iteration
69 # to the 2nd one. The proper value in this case is simply the value
70 # of S[I] from the first iteration (thanks to the fact that SWP
71 # simply swaps the contents of S[I] and S[J]).
73 # Another potential trouble-spot is in cycle 7, where SWP of the 1st
74 # iteration issues at the same time as the LDI of the 3rd iteration.
75 # However, thanks to IA-64 execution semantics, this can be taken
76 # care of simply by placing LDI later in the instruction-group than
77 # SWP. IA-64 CPUs will automatically forward the value if they
78 # detect that the SWP and LDI are accessing the same memory-location.
80 # The core-loop that can be pipelined then looks like this (annotated
81 # with McKinley/Madison issue port & latency numbers, assuming L1
82 # cache hits for the most part):
84 # operation: instruction: issue-ports: latency
85 # ------------------ ----------------------------- ------------- -------
87 # Data = *inp++ ld1 data = [inp], 1 M0-M1 1 cyc c0
88 # shladd Iptr = I, KeyTable, 3 M0-M3, I0, I1 1 cyc
89 # I = (I + 1) & 0xff padd1 nextI = I, one M0-M3, I0, I1 3 cyc
91 # SI = S[I] ld8 SI = [Iptr] M0-M1 1 cyc c1 * after SWAP!
93 # cmp.eq.unc pBypass = I, J * after J is valid!
94 # J = SI + J add J = J, SI M0-M3, I0, I1 1 cyc c2
95 # (pBypass) br.cond.spnt Bypass
97 # ---------------------------------------------------------------------------------------
98 # J = J & 0xff zxt1 J = J I0, I1, 1 cyc c3
100 # shladd Jptr = J, KeyTable, 3 M0-M3, I0, I1 1 cyc c4
102 # SJ = S[J] ld8 SJ = [Jptr] M0-M1 1 cyc c5
104 # ---------------------------------------------------------------------------------------
105 # T = (SI + SJ) add T = SI, SJ M0-M3, I0, I1 1 cyc c6
107 # T = T & 0xff zxt1 T = T I0, I1 1 cyc
108 # S[I] = SJ st8 [Iptr] = SJ M2-M3 c7
109 # S[J] = SI st8 [Jptr] = SI M2-M3
111 # shladd Tptr = T, KeyTable, 3 M0-M3, I0, I1 1 cyc c8
113 # ---------------------------------------------------------------------------------------
114 # T = S[T] ld8 T = [Tptr] M0-M1 1 cyc c9
116 # data ^= T xor data = data, T M0-M3, I0, I1 1 cyc c10
118 # *out++ = Data ^ T dep word = word, data, 8, POS I0, I1 1 cyc c11
120 # ---------------------------------------------------------------------------------------
122 # There are several points worth making here:
124 # - Note that due to the bypass/forwarding-path, the first two
125 # phases of the loop are strangly mingled together. In
126 # particular, note that the first stage of the pipeline is
127 # using the value of "J", as calculated by the second stage.
128 # - Each bundle-pair will have exactly 6 instructions.
129 # - Pipelined, the loop can execute in 3 cycles/iteration and
130 # 4 stages. However, McKinley/Madison can issue "st1" to
131 # the same bank at a rate of at most one per 4 cycles. Thus,
132 # instead of storing each byte, we accumulate them in a word
133 # and then write them back at once with a single "st8" (this
134 # implies that the setup code needs to ensure that the output
135 # buffer is properly aligned, if need be, by encoding the
136 # first few bytes separately).
137 # - There is no space for a "br.ctop" instruction. For this
138 # reason we can't use module-loop support in IA-64 and have
139 # to do a traditional, purely software-pipelined loop.
140 # - We can't replace any of the remaining "add/zxt1" pairs with
141 # "padd1" because the latency for that instruction is too high
142 # and would push the loop to the point where more bypasses
143 # would be needed, which we don't have space for.
144 # - The above loop runs at around 3.26 cycles/byte, or roughly
145 # 440 MByte/sec on a 1.5GHz Madison. This is well below the
146 # system bus bandwidth and hence with judicious use of
147 # "lfetch" this loop can run at (almost) peak speed even when
148 # the input and output data reside in memory. The
149 # max. latency that can be tolerated is (PREFETCH_DISTANCE *
150 # L2_LINE_SIZE * 3 cyc), or about 384 cycles assuming (at
151 # least) 1-ahead prefetching of 128 byte cache-lines. Note
152 # that we do NOT prefetch into L1, since that would only
153 # interfere with the S[] table values stored there. This is
154 # acceptable because there is a 10 cycle latency between
155 # load and first use of the input data.
156 # - We use a branch to out-of-line bypass-code of cycle-pressure:
157 # we calculate the next J, check for the need to activate the
158 # bypass path, and activate the bypass path ALL IN THE SAME
159 # CYCLE. If we didn't have these constraints, we could do
160 # the bypass with a simple conditional move instruction.
161 # Fortunately, the bypass paths get activated relatively
162 # infrequently, so the extra branches don't cost all that much
163 # (about 0.04 cycles/byte, measured on a 16396 byte file with
164 # random input data).
167 $phases = 4; # number of stages/phases in the pipelined-loop
168 $unroll_count = 6; # number of times we unrolled it
184 # $threshold is the minimum length before we attempt to use the
185 # big software-pipelined loop. It MUST be greater-or-equal
187 # PHASES * (UNROLL_COUNT + 1) + 7
189 # The "+ 7" comes from the fact we may have to encode up to
190 # 7 bytes separately before the output pointer is aligned.
192 $threshold = (3 * ($phases * ($unroll_count + 1)) + 7);
196 local $format = shift;
201 $code .= sprintf ("\t\t".$format."\n", $a0, $a1, $a2, $a3);
206 local $format = shift;
211 $code .= sprintf ($format."\n", $a0, $a1, $a2, $a3);
223 local *bypass = shift;
224 local ($iteration, $p) = @_;
226 local $i0 = $iteration;
227 local $i1 = $iteration - 1;
228 local $i2 = $iteration - 2;
229 local $i3 = $iteration - 3;
230 local $iw0 = ($iteration - 3) / 8;
231 local $iw1 = ($iteration > 3) ? ($iteration - 4) / 8 : 1;
232 local $byte_num = ($iteration - 3) % 8;
233 local $label = $iteration + 1;
234 local $pAny = ($p & 0xf) == 0xf;
235 local $pByp = (($p & $pComI) && ($iteration > 0));
238 //////////////////////////////////////////////////
241 if (($p & 0xf) == 0) {
242 &I(\$c, "st4 [OutPtr] = OutWord[%u], 4", $iw1 % $NOutWord);
247 &I(\$c, "{ .mmi") if ($pAny);
248 &I(\$c, "ld1 Data[%u] = [InPtr], 1", $i0 % $NData) if ($p & $pComI);
249 &I(\$c, "padd1 I[%u] = One, I[%u]", $i0 % $NI, $i1 % $NI)if ($p & $pComI);
250 &I(\$c, "zxt1 J = J") if ($p & $pComJ);
251 &I(\$c, "}") if ($pAny);
252 &I(\$c, "{ .mmi") if ($pAny);
253 &I(\$c, "LKEY T[%u] = [T[%u]]", $i1 % $NT, $i1 % $NT) if ($p & $pOut);
254 &I(\$c, "add T[%u] = SI[%u], SJ[%u]",
255 $i0 % $NT, $i2 % $NSI, $i1 % $NSJ) if ($p & $pComT);
256 &I(\$c, "KEYADDR(IPr[%u], I[%u])", $i0 % $NIP, $i1 % $NI) if ($p & $pComI);
257 &I(\$c, "}") if ($pAny);
261 &I(\$c, "{ .mmi") if ($pAny);
262 &I(\$c, "SKEY [IPr[%u]] = SJ[%u]", $i2 % $NIP, $i1%$NSJ)if ($p & $pComT);
263 &I(\$c, "SKEY [JP[%u]] = SI[%u]", $i1 % $NJP, $i2%$NSI) if ($p & $pComT);
264 &I(\$c, "zxt1 T[%u] = T[%u]", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
265 &I(\$c, "}") if ($pAny);
266 &I(\$c, "{ .mmi") if ($pAny);
267 &I(\$c, "LKEY SI[%u] = [IPr[%u]]", $i0 % $NSI, $i0%$NIP)if ($p & $pComI);
268 &I(\$c, "KEYADDR(JP[%u], J)", $i0 % $NJP) if ($p & $pComJ);
269 &I(\$c, "xor Data[%u] = Data[%u], T[%u]",
270 $i3 % $NData, $i3 % $NData, $i1 % $NT) if ($p & $pOut);
271 &I(\$c, "}") if ($pAny);
275 &I(\$c, "{ .mmi") if ($pAny);
276 &I(\$c, "LKEY SJ[%u] = [JP[%u]]", $i0 % $NSJ, $i0%$NJP) if ($p & $pComJ);
277 &I(\$c, "cmp.eq pBypass, p0 = I[%u], J", $i1 % $NI) if ($pByp);
278 &I(\$c, "dep OutWord[%u] = Data[%u], OutWord[%u], BYTE_POS(%u), 8",
279 $iw0%$NOutWord, $i3%$NData, $iw1%$NOutWord, $byte_num) if ($p & $pOut);
280 &I(\$c, "}") if ($pAny);
281 &I(\$c, "{ .mmb") if ($pAny);
282 &I(\$c, "add J = J, SI[%u]", $i0 % $NSI) if ($p & $pComI);
283 &I(\$c, "KEYADDR(T[%u], T[%u])", $i0 % $NT, $i0 % $NT) if ($p & $pComT);
284 &P(\$c, "(pBypass)\tbr.cond.spnt.many .rc4Bypass%u",$label)if ($pByp);
285 &I(\$c, "}") if ($pAny);
288 &P(\$c, ".rc4Resume%u:", $label) if ($pByp);
289 if ($byte_num == 0 && $iteration >= $phases) {
290 &I(\$c, "st8 [OutPtr] = OutWord[%u], 8",
291 $iw1 % $NOutWord) if ($p & $pOut);
292 if ($iteration == (1 + $unroll_count) * $phases - 1) {
293 if ($unroll_count == 6) {
294 &I(\$c, "mov OutWord[%u] = OutWord[%u]",
295 $iw1 % $NOutWord, $iw0 % $NOutWord);
297 &I(\$c, "lfetch.nt1 [InPrefetch], %u",
298 $unroll_count * $phases);
299 &I(\$c, "lfetch.excl.nt1 [OutPrefetch], %u",
300 $unroll_count * $phases);
301 &I(\$c, "br.cloop.sptk.few .rc4Loop");
306 &P(\$bypass, ".rc4Bypass%u:", $label);
307 &I(\$bypass, "sub J = J, SI[%u]", $i0 % $NSI);
308 &I(\$bypass, "nop 0");
309 &I(\$bypass, "nop 0");
311 &I(\$bypass, "add J = J, SI[%u]", $i1 % $NSI);
312 &I(\$bypass, "mov SI[%u] = SI[%u]", $i0 % $NSI, $i1 % $NSI);
313 &I(\$bypass, "br.sptk.many .rc4Resume%u\n", $label);
318 .ident \"rc4-ia64.s, version 3.0\"
319 .ident \"Copyright (c) 2005 Hewlett-Packard Development Company, L.P.\"
324 /* Inputs become invalid once rotation begins! */
326 #define StateTable in0
328 #define InputBuffer in2
329 #define OutputBuffer in3
335 #define InPrefetch r18
336 #define OutPrefetch r19
338 #define LoopCount r21
339 #define Remainder r22
350 #define pUnaligned p10
352 #define pComputeI pPhase[0]
353 #define pComputeJ pPhase[1]
354 #define pComputeT pPhase[2]
355 #define pOutput pPhase[3]
365 #define _NLOCALS (_NROTATE - _NINPUTS - _NOUTPUT)
368 # define SZ 4 // this must be set to sizeof(RC4_INT)
374 # define KEYADDR(dst, i) add dst = i, KTable
378 # define KEYADDR(dst, i) shladd dst = i, 1, KTable
382 # define KEYADDR(dst, i) shladd dst = i, 2, KTable
386 # define KEYADDR(dst, i) shladd dst = i, 3, KTable
389 #if defined(_HPUX_SOURCE) && !defined(_LP64)
395 /* Define a macro for the bit number of the n-th byte: */
398 # define BYTE_POS(n) (8 * (n))
400 # define BYTE_POS(n) (56 - (8 * (n)))
404 We must perform the first phase of the pipeline explicitly since
405 we will always load from the stable the first time. The br.cexit
406 will never be taken since regardless of the number of bytes because
407 the epilogue count is 4.
410 #define MODSCHED_RC4(label) \\
412 ld1 Data[0] = [InPtr], 1; \\
413 add IFinal = 1, I[1]; \\
414 KEYADDR(IPr[0], I[1]); \\
417 LKEY SI[0] = [IPr[0]]; \\
418 mov pr.rot = 0x10000; \\
423 zxt1 I[0] = IFinal; \\
424 br.cexit.spnt.few label; /* never taken */ \\
428 (pComputeI) ld1 Data[0] = [InPtr], 1; \\
429 (pComputeI) add IFinal = 1, I[1]; \\
430 (pComputeJ) zxt1 J = J; \\
432 (pOutput) LKEY T[1] = [T[1]]; \\
433 (pComputeT) add T[0] = SI[2], SJ[1]; \\
434 (pComputeI) KEYADDR(IPr[0], I[1]); \\
437 (pComputeT) SKEY [IPr[2]] = SJ[1]; \\
438 (pComputeT) SKEY [JP[1]] = SI[2]; \\
439 (pComputeT) zxt1 T[0] = T[0]; \\
441 (pComputeI) LKEY SI[0] = [IPr[0]]; \\
442 (pComputeJ) KEYADDR(JP[0], J); \\
443 (pComputeI) cmp.eq.unc pBypass, p0 = I[1], J; \\
446 (pComputeJ) LKEY SJ[0] = [JP[0]]; \\
447 (pOutput) xor Data[3] = Data[3], T[1]; \\
450 (pComputeT) KEYADDR(T[0], T[0]); \\
451 (pBypass) mov SI[0] = SI[1]; \\
452 (pComputeI) zxt1 I[0] = IFinal; \\
455 (pOutput) st1 [OutPtr] = Data[3], 1; \\
456 (pComputeI) add J = J, SI[0]; \\
457 br.ctop.sptk.few label; \\
464 .type RC4, \@function
473 alloc r2 = ar.pfs, _NINPUTS, _NLOCALS, _NOUTPUT, _NROTATE
475 .rotr Data[4], I[2], IPr[3], SI[3], JP[2], SJ[2], T[2], \\
480 add InPrefetch = 0, InputBuffer
484 ADDP InputBuffer = 0, InputBuffer
485 ADDP StateTable = 0, StateTable
489 ADDP InPrefetch = 0, InputBuffer
490 ADDP OutputBuffer = 0, OutputBuffer
497 lfetch.nt1 [InPrefetch], 0x80
498 LKEY I[1] = [StateTable], SZ
499 mov OutPrefetch = OutputBuffer
507 { // Return 0 if the input length is nonsensical
510 cmp.ge L_NOK, L_OK = r0, DataLen
511 (L_NOK) br.ret.sptk.few rp
517 cmp.eq L_NOK, L_OK = r0, InputBuffer
518 (L_NOK) br.ret.sptk.few rp
524 cmp.eq L_NOK, L_OK = r0, OutputBuffer
525 (L_NOK) br.ret.sptk.few rp
531 cmp.eq L_NOK, L_OK = r0, StateTable
532 (L_NOK) br.ret.sptk.few rp
536 /* Prefetch the state-table. It contains 256 elements of size SZ */
539 ADDP tmp0 = 1*128, StateTable
541 ADDP tmp0 = 3*128, StateTable
542 ADDP tmp1 = 2*128, StateTable
544 ADDP tmp0 = 7*128, StateTable
545 ADDP tmp1 = 6*128, StateTable
547 ADDP tmp0 = 15*128, StateTable
548 ADDP tmp1 = 14*128, StateTable
552 lfetch.fault.nt1 [tmp0], -256 // 15
553 lfetch.fault.nt1 [tmp1], -256;;
554 lfetch.fault.nt1 [tmp0], -256 // 13
555 lfetch.fault.nt1 [tmp1], -256;;
556 lfetch.fault.nt1 [tmp0], -256 // 11
557 lfetch.fault.nt1 [tmp1], -256;;
558 lfetch.fault.nt1 [tmp0], -256 // 9
559 lfetch.fault.nt1 [tmp1], -256;;
562 lfetch.fault.nt1 [tmp0], -256 // 7
563 lfetch.fault.nt1 [tmp1], -256;;
564 lfetch.fault.nt1 [tmp0], -256 // 5
565 lfetch.fault.nt1 [tmp1], -256;;
568 lfetch.fault.nt1 [tmp0], -256 // 3
569 lfetch.fault.nt1 [tmp1], -256;;
571 lfetch.fault.nt1 [tmp0] // 1
575 lfetch.nt1 [InPrefetch], 0x80
576 lfetch.excl.nt1 [OutPrefetch], 0x80
582 lfetch.excl.nt1 [OutPrefetch], 0x80
583 LKEY J = [StateTable], SZ
584 ADDP EndPtr = DataLen, InputBuffer
588 mov InPtr = InputBuffer
589 mov OutPtr = OutputBuffer
590 ADDP EndPtr = -1, EndPtr // Make it point to
595 mov KTable = StateTable
603 sub Remainder = 0, OutPtr
604 cmp.gtu pSmall, p0 = $threshold, DataLen
605 (pSmall) br.cond.dpnt .rc4Remainder // Data too small for
610 and Remainder = 0x7, Remainder
612 cmp.eq pAligned, pUnaligned = Remainder, r0
617 (pUnaligned) add Remainder = -1, Remainder
618 (pAligned) sub Remainder = EndPtr, InPtr
619 (pAligned) br.cond.dptk.many .rc4Aligned
625 mov.i ar.lc = Remainder
628 /* Do the initial few bytes via the compact, modulo-scheduled loop
629 until the output pointer is 8-byte-aligned. */
631 MODSCHED_RC4(.RC4AlignLoop)
635 sub Remainder = EndPtr, InPtr
637 clrrrb // Clear CFM.rrb.pr so
638 ;; // next "mov pr.rot = N"
639 // does the right thing.
652 Unrolled loop count = (Remainder - ($unroll_count+1)*$phases)/($unroll_count*$phases)
657 add LoopCount = 1 - ($unroll_count + 1)*$phases, Remainder
658 movl Remainder = 0xaaaaaaaaaaaaaaab
662 setf.sig f6 = LoopCount // M2, M3 6 cyc
663 setf.sig f7 = Remainder // M2, M3 6 cyc
674 getf.sig LoopCount = f6 // M2 5 cyc
682 shr.u LoopCount = LoopCount, 4
688 mov.i ar.lc = LoopCount
691 /* Now comes the unrolled loop: */
698 # Generate the prologue:
700 for ($i = 0; $i < $phases; ++$i) {
701 &emit_body (\$code, \$bypass, $iteration++, $predicates);
702 $predicates = ($predicates << 1) | 1;
710 for ($i = 0; $i < $unroll_count*$phases; ++$i) {
711 &emit_body (\$code, \$bypass, $iteration++, $predicates);
718 # Generate the epilogue:
719 for ($i = 0; $i < $phases; ++$i) {
721 &emit_body (\$code, \$bypass, $iteration++, $predicates);
727 lfetch.nt1 [EndPtr] // fetch line with last byte
735 sub Remainder = EndPtr, InPtr // Calculate
743 cmp.eq pDone, p0 = -1, Remainder // done already?
744 mov.i ar.lc = Remainder
745 (pDone) br.cond.dptk.few .rc4Complete
748 /* Do the remaining bytes via the compact, modulo-scheduled loop */
750 MODSCHED_RC4(.RC4RestLoop)
762 ADDP KTable = -2*SZ, KTable ;;
763 SKEY [KTable] = IFinal, SZ
775 mov pr = PRSave, 0x1FFFF
780 # Last but not least, emit the code for the bypass-code of the unrolled loop: