/* Algorithm Specification
http://info.isl.ntt.co.jp/crypt/eng/camellia/specifications.html
*/
+
+/*
+ * This release balances code size and performance. In particular, the
+ * key schedule setup is fully unrolled, because doing so *significantly*
+ * reduces the number of instructions per setup round and the increase
+ * in code size is justifiable. In the block functions, on the other
+ * hand, only the inner loops are unrolled, as a full unroll gives only
+ * a nominal performance boost while code size grows 4 or 7 times.
+ * Also, unlike previous versions, this one "encourages" the compiler
+ * to keep intermediate variables in registers, which should give
+ * better "all round" results, in other words reasonable performance
+ * even with not-so-modern compilers.
+ */
#include "camellia.h"
#include "cmll_locl.h"
#include <string.h>
#include <stdlib.h>
-/* These macro variables select what code is used in the creation of
- the Camellia library objects */
-
-#define USE_C_FEISTEL_CODE 0 /* Set to 1 to use C code, 0 to inline via macro */
-
-/* Word rotation */
-#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
-#define RightRotate(x, s) _lrotr(x, s)
-#define LeftRotate(x, s) _lrotl(x, s)
-#elif defined(__INTEL__) && defined(__MWERKS__)
-#define RightRotate(x, s) __ror(x, s)
-#define LeftRotate(x, s) __rol(x, s)
-#else
-#define RightRotate(x, s) ( ((x) >> (s)) + ((x) << (32 - s)) )
-#define LeftRotate(x, s) ( ((x) << (s)) + ((x) >> (32 - s)) )
+/*
+ * 32-bit rotations and big-endian 32-bit load/store helpers
+ * (GETU32/PUTU32). Intrinsics or inline assembly are selected when
+ * the tests below recognise the compiler and CPU; whatever is left
+ * undefined is covered by the portable definitions that follow this
+ * conditional.
+ */
+#if !defined(PEDANTIC) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_NO_INLINE_ASM)
+# if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
+# define RightRotate(x, s) _lrotr(x, s)
+# define LeftRotate(x, s) _lrotl(x, s)
+# if _MSC_VER >= 1400
+# define SWAP(x) _byteswap_ulong(x)
+# else /* older MSVC: build the byte swap from two rotations */
+# define SWAP(x) (_lrotl(x, 8) & 0x00ff00ff | _lrotr(x, 8) & 0xff00ff00) /* & binds tighter than | */
+# endif /* _MSC_VER >= 1400 */
+# define GETU32(p) SWAP(*((u32 *)(p)))
+# define PUTU32(p,v) (*((u32 *)(p)) = SWAP((v)))
+# elif defined(__GNUC__) && __GNUC__>=2
+# if defined(__i386) || defined(__x86_64)
+# define RightRotate(x,s) ({u32 ret; asm ("rorl %1,%0":"=r"(ret):"I"(s),"0"(x):"cc"); ret; })
+# define LeftRotate(x,s) ({u32 ret; asm ("roll %1,%0":"=r"(ret):"I"(s),"0"(x):"cc"); ret; })
+# if defined(B_ENDIAN) /* stratus.com does it */
+# define GETU32(p) (*(u32 *)(p))
+# define PUTU32(p,v) (*(u32 *)(p)=(v))
+# else /* !B_ENDIAN: byte-swap on load and store */
+# define GETU32(p) ({u32 r=*(const u32 *)(p); asm("bswapl %0":"=r"(r):"0"(r)); r; })
+# define PUTU32(p,v) ({u32 r=(v); asm("bswapl %0":"=r"(r):"0"(r)); *(u32 *)(p)=r; })
+# endif /* B_ENDIAN */
+# elif defined(_ARCH_PPC) || defined(_ARCH_PPC64) || \
+ defined(__powerpc) || defined(__ppc__) || defined(__powerpc64__)
+# define LeftRotate(x,s) ({u32 ret; asm ("rlwinm %0,%1,%2,0,31":"=r"(ret):"r"(x),"I"(s)); ret; })
+# define RightRotate(x,s) LeftRotate(x,(32-s)) /* GETU32/PUTU32 are not defined in this branch */
+# elif defined(__s390x__)
+# define LeftRotate(x,s) ({u32 ret; asm ("rll %0,%1,%2":"=r"(ret):"r"(x),"I"(s)); ret; })
+# define RightRotate(x,s) LeftRotate(x,(32-s))
+# define GETU32(p) (*(u32 *)(p))
+# define PUTU32(p,v) (*(u32 *)(p)=(v))
+# endif /* CPU selection under __GNUC__ */
+# endif /* compiler selection */
#endif
+/*
+ * Portable rotations, used when no intrinsic/asm variant was selected
+ * above. For a 32-bit x the count s must satisfy 0 < s < 32, because
+ * both s and 32-s are used as shift counts. s is parenthesized at
+ * every use so that an expression argument expands correctly.
+ */
+#if !defined(RightRotate) && !defined(LeftRotate)
+# define RightRotate(x, s) ( ((x) >> (s)) + ((x) << (32 - (s))) )
+# define LeftRotate(x, s) ( ((x) << (s)) + ((x) >> (32 - (s))) )
+#endif
+
+/*
+ * Portable big-endian 32-bit load/store. The four bytes (p)[0]..(p)[3]
+ * are accessed one at a time, (p)[0] being the most significant.
+ */
+#if !defined(GETU32) && !defined(PUTU32)
+# define GETU32(p) (((u32)(p)[0] << 24) ^ ((u32)(p)[1] << 16) ^ ((u32)(p)[2] << 8) ^ ((u32)(p)[3]))
+# define PUTU32(p,v) ((p)[0] = (u8)((v) >> 24), (p)[1] = (u8)((v) >> 16), (p)[2] = (u8)((v) >> 8), (p)[3] = (u8)(v))
+#endif
/* S-box data */
-const Word SBOX1_1110[] =
-{
- 0x70707000, 0x82828200, 0x2c2c2c00, 0xececec00, 0xb3b3b300, 0x27272700,
+#define SBOX1_1110 Camellia_SBOX[0]
+#define SBOX4_4404 Camellia_SBOX[1]
+#define SBOX2_0222 Camellia_SBOX[2]
+#define SBOX3_3033 Camellia_SBOX[3]
+static const u32 Camellia_SBOX[][256] = {
+{ 0x70707000, 0x82828200, 0x2c2c2c00, 0xececec00, 0xb3b3b300, 0x27272700,
0xc0c0c000, 0xe5e5e500, 0xe4e4e400, 0x85858500, 0x57575700, 0x35353500,
0xeaeaea00, 0x0c0c0c00, 0xaeaeae00, 0x41414100, 0x23232300, 0xefefef00,
0x6b6b6b00, 0x93939300, 0x45454500, 0x19191900, 0xa5a5a500, 0x21212100,
0x2a2a2a00, 0x68686800, 0x3c3c3c00, 0x38383800, 0xf1f1f100, 0xa4a4a400,
0x40404000, 0x28282800, 0xd3d3d300, 0x7b7b7b00, 0xbbbbbb00, 0xc9c9c900,
0x43434300, 0xc1c1c100, 0x15151500, 0xe3e3e300, 0xadadad00, 0xf4f4f400,
- 0x77777700, 0xc7c7c700, 0x80808000, 0x9e9e9e00
-};
-const Word SBOX4_4404[] =
-{
- 0x70700070, 0x2c2c002c, 0xb3b300b3, 0xc0c000c0, 0xe4e400e4, 0x57570057,
+ 0x77777700, 0xc7c7c700, 0x80808000, 0x9e9e9e00 },
+{ 0x70700070, 0x2c2c002c, 0xb3b300b3, 0xc0c000c0, 0xe4e400e4, 0x57570057,
0xeaea00ea, 0xaeae00ae, 0x23230023, 0x6b6b006b, 0x45450045, 0xa5a500a5,
0xeded00ed, 0x4f4f004f, 0x1d1d001d, 0x92920092, 0x86860086, 0xafaf00af,
0x7c7c007c, 0x1f1f001f, 0x3e3e003e, 0xdcdc00dc, 0x5e5e005e, 0x0b0b000b,
0x46460046, 0xbaba00ba, 0x25250025, 0x42420042, 0xa2a200a2, 0xfafa00fa,
0x07070007, 0x55550055, 0xeeee00ee, 0x0a0a000a, 0x49490049, 0x68680068,
0x38380038, 0xa4a400a4, 0x28280028, 0x7b7b007b, 0xc9c900c9, 0xc1c100c1,
- 0xe3e300e3, 0xf4f400f4, 0xc7c700c7, 0x9e9e009e
-};
-const Word SBOX2_0222[] =
-{
- 0x00e0e0e0, 0x00050505, 0x00585858, 0x00d9d9d9, 0x00676767, 0x004e4e4e,
+ 0xe3e300e3, 0xf4f400f4, 0xc7c700c7, 0x9e9e009e },
+{ 0x00e0e0e0, 0x00050505, 0x00585858, 0x00d9d9d9, 0x00676767, 0x004e4e4e,
0x00818181, 0x00cbcbcb, 0x00c9c9c9, 0x000b0b0b, 0x00aeaeae, 0x006a6a6a,
0x00d5d5d5, 0x00181818, 0x005d5d5d, 0x00828282, 0x00464646, 0x00dfdfdf,
0x00d6d6d6, 0x00272727, 0x008a8a8a, 0x00323232, 0x004b4b4b, 0x00424242,
0x00545454, 0x00d0d0d0, 0x00787878, 0x00707070, 0x00e3e3e3, 0x00494949,
0x00808080, 0x00505050, 0x00a7a7a7, 0x00f6f6f6, 0x00777777, 0x00939393,
0x00868686, 0x00838383, 0x002a2a2a, 0x00c7c7c7, 0x005b5b5b, 0x00e9e9e9,
- 0x00eeeeee, 0x008f8f8f, 0x00010101, 0x003d3d3d
-};
-const Word SBOX3_3033[] =
-{
- 0x38003838, 0x41004141, 0x16001616, 0x76007676, 0xd900d9d9, 0x93009393,
+ 0x00eeeeee, 0x008f8f8f, 0x00010101, 0x003d3d3d },
+{ 0x38003838, 0x41004141, 0x16001616, 0x76007676, 0xd900d9d9, 0x93009393,
0x60006060, 0xf200f2f2, 0x72007272, 0xc200c2c2, 0xab00abab, 0x9a009a9a,
0x75007575, 0x06000606, 0x57005757, 0xa000a0a0, 0x91009191, 0xf700f7f7,
0xb500b5b5, 0xc900c9c9, 0xa200a2a2, 0x8c008c8c, 0xd200d2d2, 0x90009090,
0x15001515, 0x34003434, 0x1e001e1e, 0x1c001c1c, 0xf800f8f8, 0x52005252,
0x20002020, 0x14001414, 0xe900e9e9, 0xbd00bdbd, 0xdd00dddd, 0xe400e4e4,
0xa100a1a1, 0xe000e0e0, 0x8a008a8a, 0xf100f1f1, 0xd600d6d6, 0x7a007a7a,
- 0xbb00bbbb, 0xe300e3e3, 0x40004040, 0x4f004f4f
+ 0xbb00bbbb, 0xe300e3e3, 0x40004040, 0x4f004f4f }
};
-#define CopyConvertEndianness16in(src, dst) \
-do {\
- (dst)[0] = GETU32((Byte *)(src));\
- (dst)[1] = GETU32((Byte *)(src) + 4);\
- (dst)[2] = GETU32((Byte *)(src) + 8);\
- (dst)[3] = GETU32((Byte *)(src) + 12);\
-} while(0)
-#define CopyConvertEndianness16out(src, dst) \
-do{\
- PUTU32(dst ,(src)[0]);\
- PUTU32(dst + 4,(src)[1]);\
- PUTU32(dst + 8,(src)[2]);\
- PUTU32(dst + 12,(src)[3]);\
-}while(0)
-
-/* Computes the exclusive-or of x and y and assigns it to z, ie,
- * z = x ^ y */
-#define XorBlock(x, y, z)\
-do {\
- (z)[0] = (x)[0] ^ (y)[0];\
- (z)[1] = (x)[1] ^ (y)[1];\
- (z)[2] = (x)[2] ^ (y)[2];\
- (z)[3] = (x)[3] ^ (y)[3];\
-} while(0)
-
-/* Transforms an array of 4 words by flipping the first two words
- * with the last 2 words. */
-#define SwapHalf(x)\
-do {\
- Word _t;\
- _t = (x)[0];\
- (x)[0] = (x)[2];\
- (x)[2] = _t;\
- _t = (x)[1];\
- (x)[1] = (x)[3];\
- (x)[3] = _t;\
-} while(0)
-
-/* This function is only used in key generation. */
-static void RotBlock(const Word x[], const int n, Word y[])
-{
- int r = (n & 31); /* Must not be 0 */
- int idx = (n >> 5);
- int idx1 = (idx + 1) & 3;
- int idx2 = (idx1 + 1) & 3;
-
- y[0] = (x[idx] << r) | (x[idx1] >> (32 - r));
- y[1] = (x[idx1] << r) | (x[idx2] >> (32 - r));
-}
-
+/*
+ * Key generation constants: twelve 32-bit words, consumed two at a
+ * time as the key argument of Camellia_Feistel during key setup.
+ */
+static const u32 SIGMA[] = {
+ 0xa09e667f, 0x3bcc908b, /* SIGMA+0 */
+ 0xb67ae858, 0x4caa73b2, /* SIGMA+2 */
+ 0xc6ef372f, 0xe94f82be, /* SIGMA+4 */
+ 0x54ff53a5, 0xf1d36f1c, /* SIGMA+6 */
+ 0x10e527fa, 0xde682d1d, /* SIGMA+8 */
+ 0xb05688c2, 0xb3e6c1fd /* SIGMA+10 */
+};
/* The phi algorithm given in C.2.7 of the Camellia spec document. */
-#if (USE_C_FEISTEL_CODE)
-static void Camellia_Feistel(Word *x, const Word *k, const int key_offset)
- {
- Word D, U;
- Word s1, s2;
-
- s1 = x[0] ^ k[0];
- U = SBOX4_4404[(Byte)s1];
- U ^= SBOX3_3033[(Byte)(s1 >> 8)];
- U ^= SBOX2_0222[(Byte)(s1 >> 16)];
- U ^= SBOX1_1110[(Byte)(s1 >> 24)];
- s2 = x[1] ^ k[1];
- D = SBOX1_1110[(Byte)s2];
- D ^= SBOX4_4404[(Byte)(s2 >> 8)];
- D ^= SBOX3_3033[(Byte)(s2 >> 16)];
- D ^= SBOX2_0222[(Byte)(s2 >> 24)];
-
- x[2] ^= D ^ U;
- x[3] ^= D ^ U ^ RightRotate(U, 8);
-
- s1 = x[2] ^ k[key_offset];
- U = SBOX4_4404[(Byte)s1];
- U ^= SBOX3_3033[(Byte)(s1 >> 8)];
- U ^= SBOX2_0222[(Byte)(s1 >> 16)];
- U ^= SBOX1_1110[(Byte)(s1 >> 24)];
- s2 = x[3] ^ k[key_offset+1];
- D = SBOX1_1110[(Byte)s2];
- D ^= SBOX4_4404[(Byte)(s2 >> 8)];
- D ^= SBOX3_3033[(Byte)(s2 >> 16)];
- D ^= SBOX2_0222[(Byte)(s2 >> 24)];
-
- x[0] ^= D ^ U;
- x[1] ^= D ^ U ^ RightRotate(U, 8);
- }
-
-#else /* use macro code. Slower on PC due to code/cache interaction */
-
-#define Camellia_Feistel(xx,kk,oo)\
-do {\
- Word * const _x = (Word *)(xx);\
- const Word * const _k = (Word *)(kk);\
- Word _D, _U;\
- Word _s1, _s2;\
-\
- _s1 = _x[0] ^ _k[0];\
- _U = SBOX4_4404[(Byte)_s1];\
- _U ^= SBOX3_3033[(Byte)(_s1 >> 8)];\
- _U ^= SBOX2_0222[(Byte)(_s1 >> 16)];\
- _U ^= SBOX1_1110[(Byte)(_s1 >> 24)];\
- _s2 = _x[1] ^ _k[1];\
- _D = SBOX1_1110[(Byte)_s2];\
- _D ^= SBOX4_4404[(Byte)(_s2 >> 8)];\
- _D ^= SBOX3_3033[(Byte)(_s2 >> 16)];\
- _D ^= SBOX2_0222[(Byte)(_s2 >> 24)];\
-\
- _x[2] ^= _D ^ _U;\
- _x[3] ^= _D ^ _U ^ RightRotate(_U, 8);\
-\
- _s1 = _x[2] ^ _k[oo];\
- _U = SBOX4_4404[(Byte)_s1];\
- _U ^= SBOX3_3033[(Byte)(_s1 >> 8)];\
- _U ^= SBOX2_0222[(Byte)(_s1 >> 16)];\
- _U ^= SBOX1_1110[(Byte)(_s1 >> 24)];\
- _s2 = _x[3] ^ _k[(oo)+1];\
- _D = SBOX1_1110[(Byte)_s2];\
- _D ^= SBOX4_4404[(Byte)(_s2 >> 8)];\
- _D ^= SBOX3_3033[(Byte)(_s2 >> 16)];\
- _D ^= SBOX2_0222[(Byte)(_s2 >> 24)];\
+/*
+ * One Feistel round. _s0 and _s1 are XORed with the key words
+ * (_key)[0] and (_key)[1]; the eight bytes of the two results index
+ * the S-box tables, and the combined lookups are XORed into _s2 and
+ * _s3. _s0 and _s1 are only read; _s2 and _s3 must be lvalues. The
+ * _s arguments are expanded without parentheses, so pass plain
+ * variables.
+ *
+ * This version does not attempt to minimize the number of temporary
+ * variables, but instead explicitly exposes the algorithm's
+ * parallelism. It is therefore most appropriate for platforms with no
+ * fewer than ~16 registers. For platforms with fewer registers [well,
+ * x86 to be specific] an assembler version should be/is provided
+ * anyway...
+ */
+#define Camellia_Feistel(_s0,_s1,_s2,_s3,_key) do {\
+ register u32 _t0,_t1,_t2,_t3;\
\
- _x[0] ^= _D ^ _U;\
- _x[1] ^= _D ^ _U ^ RightRotate(_U, 8);\
+ _t0 = _s0 ^ (_key)[0];\
+ _t3 = SBOX4_4404[_t0&0xff];\
+ _t1 = _s1 ^ (_key)[1];\
+ _t3 ^= SBOX3_3033[(_t0 >> 8)&0xff];\
+ _t2 = SBOX1_1110[_t1&0xff];\
+ _t3 ^= SBOX2_0222[(_t0 >> 16)&0xff];\
+ _t2 ^= SBOX4_4404[(_t1 >> 8)&0xff];\
+ _t3 ^= SBOX1_1110[(_t0 >> 24)];\
+ _t2 ^= _t3;\
+ _t3 = RightRotate(_t3,8);\
+ _t2 ^= SBOX3_3033[(_t1 >> 16)&0xff];\
+ _s3 ^= _t3;\
+ _t2 ^= SBOX2_0222[(_t1 >> 24)];\
+ _s2 ^= _t2; \
+ _s3 ^= _t2;\
} while(0)
-#endif /* USE_C_FEISTEL_CODE */
-
-
-/* Key generation constants */
-
-const Word SIGMA[] = {
- 0xa09e667f, 0x3bcc908b,
- 0xb67ae858, 0x4caa73b2,
- 0xc6ef372f, 0xe94f82be,
- 0x54ff53a5, 0xf1d36f1c,
- 0x10e527fa, 0xde682d1d,
- 0xb05688c2, 0xb3e6c1fd
-};
-
-const int KSFT1[26] = {
- 0, 64, 0, 64, 15, 79, 15, 79, 30, 94, 45, 109, 45, 124, 60, 124, 77, 13,
- 94, 30, 94, 30, 111, 47, 111, 47
-};
-
-const int KIDX1[26] = {
- 0, 0, 8, 8, 0, 0, 8, 8, 8, 8, 0, 0, 8, 0, 8, 8, 0, 0, 0, 0, 8, 8, 0, 0, 8, 8
-};
-
-const int KSFT2[34] = {
- 0, 64, 0, 64, 15, 79, 15, 79, 30, 94, 30, 94, 45, 109, 45, 109, 60, 124,
- 60, 124, 60, 124, 77, 13, 77, 13, 94, 30, 94, 30, 111, 47, 111, 47
-};
-
-const int KIDX2[34] = {
- 0, 0, 12, 12, 4, 4, 8, 8, 4, 4, 12, 12, 0, 0, 8, 8, 0, 0, 4, 4, 12, 12,
- 0, 0, 8, 8, 4, 4, 8, 8, 0, 0, 12, 12
-};
-
-
-/* Generates the key table e from rawKey. The reference implementation has been
- * mangled to avoid the necessity of having to separately code the Feistel function. */
-void Camellia_Ekeygen(const int keyBitLength, const Byte *rawKey, KEY_TABLE_TYPE keyTable)
+/*
+ * Rotate the 128-bit value held in the four 32-bit lvalues _s0.._s3
+ * (_s0 being the most significant word) left by _n bits, in place.
+ * _n must satisfy 0 < _n < 32, because both _n and 32-_n are used as
+ * shift counts. Rotations by larger amounts are achieved by
+ * "rotating" the order of the s-elements and adjusting n accordingly,
+ * e.g. RotLeft128(s1,s2,s3,s0,n-32). Every argument is parenthesized
+ * in the expansion so that such an expression for _n yields the
+ * intended shift counts.
+ */
+#define RotLeft128(_s0,_s1,_s2,_s3,_n) do {\
+ u32 _t0=(_s0)>>(32-(_n));\
+ (_s0) = ((_s0)<<(_n)) | ((_s1)>>(32-(_n)));\
+ (_s1) = ((_s1)<<(_n)) | ((_s2)>>(32-(_n)));\
+ (_s2) = ((_s2)<<(_n)) | ((_s3)>>(32-(_n)));\
+ (_s3) = ((_s3)<<(_n)) | _t0;\
+} while (0)
+/*
+ * Expand rawKey into the subkey table k and return the number of
+ * "grand rounds" the block functions have to run: 3 when keyBitLength
+ * is 128, otherwise 4. The first 16 bytes of rawKey are always read;
+ * 24 bytes are read when keyBitLength is 192 and 32 bytes for any
+ * other non-128 value (the length is not validated here). Words
+ * k[0..51] are written in the 128-bit case and k[0..67] otherwise.
+ */
+int Camellia_Ekeygen(int keyBitLength, const u8 *rawKey, KEY_TABLE_TYPE k)
{
- Word t[16];
- int i;
+ register u32 s0,s1,s2,s3;
+ k[0] = s0 = GETU32(rawKey); /* KL: first 16 key bytes, kept in k[0..3] */
+ k[1] = s1 = GETU32(rawKey+4);
+ k[2] = s2 = GETU32(rawKey+8);
+ k[3] = s3 = GETU32(rawKey+12);
- /* Copy raw key material into the key table. Keep the bytes register- */
- /* endian-correct (ie. bytes appear in registers in the correct order */
- /* but will appear reversed in memory on a little-endian machine */
- if (keyBitLength == 128)
- {
- CopyConvertEndianness16in((Word *)rawKey, (Word *)t);
- for (i = 4; i < 8; i++)
- t[i] = 0;
- }
- else if (keyBitLength == 192)
+ if (keyBitLength != 128) /* 192-bit, or anything else (handled as 256-bit) */
{
- CopyConvertEndianness16in((Word *)rawKey, (Word *)t);
- for (i = 4; i < 6; i++)
+ k[8] = s0 = GETU32(rawKey+16); /* KR, parked in k[8..11] */
+ k[9] = s1 = GETU32(rawKey+20);
+ if (keyBitLength == 192)
{
- Word tmp = (rawKey[4*i] << 24) | (rawKey[4*i+1] << 16) |
- (rawKey[4*i+2] << 8) | (rawKey[4*i+3] << 0);
- t[i] = tmp;
- t[i+2] = ~tmp;
+ k[10] = s2 = ~s0; /* second half of KR is the complement of the first */
+ k[11] = s3 = ~s1;
}
- }
- else if (keyBitLength == 256)
- {
- CopyConvertEndianness16in((Word *)rawKey, (Word *)t);
- CopyConvertEndianness16in(((Word *)rawKey)+4, ((Word *)t)+4);
+ else
+ {
+ k[10] = s2 = GETU32(rawKey+24);
+ k[11] = s3 = GETU32(rawKey+28);
+ }
+ s0 ^= k[0], s1 ^= k[1], s2 ^= k[2], s3 ^= k[3]; /* s = KL ^ KR */
}
/* Use the Feistel routine to scramble the key material */
- XorBlock(t, t+4, t+8);
- Camellia_Feistel(t+8, SIGMA, 2);
- XorBlock(t+8, t, t+8);
- Camellia_Feistel(t+8, SIGMA+4,2);
-
+ Camellia_Feistel(s0,s1,s2,s3,SIGMA+0);
+ Camellia_Feistel(s2,s3,s0,s1,SIGMA+2);
+
+ s0 ^= k[0], s1 ^= k[1], s2 ^= k[2], s3 ^= k[3]; /* mix KL back in */
+ Camellia_Feistel(s0,s1,s2,s3,SIGMA+4);
+ Camellia_Feistel(s2,s3,s0,s1,SIGMA+6); /* s0..s3 now hold KA */
+
/* Fill the keyTable. Requires many block rotations. */
if (keyBitLength == 128)
{
- memcpy(keyTable, t, 16);
- memcpy(keyTable+4, t+8, 16);
- for (i = 4; i < 26; i += 2 )
- {
- RotBlock(t + KIDX1[i + 0], KSFT1[i + 0], keyTable+i*2);
- RotBlock(t + KIDX1[i + 1], KSFT1[i + 1], keyTable+i*2+2);
- }
+ k[ 4] = s0, k[ 5] = s1, k[ 6] = s2, k[ 7] = s3;
+ RotLeft128(s0,s1,s2,s3,15); /* KA <<< 15 */
+ k[12] = s0, k[13] = s1, k[14] = s2, k[15] = s3;
+ RotLeft128(s0,s1,s2,s3,15); /* KA <<< 30 */
+ k[16] = s0, k[17] = s1, k[18] = s2, k[19] = s3;
+ RotLeft128(s0,s1,s2,s3,15); /* KA <<< 45 */
+ k[24] = s0, k[25] = s1;
+ RotLeft128(s0,s1,s2,s3,15); /* KA <<< 60 */
+ k[28] = s0, k[29] = s1, k[30] = s2, k[31] = s3;
+ RotLeft128(s1,s2,s3,s0,2); /* KA <<< 94 */
+ k[40] = s1, k[41] = s2, k[42] = s3, k[43] = s0;
+ RotLeft128(s1,s2,s3,s0,17); /* KA <<<111 */
+ k[48] = s1, k[49] = s2, k[50] = s3, k[51] = s0;
+
+ s0 = k[ 0], s1 = k[ 1], s2 = k[ 2], s3 = k[ 3]; /* reload KL */
+ RotLeft128(s0,s1,s2,s3,15); /* KL <<< 15 */
+ k[ 8] = s0, k[ 9] = s1, k[10] = s2, k[11] = s3;
+ RotLeft128(s0,s1,s2,s3,30); /* KL <<< 45 */
+ k[20] = s0, k[21] = s1, k[22] = s2, k[23] = s3;
+ RotLeft128(s0,s1,s2,s3,15); /* KL <<< 60 */
+ k[26] = s2, k[27] = s3;
+ RotLeft128(s0,s1,s2,s3,17); /* KL <<< 77 */
+ k[32] = s0, k[33] = s1, k[34] = s2, k[35] = s3;
+ RotLeft128(s0,s1,s2,s3,17); /* KL <<< 94 */
+ k[36] = s0, k[37] = s1, k[38] = s2, k[39] = s3;
+ RotLeft128(s0,s1,s2,s3,17); /* KL <<<111 */
+ k[44] = s0, k[45] = s1, k[46] = s2, k[47] = s3;
+
+ return 3; /* grand rounds */
}
else
{
- XorBlock(t+8, t+4, t+12);
- Camellia_Feistel(t+12, SIGMA+8,2);
- memcpy(keyTable, t, 16);
- memcpy(keyTable+4, t+12, 16);
- for (i = 4; i < 34; i += 2)
- {
- RotBlock(t + KIDX2[i + 0], KSFT2[i + 0], keyTable+i*2);
- RotBlock(t + KIDX2[i + 1], KSFT2[i + 1], keyTable+i*2+2);
- }
+ k[12] = s0, k[13] = s1, k[14] = s2, k[15] = s3; /* park KA in k[12..15] */
+ s0 ^= k[8], s1 ^= k[9], s2 ^=k[10], s3 ^=k[11]; /* KA ^ KR */
+ Camellia_Feistel(s0,s1,s2,s3,(SIGMA+8));
+ Camellia_Feistel(s2,s3,s0,s1,(SIGMA+10)); /* s0..s3 now hold KB */
+
+ k[ 4] = s0, k[ 5] = s1, k[ 6] = s2, k[ 7] = s3;
+ RotLeft128(s0,s1,s2,s3,30); /* KB <<< 30 */
+ k[20] = s0, k[21] = s1, k[22] = s2, k[23] = s3;
+ RotLeft128(s0,s1,s2,s3,30); /* KB <<< 60 */
+ k[40] = s0, k[41] = s1, k[42] = s2, k[43] = s3;
+ RotLeft128(s1,s2,s3,s0,19); /* KB <<<111 */
+ k[64] = s1, k[65] = s2, k[66] = s3, k[67] = s0;
+
+ s0 = k[ 8], s1 = k[ 9], s2 = k[10], s3 = k[11]; /* reload KR */
+ RotLeft128(s0,s1,s2,s3,15); /* KR <<< 15 */
+ k[ 8] = s0, k[ 9] = s1, k[10] = s2, k[11] = s3;
+ RotLeft128(s0,s1,s2,s3,15); /* KR <<< 30 */
+ k[16] = s0, k[17] = s1, k[18] = s2, k[19] = s3;
+ RotLeft128(s0,s1,s2,s3,30); /* KR <<< 60 */
+ k[36] = s0, k[37] = s1, k[38] = s2, k[39] = s3;
+ RotLeft128(s1,s2,s3,s0,2); /* KR <<< 94 */
+ k[52] = s1, k[53] = s2, k[54] = s3, k[55] = s0;
+
+ s0 = k[12], s1 = k[13], s2 = k[14], s3 = k[15]; /* reload KA */
+ RotLeft128(s0,s1,s2,s3,15); /* KA <<< 15 */
+ k[12] = s0, k[13] = s1, k[14] = s2, k[15] = s3;
+ RotLeft128(s0,s1,s2,s3,30); /* KA <<< 45 */
+ k[28] = s0, k[29] = s1, k[30] = s2, k[31] = s3;
+ /* KA <<< 77: the <<< 45 words read one position later, no shift needed */
+ k[48] = s1, k[49] = s2, k[50] = s3, k[51] = s0;
+ RotLeft128(s1,s2,s3,s0,17); /* KA <<< 94 */
+ k[56] = s1, k[57] = s2, k[58] = s3, k[59] = s0;
+
+ s0 = k[ 0], s1 = k[ 1], s2 = k[ 2], s3 = k[ 3]; /* reload KL */
+ RotLeft128(s1,s2,s3,s0,13); /* KL <<< 45 */
+ k[24] = s1, k[25] = s2, k[26] = s3, k[27] = s0;
+ RotLeft128(s1,s2,s3,s0,15); /* KL <<< 60 */
+ k[32] = s1, k[33] = s2, k[34] = s3, k[35] = s0;
+ RotLeft128(s1,s2,s3,s0,17); /* KL <<< 77 */
+ k[44] = s1, k[45] = s2, k[46] = s3, k[47] = s0;
+ RotLeft128(s2,s3,s0,s1,2); /* KL <<<111 */
+ k[60] = s2, k[61] = s3, k[62] = s0, k[63] = s1;
+
+ return 4; /* grand rounds */
}
-
+ /*
+ * It is possible to perform certain precalculations, which
+ * would spare a few cycles in the block procedure. It's not
+ * done, because it upsets the performance balance between key
+ * setup and block procedures, negatively affecting overall
+ * throughput in applications operating on short messages
+ * and volatile keys.
+ */
}
-
-/* Described in great length in the accompanying document. */
-void Camellia_EncryptBlock(const int keyBitLength, const Byte plaintext[],
- const KEY_TABLE_TYPE keyTable, Byte ciphertext[])
+void Camellia_EncryptBlock_Rounds(int grandRounds, const u8 plaintext[],
+ const KEY_TABLE_TYPE keyTable, u8 ciphertext[]) /* reads 16 bytes of plaintext, writes 16 bytes of ciphertext */
{
- int j;
- int grandRounds;
-
- int totalGrandRounds = (keyBitLength == 128)? 3 : 4;
- Word status[4];
- int flayerLimit = totalGrandRounds - 1;
- const Word *k = keyTable+4;
-
- /* Copy over plaintext to a ciphertext buffer */
- CopyConvertEndianness16in(plaintext, status);
-
- /* Encrypt plaintext block via multiple Feistel rounds */
-
- XorBlock(status, keyTable, status);
+ register u32 s0,s1,s2,s3;
+ const u32 *k = keyTable,*kend = keyTable+grandRounds*16; /* kend: where k stands after the last group of six rounds */
+ s0 = GETU32(plaintext) ^ k[0]; /* load the block and XOR in k[0..3] */
+ s1 = GETU32(plaintext+4) ^ k[1];
+ s2 = GETU32(plaintext+8) ^ k[2];
+ s3 = GETU32(plaintext+12) ^ k[3];
+ k += 4;
- for (grandRounds = 0; grandRounds < totalGrandRounds; grandRounds++)
+ while (1)
{
/* Camellia makes 6 Feistel rounds */
- for (j = 0; j < 6; j+=2, k += 4)
- Camellia_Feistel(status,k,2);
-
- if (grandRounds < flayerLimit)
- {
- /* This is the same function as the diffusion function D of
- * the accompanying documentation. See section 3.2 of the
- * accompanying documentation for properties of the FLlayer function. */
- status[1] ^= LeftRotate(status[0] & k[0], 1);
- status[0] ^= status[1] | k[1];
- status[2] ^= status[3] | k[3];
- status[3] ^= LeftRotate(status[2] & k[2], 1);
- k += 4;
- }
+ Camellia_Feistel(s0,s1,s2,s3,k+0);
+ Camellia_Feistel(s2,s3,s0,s1,k+2);
+ Camellia_Feistel(s0,s1,s2,s3,k+4);
+ Camellia_Feistel(s2,s3,s0,s1,k+6);
+ Camellia_Feistel(s0,s1,s2,s3,k+8);
+ Camellia_Feistel(s2,s3,s0,s1,k+10);
+ k += 12; /* six rounds use two key words each */
+
+ if (k == kend) break; /* no FL layer after the final group */
+
+ /* This is the same function as the diffusion function D
+ * of the accompanying documentation. See section 3.2
+ * for properties of the FLlayer function. */
+ s1 ^= LeftRotate(s0 & k[0], 1);
+ s2 ^= s3 | k[3];
+ s0 ^= s1 | k[1];
+ s3 ^= LeftRotate(s2 & k[2], 1);
+ k += 4;
}
- /* Apply the permutation function Pi1. Since Pi1 uses the key words in ascending
- * order it is necessary when decrypting to decrement the key word ptr again. */
- SwapHalf(status);
- XorBlock(status, k, status);
-
- /* Convert endianness if needed and copy out to output array */
- CopyConvertEndianness16out(status, ciphertext);
+ s2 ^= k[0], s3 ^= k[1], s0 ^= k[2], s1 ^= k[3]; /* final XOR with the four words at kend */
+ PUTU32(ciphertext, s2); /* output order is s2,s3,s0,s1: the two halves are swapped */
+ PUTU32(ciphertext+4, s3);
+ PUTU32(ciphertext+8, s0);
+ PUTU32(ciphertext+12,s1);
}
-
-/* Described in great length in the accompanying document. */
-void Camellia_DecryptBlock(const int keyBitLength, const Byte ciphertext[],
- const KEY_TABLE_TYPE keyTable, Byte plaintext[])
+void Camellia_EncryptBlock(int keyBitLength, const u8 plaintext[],
+ const KEY_TABLE_TYPE keyTable, u8 ciphertext[]) /* wrapper that selects the round count from the key length */
{
- int grandRounds;
- int flayerLimit;
- int totalGrandRounds;
- int keyTableOffset;
- int j;
- const Word *k;
- Word status[4];
+ Camellia_EncryptBlock_Rounds(keyBitLength==128?3:4, /* 3 grand rounds for 128-bit keys, 4 for any other value */
+ plaintext,keyTable,ciphertext);
+ }
- if (keyBitLength == 128)
- {
- totalGrandRounds = 3;
- keyTableOffset = 48;
- }
- else
- {
- totalGrandRounds = 4;
- keyTableOffset = 64;
- }
- k = keyTable+keyTableOffset;
- flayerLimit = totalGrandRounds - 1;
-
- /* Copy over cipher text to a Word aligned buffer */
- CopyConvertEndianness16in(ciphertext, status);
+void Camellia_DecryptBlock_Rounds(int grandRounds, const u8 ciphertext[],
+ const KEY_TABLE_TYPE keyTable, u8 plaintext[]) /* reads 16 bytes of ciphertext, writes 16 bytes of plaintext */
+ {
+ u32 s0,s1,s2,s3;
+ const u32 *k = keyTable+grandRounds*16,*kend = keyTable+4; /* start at the last four subkeys and walk the table backwards */
- /* Decrypt ciphertext block */
- XorBlock(status, k, status);
- k -= 2;
+ s0 = GETU32(ciphertext) ^ k[0]; /* XOR with the same four words that encryption applies last */
+ s1 = GETU32(ciphertext+4) ^ k[1];
+ s2 = GETU32(ciphertext+8) ^ k[2];
+ s3 = GETU32(ciphertext+12) ^ k[3];
- for (grandRounds = 0; grandRounds < totalGrandRounds; grandRounds++)
+ while (1)
{
/* Camellia makes 6 Feistel rounds */
- for (j = 0; j < 6; j+=2, k -= 4)
- Camellia_Feistel(status,k,-2);
-
- if (grandRounds < flayerLimit)
- {
- /* This is the same function as the diffusion function D of
- * the accompanying documentation. See section 3.2 of the
- * accompanying documentation for properties of the FLlayer function. */
- status[1] ^= LeftRotate(status[0] & k[0], 1);
- status[0] ^= status[1] | k[1];
- status[2] ^= status[3] | k[-1];
- status[3] ^= LeftRotate(status[2] & k[-2], 1);
- k -= 4;
- }
+ k -= 12; /* step back over six rounds' worth of key words */
+ Camellia_Feistel(s0,s1,s2,s3,k+10); /* key pairs are taken in descending order */
+ Camellia_Feistel(s2,s3,s0,s1,k+8);
+ Camellia_Feistel(s0,s1,s2,s3,k+6);
+ Camellia_Feistel(s2,s3,s0,s1,k+4);
+ Camellia_Feistel(s0,s1,s2,s3,k+2);
+ Camellia_Feistel(s2,s3,s0,s1,k+0);
+
+ if (k == kend) break; /* k is back at keyTable+4: all round keys consumed */
+
+ /* This is the same function as the diffusion function D
+ * of the accompanying documentation. See section 3.2
+ * for properties of the FLlayer function. */
+ k -= 4; /* same four FL words as in encryption, with the pairs (k[0],k[1]) and (k[2],k[3]) exchanged */
+ s1 ^= LeftRotate(s0 & k[2], 1);
+ s2 ^= s3 | k[1];
+ s0 ^= s1 | k[3];
+ s3 ^= LeftRotate(s2 & k[0], 1);
}
- /* Apply the permutation function Pi1. Since Pi1 uses the key words in ascending
- * order it is necessary when decrypting to decrement the key word ptr again. */
- k -= 2;
- SwapHalf(status);
- XorBlock(status, k, status);
+ k -= 4; /* k == keyTable */
+ s2 ^= k[0], s3 ^= k[1], s0 ^= k[2], s1 ^= k[3];
- /* Convert endianness if needed and copy out to output array */
- CopyConvertEndianness16out(status, plaintext);
-
+ PUTU32(plaintext, s2); /* output order is s2,s3,s0,s1: the two halves are swapped */
+ PUTU32(plaintext+4, s3);
+ PUTU32(plaintext+8, s0);
+ PUTU32(plaintext+12,s1);
+ }
+/*
+ * Decrypt one block: ciphertext is the input, plaintext the output.
+ * keyBitLength only selects the number of grand rounds (3 for 128,
+ * 4 for any other value); the work is done by
+ * Camellia_DecryptBlock_Rounds. The parameters are named after their
+ * roles here; earlier they were labelled the other way round, which
+ * did not affect behaviour because they are passed on positionally.
+ */
+void Camellia_DecryptBlock(int keyBitLength, const u8 ciphertext[],
+ const KEY_TABLE_TYPE keyTable, u8 plaintext[])
+ {
+ Camellia_DecryptBlock_Rounds(keyBitLength==128?3:4,
+ ciphertext,keyTable,plaintext);
}
-