Add ARIA 32-bit implementation
authorletrhee-nsr <letrhee@nsr.re.kr>
Tue, 18 Apr 2017 10:39:46 +0000 (19:39 +0900)
committerAndy Polyakov <appro@openssl.org>
Thu, 20 Apr 2017 20:55:40 +0000 (22:55 +0200)
Modified code from http://seed.kisa.or.kr to human readable code.
Previous 8-bit code is available with -DOPENSSL_SMALL_FOOTPRINT.
New code is >2x faster.

Reviewed-by: Andy Polyakov <appro@openssl.org>
Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/3242)

crypto/aria/aria.c
crypto/include/internal/aria.h

index 7e17e30..8f212b1 100644 (file)
  * https://www.openssl.org/source/license.html
  */
 
+/*
+ * Copyright (C) 2017 National Security Research Institute. All Rights Reserved.
+ *
+ * Information for ARIA
+ *     http://210.104.33.10/ARIA/index-e.html (English)
+ *     http://seed.kisa.or.kr/ (Korean)
+ *
+ * Public domain version is distributed above.
+ */
+
 /* ====================================================================
  * Copyright (c) 2017 Oracle and/or its affiliates.  All rights reserved.
  */
 
-#include <assert.h>
 #include <openssl/e_os2.h>
-#include <string.h>
 #include "internal/aria.h"
 
+#include <assert.h>
+#include <string.h>
+
+#ifndef OPENSSL_SMALL_FOOTPRINT
+
+/* Begin macro */
+
+/* rotation */
+#define rotl32(v, r) (((uint32_t)(v) << (r)) | ((uint32_t)(v) >> (32 - r)))
+#define rotr32(v, r) (((uint32_t)(v) >> (r)) | ((uint32_t)(v) << (32 - r)))
+
+#define bswap32(v)                                          \
+    (((v) << 24) ^ ((v) >> 24) ^                            \
+    (((v) & 0x0000ff00) << 8) ^ (((v) & 0x00ff0000) >> 8))
+
+#define GET_U8_BE(X, Y) ((uint8_t)((X) >> ((3 - Y) * 8)))
+#define GET_U32_BE(X, Y) (                                  \
+    ((uint32_t)((const uint8_t *)(X))[Y * 4    ] << 24) ^   \
+    ((uint32_t)((const uint8_t *)(X))[Y * 4 + 1] << 16) ^   \
+    ((uint32_t)((const uint8_t *)(X))[Y * 4 + 2] <<  8) ^   \
+    ((uint32_t)((const uint8_t *)(X))[Y * 4 + 3]      )     )
+
+#define PUT_U32_BE(DEST, IDX, VAL)                              \
+    do {                                                        \
+        ((uint8_t *)(DEST))[IDX * 4    ] = GET_U8_BE(VAL, 0);   \
+        ((uint8_t *)(DEST))[IDX * 4 + 1] = GET_U8_BE(VAL, 1);   \
+        ((uint8_t *)(DEST))[IDX * 4 + 2] = GET_U8_BE(VAL, 2);   \
+        ((uint8_t *)(DEST))[IDX * 4 + 3] = GET_U8_BE(VAL, 3);   \
+    } while(0)
+
+#define MAKE_U32(V0, V1, V2, V3) (      \
+    ((uint32_t)((uint8_t)(V0)) << 24) | \
+    ((uint32_t)((uint8_t)(V1)) << 16) | \
+    ((uint32_t)((uint8_t)(V2)) <<  8) | \
+    ((uint32_t)((uint8_t)(V3))      )   )
+
+/* End Macro*/
+
+/* Key Constant
+ * 128bit : 0, 1,    2
+ * 192bit : 1, 2,    3(0)
+ * 256bit : 2, 3(0), 4(1)
+ */
+static const uint32_t Key_RC[5][4] = {
+    { 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0 },
+    { 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0 },
+    { 0xdb92371d, 0x2126e970, 0x03249775, 0x04e8c90e },
+    { 0x517cc1b7, 0x27220a94, 0xfe13abe8, 0xfa9a6ee0 },
+    { 0x6db14acc, 0x9e21c820, 0xff28b1d5, 0xef5de2b0 }
+};
+
+/* 32bit expanded s-box */
+static const uint32_t S1[256] = {
+    0x00636363, 0x007c7c7c, 0x00777777, 0x007b7b7b,
+    0x00f2f2f2, 0x006b6b6b, 0x006f6f6f, 0x00c5c5c5,
+    0x00303030, 0x00010101, 0x00676767, 0x002b2b2b,
+    0x00fefefe, 0x00d7d7d7, 0x00ababab, 0x00767676,
+    0x00cacaca, 0x00828282, 0x00c9c9c9, 0x007d7d7d,
+    0x00fafafa, 0x00595959, 0x00474747, 0x00f0f0f0,
+    0x00adadad, 0x00d4d4d4, 0x00a2a2a2, 0x00afafaf,
+    0x009c9c9c, 0x00a4a4a4, 0x00727272, 0x00c0c0c0,
+    0x00b7b7b7, 0x00fdfdfd, 0x00939393, 0x00262626,
+    0x00363636, 0x003f3f3f, 0x00f7f7f7, 0x00cccccc,
+    0x00343434, 0x00a5a5a5, 0x00e5e5e5, 0x00f1f1f1,
+    0x00717171, 0x00d8d8d8, 0x00313131, 0x00151515,
+    0x00040404, 0x00c7c7c7, 0x00232323, 0x00c3c3c3,
+    0x00181818, 0x00969696, 0x00050505, 0x009a9a9a,
+    0x00070707, 0x00121212, 0x00808080, 0x00e2e2e2,
+    0x00ebebeb, 0x00272727, 0x00b2b2b2, 0x00757575,
+    0x00090909, 0x00838383, 0x002c2c2c, 0x001a1a1a,
+    0x001b1b1b, 0x006e6e6e, 0x005a5a5a, 0x00a0a0a0,
+    0x00525252, 0x003b3b3b, 0x00d6d6d6, 0x00b3b3b3,
+    0x00292929, 0x00e3e3e3, 0x002f2f2f, 0x00848484,
+    0x00535353, 0x00d1d1d1, 0x00000000, 0x00ededed,
+    0x00202020, 0x00fcfcfc, 0x00b1b1b1, 0x005b5b5b,
+    0x006a6a6a, 0x00cbcbcb, 0x00bebebe, 0x00393939,
+    0x004a4a4a, 0x004c4c4c, 0x00585858, 0x00cfcfcf,
+    0x00d0d0d0, 0x00efefef, 0x00aaaaaa, 0x00fbfbfb,
+    0x00434343, 0x004d4d4d, 0x00333333, 0x00858585,
+    0x00454545, 0x00f9f9f9, 0x00020202, 0x007f7f7f,
+    0x00505050, 0x003c3c3c, 0x009f9f9f, 0x00a8a8a8,
+    0x00515151, 0x00a3a3a3, 0x00404040, 0x008f8f8f,
+    0x00929292, 0x009d9d9d, 0x00383838, 0x00f5f5f5,
+    0x00bcbcbc, 0x00b6b6b6, 0x00dadada, 0x00212121,
+    0x00101010, 0x00ffffff, 0x00f3f3f3, 0x00d2d2d2,
+    0x00cdcdcd, 0x000c0c0c, 0x00131313, 0x00ececec,
+    0x005f5f5f, 0x00979797, 0x00444444, 0x00171717,
+    0x00c4c4c4, 0x00a7a7a7, 0x007e7e7e, 0x003d3d3d,
+    0x00646464, 0x005d5d5d, 0x00191919, 0x00737373,
+    0x00606060, 0x00818181, 0x004f4f4f, 0x00dcdcdc,
+    0x00222222, 0x002a2a2a, 0x00909090, 0x00888888,
+    0x00464646, 0x00eeeeee, 0x00b8b8b8, 0x00141414,
+    0x00dedede, 0x005e5e5e, 0x000b0b0b, 0x00dbdbdb,
+    0x00e0e0e0, 0x00323232, 0x003a3a3a, 0x000a0a0a,
+    0x00494949, 0x00060606, 0x00242424, 0x005c5c5c,
+    0x00c2c2c2, 0x00d3d3d3, 0x00acacac, 0x00626262,
+    0x00919191, 0x00959595, 0x00e4e4e4, 0x00797979,
+    0x00e7e7e7, 0x00c8c8c8, 0x00373737, 0x006d6d6d,
+    0x008d8d8d, 0x00d5d5d5, 0x004e4e4e, 0x00a9a9a9,
+    0x006c6c6c, 0x00565656, 0x00f4f4f4, 0x00eaeaea,
+    0x00656565, 0x007a7a7a, 0x00aeaeae, 0x00080808,
+    0x00bababa, 0x00787878, 0x00252525, 0x002e2e2e,
+    0x001c1c1c, 0x00a6a6a6, 0x00b4b4b4, 0x00c6c6c6,
+    0x00e8e8e8, 0x00dddddd, 0x00747474, 0x001f1f1f,
+    0x004b4b4b, 0x00bdbdbd, 0x008b8b8b, 0x008a8a8a,
+    0x00707070, 0x003e3e3e, 0x00b5b5b5, 0x00666666,
+    0x00484848, 0x00030303, 0x00f6f6f6, 0x000e0e0e,
+    0x00616161, 0x00353535, 0x00575757, 0x00b9b9b9,
+    0x00868686, 0x00c1c1c1, 0x001d1d1d, 0x009e9e9e,
+    0x00e1e1e1, 0x00f8f8f8, 0x00989898, 0x00111111,
+    0x00696969, 0x00d9d9d9, 0x008e8e8e, 0x00949494,
+    0x009b9b9b, 0x001e1e1e, 0x00878787, 0x00e9e9e9,
+    0x00cecece, 0x00555555, 0x00282828, 0x00dfdfdf,
+    0x008c8c8c, 0x00a1a1a1, 0x00898989, 0x000d0d0d,
+    0x00bfbfbf, 0x00e6e6e6, 0x00424242, 0x00686868,
+    0x00414141, 0x00999999, 0x002d2d2d, 0x000f0f0f,
+    0x00b0b0b0, 0x00545454, 0x00bbbbbb, 0x00161616
+};
+
+static const uint32_t S2[256] = {
+    0xe200e2e2, 0x4e004e4e, 0x54005454, 0xfc00fcfc,
+    0x94009494, 0xc200c2c2, 0x4a004a4a, 0xcc00cccc,
+    0x62006262, 0x0d000d0d, 0x6a006a6a, 0x46004646,
+    0x3c003c3c, 0x4d004d4d, 0x8b008b8b, 0xd100d1d1,
+    0x5e005e5e, 0xfa00fafa, 0x64006464, 0xcb00cbcb,
+    0xb400b4b4, 0x97009797, 0xbe00bebe, 0x2b002b2b,
+    0xbc00bcbc, 0x77007777, 0x2e002e2e, 0x03000303,
+    0xd300d3d3, 0x19001919, 0x59005959, 0xc100c1c1,
+    0x1d001d1d, 0x06000606, 0x41004141, 0x6b006b6b,
+    0x55005555, 0xf000f0f0, 0x99009999, 0x69006969,
+    0xea00eaea, 0x9c009c9c, 0x18001818, 0xae00aeae,
+    0x63006363, 0xdf00dfdf, 0xe700e7e7, 0xbb00bbbb,
+    0x00000000, 0x73007373, 0x66006666, 0xfb00fbfb,
+    0x96009696, 0x4c004c4c, 0x85008585, 0xe400e4e4,
+    0x3a003a3a, 0x09000909, 0x45004545, 0xaa00aaaa,
+    0x0f000f0f, 0xee00eeee, 0x10001010, 0xeb00ebeb,
+    0x2d002d2d, 0x7f007f7f, 0xf400f4f4, 0x29002929,
+    0xac00acac, 0xcf00cfcf, 0xad00adad, 0x91009191,
+    0x8d008d8d, 0x78007878, 0xc800c8c8, 0x95009595,
+    0xf900f9f9, 0x2f002f2f, 0xce00cece, 0xcd00cdcd,
+    0x08000808, 0x7a007a7a, 0x88008888, 0x38003838,
+    0x5c005c5c, 0x83008383, 0x2a002a2a, 0x28002828,
+    0x47004747, 0xdb00dbdb, 0xb800b8b8, 0xc700c7c7,
+    0x93009393, 0xa400a4a4, 0x12001212, 0x53005353,
+    0xff00ffff, 0x87008787, 0x0e000e0e, 0x31003131,
+    0x36003636, 0x21002121, 0x58005858, 0x48004848,
+    0x01000101, 0x8e008e8e, 0x37003737, 0x74007474,
+    0x32003232, 0xca00caca, 0xe900e9e9, 0xb100b1b1,
+    0xb700b7b7, 0xab00abab, 0x0c000c0c, 0xd700d7d7,
+    0xc400c4c4, 0x56005656, 0x42004242, 0x26002626,
+    0x07000707, 0x98009898, 0x60006060, 0xd900d9d9,
+    0xb600b6b6, 0xb900b9b9, 0x11001111, 0x40004040,
+    0xec00ecec, 0x20002020, 0x8c008c8c, 0xbd00bdbd,
+    0xa000a0a0, 0xc900c9c9, 0x84008484, 0x04000404,
+    0x49004949, 0x23002323, 0xf100f1f1, 0x4f004f4f,
+    0x50005050, 0x1f001f1f, 0x13001313, 0xdc00dcdc,
+    0xd800d8d8, 0xc000c0c0, 0x9e009e9e, 0x57005757,
+    0xe300e3e3, 0xc300c3c3, 0x7b007b7b, 0x65006565,
+    0x3b003b3b, 0x02000202, 0x8f008f8f, 0x3e003e3e,
+    0xe800e8e8, 0x25002525, 0x92009292, 0xe500e5e5,
+    0x15001515, 0xdd00dddd, 0xfd00fdfd, 0x17001717,
+    0xa900a9a9, 0xbf00bfbf, 0xd400d4d4, 0x9a009a9a,
+    0x7e007e7e, 0xc500c5c5, 0x39003939, 0x67006767,
+    0xfe00fefe, 0x76007676, 0x9d009d9d, 0x43004343,
+    0xa700a7a7, 0xe100e1e1, 0xd000d0d0, 0xf500f5f5,
+    0x68006868, 0xf200f2f2, 0x1b001b1b, 0x34003434,
+    0x70007070, 0x05000505, 0xa300a3a3, 0x8a008a8a,
+    0xd500d5d5, 0x79007979, 0x86008686, 0xa800a8a8,
+    0x30003030, 0xc600c6c6, 0x51005151, 0x4b004b4b,
+    0x1e001e1e, 0xa600a6a6, 0x27002727, 0xf600f6f6,
+    0x35003535, 0xd200d2d2, 0x6e006e6e, 0x24002424,
+    0x16001616, 0x82008282, 0x5f005f5f, 0xda00dada,
+    0xe600e6e6, 0x75007575, 0xa200a2a2, 0xef00efef,
+    0x2c002c2c, 0xb200b2b2, 0x1c001c1c, 0x9f009f9f,
+    0x5d005d5d, 0x6f006f6f, 0x80008080, 0x0a000a0a,
+    0x72007272, 0x44004444, 0x9b009b9b, 0x6c006c6c,
+    0x90009090, 0x0b000b0b, 0x5b005b5b, 0x33003333,
+    0x7d007d7d, 0x5a005a5a, 0x52005252, 0xf300f3f3,
+    0x61006161, 0xa100a1a1, 0xf700f7f7, 0xb000b0b0,
+    0xd600d6d6, 0x3f003f3f, 0x7c007c7c, 0x6d006d6d,
+    0xed00eded, 0x14001414, 0xe000e0e0, 0xa500a5a5,
+    0x3d003d3d, 0x22002222, 0xb300b3b3, 0xf800f8f8,
+    0x89008989, 0xde00dede, 0x71007171, 0x1a001a1a,
+    0xaf00afaf, 0xba00baba, 0xb500b5b5, 0x81008181
+};
+
+static const uint32_t X1[256] = {
+    0x52520052, 0x09090009, 0x6a6a006a, 0xd5d500d5,
+    0x30300030, 0x36360036, 0xa5a500a5, 0x38380038,
+    0xbfbf00bf, 0x40400040, 0xa3a300a3, 0x9e9e009e,
+    0x81810081, 0xf3f300f3, 0xd7d700d7, 0xfbfb00fb,
+    0x7c7c007c, 0xe3e300e3, 0x39390039, 0x82820082,
+    0x9b9b009b, 0x2f2f002f, 0xffff00ff, 0x87870087,
+    0x34340034, 0x8e8e008e, 0x43430043, 0x44440044,
+    0xc4c400c4, 0xdede00de, 0xe9e900e9, 0xcbcb00cb,
+    0x54540054, 0x7b7b007b, 0x94940094, 0x32320032,
+    0xa6a600a6, 0xc2c200c2, 0x23230023, 0x3d3d003d,
+    0xeeee00ee, 0x4c4c004c, 0x95950095, 0x0b0b000b,
+    0x42420042, 0xfafa00fa, 0xc3c300c3, 0x4e4e004e,
+    0x08080008, 0x2e2e002e, 0xa1a100a1, 0x66660066,
+    0x28280028, 0xd9d900d9, 0x24240024, 0xb2b200b2,
+    0x76760076, 0x5b5b005b, 0xa2a200a2, 0x49490049,
+    0x6d6d006d, 0x8b8b008b, 0xd1d100d1, 0x25250025,
+    0x72720072, 0xf8f800f8, 0xf6f600f6, 0x64640064,
+    0x86860086, 0x68680068, 0x98980098, 0x16160016,
+    0xd4d400d4, 0xa4a400a4, 0x5c5c005c, 0xcccc00cc,
+    0x5d5d005d, 0x65650065, 0xb6b600b6, 0x92920092,
+    0x6c6c006c, 0x70700070, 0x48480048, 0x50500050,
+    0xfdfd00fd, 0xeded00ed, 0xb9b900b9, 0xdada00da,
+    0x5e5e005e, 0x15150015, 0x46460046, 0x57570057,
+    0xa7a700a7, 0x8d8d008d, 0x9d9d009d, 0x84840084,
+    0x90900090, 0xd8d800d8, 0xabab00ab, 0x00000000,
+    0x8c8c008c, 0xbcbc00bc, 0xd3d300d3, 0x0a0a000a,
+    0xf7f700f7, 0xe4e400e4, 0x58580058, 0x05050005,
+    0xb8b800b8, 0xb3b300b3, 0x45450045, 0x06060006,
+    0xd0d000d0, 0x2c2c002c, 0x1e1e001e, 0x8f8f008f,
+    0xcaca00ca, 0x3f3f003f, 0x0f0f000f, 0x02020002,
+    0xc1c100c1, 0xafaf00af, 0xbdbd00bd, 0x03030003,
+    0x01010001, 0x13130013, 0x8a8a008a, 0x6b6b006b,
+    0x3a3a003a, 0x91910091, 0x11110011, 0x41410041,
+    0x4f4f004f, 0x67670067, 0xdcdc00dc, 0xeaea00ea,
+    0x97970097, 0xf2f200f2, 0xcfcf00cf, 0xcece00ce,
+    0xf0f000f0, 0xb4b400b4, 0xe6e600e6, 0x73730073,
+    0x96960096, 0xacac00ac, 0x74740074, 0x22220022,
+    0xe7e700e7, 0xadad00ad, 0x35350035, 0x85850085,
+    0xe2e200e2, 0xf9f900f9, 0x37370037, 0xe8e800e8,
+    0x1c1c001c, 0x75750075, 0xdfdf00df, 0x6e6e006e,
+    0x47470047, 0xf1f100f1, 0x1a1a001a, 0x71710071,
+    0x1d1d001d, 0x29290029, 0xc5c500c5, 0x89890089,
+    0x6f6f006f, 0xb7b700b7, 0x62620062, 0x0e0e000e,
+    0xaaaa00aa, 0x18180018, 0xbebe00be, 0x1b1b001b,
+    0xfcfc00fc, 0x56560056, 0x3e3e003e, 0x4b4b004b,
+    0xc6c600c6, 0xd2d200d2, 0x79790079, 0x20200020,
+    0x9a9a009a, 0xdbdb00db, 0xc0c000c0, 0xfefe00fe,
+    0x78780078, 0xcdcd00cd, 0x5a5a005a, 0xf4f400f4,
+    0x1f1f001f, 0xdddd00dd, 0xa8a800a8, 0x33330033,
+    0x88880088, 0x07070007, 0xc7c700c7, 0x31310031,
+    0xb1b100b1, 0x12120012, 0x10100010, 0x59590059,
+    0x27270027, 0x80800080, 0xecec00ec, 0x5f5f005f,
+    0x60600060, 0x51510051, 0x7f7f007f, 0xa9a900a9,
+    0x19190019, 0xb5b500b5, 0x4a4a004a, 0x0d0d000d,
+    0x2d2d002d, 0xe5e500e5, 0x7a7a007a, 0x9f9f009f,
+    0x93930093, 0xc9c900c9, 0x9c9c009c, 0xefef00ef,
+    0xa0a000a0, 0xe0e000e0, 0x3b3b003b, 0x4d4d004d,
+    0xaeae00ae, 0x2a2a002a, 0xf5f500f5, 0xb0b000b0,
+    0xc8c800c8, 0xebeb00eb, 0xbbbb00bb, 0x3c3c003c,
+    0x83830083, 0x53530053, 0x99990099, 0x61610061,
+    0x17170017, 0x2b2b002b, 0x04040004, 0x7e7e007e,
+    0xbaba00ba, 0x77770077, 0xd6d600d6, 0x26260026,
+    0xe1e100e1, 0x69690069, 0x14140014, 0x63630063,
+    0x55550055, 0x21210021, 0x0c0c000c, 0x7d7d007d
+};
+
+static const uint32_t X2[256] = {
+    0x30303000, 0x68686800, 0x99999900, 0x1b1b1b00,
+    0x87878700, 0xb9b9b900, 0x21212100, 0x78787800,
+    0x50505000, 0x39393900, 0xdbdbdb00, 0xe1e1e100,
+    0x72727200, 0x09090900, 0x62626200, 0x3c3c3c00,
+    0x3e3e3e00, 0x7e7e7e00, 0x5e5e5e00, 0x8e8e8e00,
+    0xf1f1f100, 0xa0a0a000, 0xcccccc00, 0xa3a3a300,
+    0x2a2a2a00, 0x1d1d1d00, 0xfbfbfb00, 0xb6b6b600,
+    0xd6d6d600, 0x20202000, 0xc4c4c400, 0x8d8d8d00,
+    0x81818100, 0x65656500, 0xf5f5f500, 0x89898900,
+    0xcbcbcb00, 0x9d9d9d00, 0x77777700, 0xc6c6c600,
+    0x57575700, 0x43434300, 0x56565600, 0x17171700,
+    0xd4d4d400, 0x40404000, 0x1a1a1a00, 0x4d4d4d00,
+    0xc0c0c000, 0x63636300, 0x6c6c6c00, 0xe3e3e300,
+    0xb7b7b700, 0xc8c8c800, 0x64646400, 0x6a6a6a00,
+    0x53535300, 0xaaaaaa00, 0x38383800, 0x98989800,
+    0x0c0c0c00, 0xf4f4f400, 0x9b9b9b00, 0xededed00,
+    0x7f7f7f00, 0x22222200, 0x76767600, 0xafafaf00,
+    0xdddddd00, 0x3a3a3a00, 0x0b0b0b00, 0x58585800,
+    0x67676700, 0x88888800, 0x06060600, 0xc3c3c300,
+    0x35353500, 0x0d0d0d00, 0x01010100, 0x8b8b8b00,
+    0x8c8c8c00, 0xc2c2c200, 0xe6e6e600, 0x5f5f5f00,
+    0x02020200, 0x24242400, 0x75757500, 0x93939300,
+    0x66666600, 0x1e1e1e00, 0xe5e5e500, 0xe2e2e200,
+    0x54545400, 0xd8d8d800, 0x10101000, 0xcecece00,
+    0x7a7a7a00, 0xe8e8e800, 0x08080800, 0x2c2c2c00,
+    0x12121200, 0x97979700, 0x32323200, 0xababab00,
+    0xb4b4b400, 0x27272700, 0x0a0a0a00, 0x23232300,
+    0xdfdfdf00, 0xefefef00, 0xcacaca00, 0xd9d9d900,
+    0xb8b8b800, 0xfafafa00, 0xdcdcdc00, 0x31313100,
+    0x6b6b6b00, 0xd1d1d100, 0xadadad00, 0x19191900,
+    0x49494900, 0xbdbdbd00, 0x51515100, 0x96969600,
+    0xeeeeee00, 0xe4e4e400, 0xa8a8a800, 0x41414100,
+    0xdadada00, 0xffffff00, 0xcdcdcd00, 0x55555500,
+    0x86868600, 0x36363600, 0xbebebe00, 0x61616100,
+    0x52525200, 0xf8f8f800, 0xbbbbbb00, 0x0e0e0e00,
+    0x82828200, 0x48484800, 0x69696900, 0x9a9a9a00,
+    0xe0e0e000, 0x47474700, 0x9e9e9e00, 0x5c5c5c00,
+    0x04040400, 0x4b4b4b00, 0x34343400, 0x15151500,
+    0x79797900, 0x26262600, 0xa7a7a700, 0xdedede00,
+    0x29292900, 0xaeaeae00, 0x92929200, 0xd7d7d700,
+    0x84848400, 0xe9e9e900, 0xd2d2d200, 0xbababa00,
+    0x5d5d5d00, 0xf3f3f300, 0xc5c5c500, 0xb0b0b000,
+    0xbfbfbf00, 0xa4a4a400, 0x3b3b3b00, 0x71717100,
+    0x44444400, 0x46464600, 0x2b2b2b00, 0xfcfcfc00,
+    0xebebeb00, 0x6f6f6f00, 0xd5d5d500, 0xf6f6f600,
+    0x14141400, 0xfefefe00, 0x7c7c7c00, 0x70707000,
+    0x5a5a5a00, 0x7d7d7d00, 0xfdfdfd00, 0x2f2f2f00,
+    0x18181800, 0x83838300, 0x16161600, 0xa5a5a500,
+    0x91919100, 0x1f1f1f00, 0x05050500, 0x95959500,
+    0x74747400, 0xa9a9a900, 0xc1c1c100, 0x5b5b5b00,
+    0x4a4a4a00, 0x85858500, 0x6d6d6d00, 0x13131300,
+    0x07070700, 0x4f4f4f00, 0x4e4e4e00, 0x45454500,
+    0xb2b2b200, 0x0f0f0f00, 0xc9c9c900, 0x1c1c1c00,
+    0xa6a6a600, 0xbcbcbc00, 0xececec00, 0x73737300,
+    0x90909000, 0x7b7b7b00, 0xcfcfcf00, 0x59595900,
+    0x8f8f8f00, 0xa1a1a100, 0xf9f9f900, 0x2d2d2d00,
+    0xf2f2f200, 0xb1b1b100, 0x00000000, 0x94949400,
+    0x37373700, 0x9f9f9f00, 0xd0d0d000, 0x2e2e2e00,
+    0x9c9c9c00, 0x6e6e6e00, 0x28282800, 0x3f3f3f00,
+    0x80808000, 0xf0f0f000, 0x3d3d3d00, 0xd3d3d300,
+    0x25252500, 0x8a8a8a00, 0xb5b5b500, 0xe7e7e700,
+    0x42424200, 0xb3b3b300, 0xc7c7c700, 0xeaeaea00,
+    0xf7f7f700, 0x4c4c4c00, 0x11111100, 0x33333300,
+    0x03030300, 0xa2a2a200, 0xacacac00, 0x60606000
+};
+
+/* Key XOR Layer */
+#define ARIA_ADD_ROUND_KEY(RK, T0, T1, T2, T3)  \
+    do {                                        \
+        (T0) ^= (RK)->u[0];                     \
+        (T1) ^= (RK)->u[1];                     \
+        (T2) ^= (RK)->u[2];                     \
+        (T3) ^= (RK)->u[3];                     \
+    } while(0)
+
+/* S-Box Layer 1 + M */
+#define ARIA_SBOX_LAYER1_WITH_PRE_DIFF(T0, T1, T2, T3)  \
+    do {                                                \
+        (T0) =                                          \
+            S1[GET_U8_BE(T0, 0)] ^                      \
+            S2[GET_U8_BE(T0, 1)] ^                      \
+            X1[GET_U8_BE(T0, 2)] ^                      \
+            X2[GET_U8_BE(T0, 3)];                       \
+        (T1) =                                          \
+            S1[GET_U8_BE(T1, 0)] ^                      \
+            S2[GET_U8_BE(T1, 1)] ^                      \
+            X1[GET_U8_BE(T1, 2)] ^                      \
+            X2[GET_U8_BE(T1, 3)];                       \
+        (T2) =                                          \
+            S1[GET_U8_BE(T2, 0)] ^                      \
+            S2[GET_U8_BE(T2, 1)] ^                      \
+            X1[GET_U8_BE(T2, 2)] ^                      \
+            X2[GET_U8_BE(T2, 3)];                       \
+        (T3) =                                          \
+            S1[GET_U8_BE(T3, 0)] ^                      \
+            S2[GET_U8_BE(T3, 1)] ^                      \
+            X1[GET_U8_BE(T3, 2)] ^                      \
+            X2[GET_U8_BE(T3, 3)];                       \
+    } while(0)
+
+/* S-Box Layer 2 + M */
+#define ARIA_SBOX_LAYER2_WITH_PRE_DIFF(T0, T1, T2, T3)  \
+    do {                                                \
+        (T0) =                                          \
+            X1[GET_U8_BE(T0, 0)] ^                      \
+            X2[GET_U8_BE(T0, 1)] ^                      \
+            S1[GET_U8_BE(T0, 2)] ^                      \
+            S2[GET_U8_BE(T0, 3)];                       \
+        (T1) =                                          \
+            X1[GET_U8_BE(T1, 0)] ^                      \
+            X2[GET_U8_BE(T1, 1)] ^                      \
+            S1[GET_U8_BE(T1, 2)] ^                      \
+            S2[GET_U8_BE(T1, 3)];                       \
+        (T2) =                                          \
+            X1[GET_U8_BE(T2, 0)] ^                      \
+            X2[GET_U8_BE(T2, 1)] ^                      \
+            S1[GET_U8_BE(T2, 2)] ^                      \
+            S2[GET_U8_BE(T2, 3)];                       \
+        (T3) =                                          \
+            X1[GET_U8_BE(T3, 0)] ^                      \
+            X2[GET_U8_BE(T3, 1)] ^                      \
+            S1[GET_U8_BE(T3, 2)] ^                      \
+            S2[GET_U8_BE(T3, 3)];                       \
+    } while(0)
+
+/* Word-level diffusion */
+#define ARIA_DIFF_WORD(T0,T1,T2,T3) \
+    do {                            \
+        (T1) ^= (T2);               \
+        (T2) ^= (T3);               \
+        (T0) ^= (T1);               \
+                                    \
+        (T3) ^= (T1);               \
+        (T2) ^= (T0);               \
+        (T1) ^= (T2);               \
+    } while(0)
+
+/* Byte-level diffusion */
+#define ARIA_DIFF_BYTE(T0, T1, T2, T3)                                  \
+    do {                                                                \
+        (T1) = (((T1) << 8) & 0xff00ff00) ^ (((T1) >> 8) & 0x00ff00ff); \
+        (T2) = rotr32(T2, 16);                                          \
+        (T3) = bswap32(T3);                                             \
+    } while(0)
+
+/* Odd round Substitution & Diffusion */
+#define ARIA_SUBST_DIFF_ODD(T0, T1, T2, T3)             \
+    do {                                                \
+        ARIA_SBOX_LAYER1_WITH_PRE_DIFF(T0, T1, T2, T3); \
+        ARIA_DIFF_WORD(T0, T1, T2, T3);                 \
+        ARIA_DIFF_BYTE(T0, T1, T2, T3);                 \
+        ARIA_DIFF_WORD(T0, T1, T2, T3);                 \
+    } while(0)
+
+/* Even round Substitution & Diffusion */
+#define ARIA_SUBST_DIFF_EVEN(T0, T1, T2, T3)            \
+    do {                                                \
+        ARIA_SBOX_LAYER2_WITH_PRE_DIFF(T0, T1, T2, T3); \
+        ARIA_DIFF_WORD(T0, T1, T2, T3);                 \
+        ARIA_DIFF_BYTE(T2, T3, T0, T1);                 \
+        ARIA_DIFF_WORD(T0, T1, T2, T3);                 \
+    } while(0)
+
+/* Q, R Macro expanded ARIA GSRK */
+#define _ARIA_GSRK(RK, X, Y, Q, R)                  \
+    do {                                            \
+        (RK)->u[0] =                                \
+            ((X)[0]) ^                              \
+            (((Y)[((Q)    ) % 4]) >> (R)) ^         \
+            (((Y)[((Q) + 3) % 4]) << (32 - (R)));   \
+        (RK)->u[1] =                                \
+            ((X)[1]) ^                              \
+            (((Y)[((Q) + 1) % 4]) >> (R)) ^         \
+            (((Y)[((Q)    ) % 4]) << (32 - (R)));   \
+        (RK)->u[2] =                                \
+            ((X)[2]) ^                              \
+            (((Y)[((Q) + 2) % 4]) >> (R)) ^         \
+            (((Y)[((Q) + 1) % 4]) << (32 - (R)));   \
+        (RK)->u[3] =                                \
+            ((X)[3]) ^                              \
+            (((Y)[((Q) + 3) % 4]) >> (R)) ^         \
+            (((Y)[((Q) + 2) % 4]) << (32 - (R)));   \
+    } while(0)
+
+#define ARIA_GSRK(RK, X, Y, N) _ARIA_GSRK(RK, X, Y, 4 - ((N) / 32), (N) % 32)
+
+#define ARIA_DEC_DIFF_BYTE(X, Y, TMP, TMP2)         \
+    do {                                            \
+        (TMP) = (X);                                \
+        (TMP2) = rotr32((TMP), 8);                  \
+        (Y) = (TMP2) ^ rotr32((TMP) ^ (TMP2), 16);  \
+    } while(0)
+
+void aria_encrypt(const unsigned char *in, unsigned char *out,
+                  const ARIA_KEY *key)
+{
+    register uint32_t reg0, reg1, reg2, reg3;
+    int Nr;
+
+    const ARIA_u128 *rk = key->rd_key;
+
+    if (in == NULL || out == NULL || key == NULL) {
+        return;
+    }
+
+    Nr = key->rounds;
+
+    if (Nr != 12 && Nr != 14 && Nr != 16) {
+        return;
+    }
+
+    reg0 = GET_U32_BE(in, 0);
+    reg1 = GET_U32_BE(in, 1);
+    reg2 = GET_U32_BE(in, 2);
+    reg3 = GET_U32_BE(in, 3);
+
+    ARIA_ADD_ROUND_KEY(rk, reg0, reg1, reg2, reg3);
+    rk++;
+
+    ARIA_SUBST_DIFF_ODD(reg0, reg1, reg2, reg3);
+    ARIA_ADD_ROUND_KEY(rk, reg0, reg1, reg2, reg3);
+    rk++;
+
+    while(Nr -= 2){
+        ARIA_SUBST_DIFF_EVEN(reg0, reg1, reg2, reg3);
+        ARIA_ADD_ROUND_KEY(rk, reg0, reg1, reg2, reg3);
+        rk++;
+
+        ARIA_SUBST_DIFF_ODD(reg0, reg1, reg2, reg3);
+        ARIA_ADD_ROUND_KEY(rk, reg0, reg1, reg2, reg3);
+        rk++;
+    }
+
+    reg0 = rk->u[0] ^ MAKE_U32(
+        (uint8_t)(X1[GET_U8_BE(reg0, 0)]     ),
+        (uint8_t)(X2[GET_U8_BE(reg0, 1)] >> 8),
+        (uint8_t)(S1[GET_U8_BE(reg0, 2)]     ),
+        (uint8_t)(S2[GET_U8_BE(reg0, 3)]     ));
+    reg1 = rk->u[1] ^ MAKE_U32(
+        (uint8_t)(X1[GET_U8_BE(reg1, 0)]     ),
+        (uint8_t)(X2[GET_U8_BE(reg1, 1)] >> 8),
+        (uint8_t)(S1[GET_U8_BE(reg1, 2)]     ),
+        (uint8_t)(S2[GET_U8_BE(reg1, 3)]     ));
+    reg2 = rk->u[2] ^ MAKE_U32(
+        (uint8_t)(X1[GET_U8_BE(reg2, 0)]     ),
+        (uint8_t)(X2[GET_U8_BE(reg2, 1)] >> 8),
+        (uint8_t)(S1[GET_U8_BE(reg2, 2)]     ),
+        (uint8_t)(S2[GET_U8_BE(reg2, 3)]     ));
+    reg3 = rk->u[3] ^ MAKE_U32(
+        (uint8_t)(X1[GET_U8_BE(reg3, 0)]     ),
+        (uint8_t)(X2[GET_U8_BE(reg3, 1)] >> 8),
+        (uint8_t)(S1[GET_U8_BE(reg3, 2)]     ),
+        (uint8_t)(S2[GET_U8_BE(reg3, 3)]     ));
+
+    PUT_U32_BE(out, 0, reg0);
+    PUT_U32_BE(out, 1, reg1);
+    PUT_U32_BE(out, 2, reg2);
+    PUT_U32_BE(out, 3, reg3);
+}
+
+int aria_set_encrypt_key(const unsigned char *userKey, const int bits,
+                         ARIA_KEY *key)
+{
+    register uint32_t reg0, reg1, reg2, reg3;
+    uint32_t w0[4], w1[4], w2[4], w3[4];
+    const uint32_t *ck;
+
+    ARIA_u128 *rk = key->rd_key;
+    int Nr = (bits + 256) / 32;
+
+    if (userKey == NULL || key == NULL) {
+        return -1;
+    }
+    if (bits != 128 && bits != 192 && bits != 256) {
+        return -2;
+    }
+
+    key->rounds = Nr;
+    ck = &Key_RC[(bits - 128) / 64][0];
+
+    w0[0] = GET_U32_BE(userKey, 0);
+    w0[1] = GET_U32_BE(userKey, 1);
+    w0[2] = GET_U32_BE(userKey, 2);
+    w0[3] = GET_U32_BE(userKey, 3);
+
+    reg0 = w0[0] ^ ck[0];
+    reg1 = w0[1] ^ ck[1];
+    reg2 = w0[2] ^ ck[2];
+    reg3 = w0[3] ^ ck[3];
+
+    ARIA_SUBST_DIFF_ODD(reg0, reg1, reg2, reg3);
+
+    if (bits > 128) {
+        w1[0] = GET_U32_BE(userKey, 4);
+        w1[1] = GET_U32_BE(userKey, 5);
+        if (bits > 192) {
+            w1[2] = GET_U32_BE(userKey, 6);
+            w1[3] = GET_U32_BE(userKey, 7);
+        }
+        else {
+            w1[2] = w1[3] = 0;
+        }
+    }
+    else {
+        w1[0] = w1[1] = w1[2] = w1[3] = 0;
+    }
+
+    w1[0] ^= reg0;
+    w1[1] ^= reg1;
+    w1[2] ^= reg2;
+    w1[3] ^= reg3;
+
+    reg0 = w1[0];
+    reg1 = w1[1];
+    reg2 = w1[2];
+    reg3 = w1[3];
+
+    reg0 ^= ck[4];
+    reg1 ^= ck[5];
+    reg2 ^= ck[6];
+    reg3 ^= ck[7];
+
+    ARIA_SUBST_DIFF_EVEN(reg0, reg1, reg2, reg3);
+
+    reg0 ^= w0[0];
+    reg1 ^= w0[1];
+    reg2 ^= w0[2];
+    reg3 ^= w0[3];
+
+    w2[0] = reg0;
+    w2[1] = reg1;
+    w2[2] = reg2;
+    w2[3] = reg3;
+
+    reg0 ^= ck[8];
+    reg1 ^= ck[9];
+    reg2 ^= ck[10];
+    reg3 ^= ck[11];
+
+    ARIA_SUBST_DIFF_ODD(reg0, reg1, reg2, reg3);
+
+    w3[0] = reg0 ^ w1[0];
+    w3[1] = reg1 ^ w1[1];
+    w3[2] = reg2 ^ w1[2];
+    w3[3] = reg3 ^ w1[3];
+
+    ARIA_GSRK(rk, w0, w1, 19);
+    rk++;
+    ARIA_GSRK(rk, w1, w2, 19);
+    rk++;
+    ARIA_GSRK(rk, w2, w3, 19);
+    rk++;
+    ARIA_GSRK(rk, w3, w0, 19);
+
+    rk++;
+    ARIA_GSRK(rk, w0, w1, 31);
+    rk++;
+    ARIA_GSRK(rk, w1, w2, 31);
+    rk++;
+    ARIA_GSRK(rk, w2, w3, 31);
+    rk++;
+    ARIA_GSRK(rk, w3, w0, 31);
+
+    rk++;
+    ARIA_GSRK(rk, w0, w1, 67);
+    rk++;
+    ARIA_GSRK(rk, w1, w2, 67);
+    rk++;
+    ARIA_GSRK(rk, w2, w3, 67);
+    rk++;
+    ARIA_GSRK(rk, w3, w0, 67);
+
+    rk++;
+    ARIA_GSRK(rk, w0, w1, 97);
+    if (bits > 128) {
+        rk++;
+        ARIA_GSRK(rk, w1, w2, 97);
+        rk++;
+        ARIA_GSRK(rk, w2, w3, 97);
+    }
+    if (bits > 192) {
+        rk++;
+        ARIA_GSRK(rk, w3, w0, 97);
+
+        rk++;
+        ARIA_GSRK(rk, w0, w1, 109);
+    }
+
+    return 0;
+}
+
+int aria_set_decrypt_key(const unsigned char *userKey, const int bits,
+                         ARIA_KEY *key)
+{
+    ARIA_u128 *rk_head;
+    ARIA_u128 *rk_tail;
+    register uint32_t w1, w2;
+    register uint32_t reg0, reg1, reg2, reg3;
+    uint32_t s0, s1, s2, s3;
+
+    const int r = aria_set_encrypt_key(userKey, bits, key);
+
+    if (r != 0) {
+        return r;
+    }
+
+    rk_head = key->rd_key;
+    rk_tail = rk_head + key->rounds;
+
+    reg0 = rk_head->u[0];
+    reg1 = rk_head->u[1];
+    reg2 = rk_head->u[2];
+    reg3 = rk_head->u[3];
+
+    memcpy(rk_head, rk_tail, ARIA_BLOCK_SIZE);
+
+    rk_tail->u[0] = reg0;
+    rk_tail->u[1] = reg1;
+    rk_tail->u[2] = reg2;
+    rk_tail->u[3] = reg3;
+
+    rk_head++;
+    rk_tail--;
+
+    for (; rk_head < rk_tail; rk_head++, rk_tail--) {
+        ARIA_DEC_DIFF_BYTE(rk_head->u[0], reg0, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_head->u[1], reg1, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_head->u[2], reg2, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_head->u[3], reg3, w1, w2);
+
+        ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+        ARIA_DIFF_BYTE(reg0, reg1, reg2, reg3);
+        ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+
+        s0 = reg0;
+        s1 = reg1;
+        s2 = reg2;
+        s3 = reg3;
+
+        ARIA_DEC_DIFF_BYTE(rk_tail->u[0], reg0, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_tail->u[1], reg1, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_tail->u[2], reg2, w1, w2);
+        ARIA_DEC_DIFF_BYTE(rk_tail->u[3], reg3, w1, w2);
+
+        ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+        ARIA_DIFF_BYTE(reg0, reg1, reg2, reg3);
+        ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+
+        rk_head->u[0] = reg0;
+        rk_head->u[1] = reg1;
+        rk_head->u[2] = reg2;
+        rk_head->u[3] = reg3;
+
+        rk_tail->u[0] = s0;
+        rk_tail->u[1] = s1;
+        rk_tail->u[2] = s2;
+        rk_tail->u[3] = s3;
+    }
+    ARIA_DEC_DIFF_BYTE(rk_head->u[0], reg0, w1, w2);
+    ARIA_DEC_DIFF_BYTE(rk_head->u[1], reg1, w1, w2);
+    ARIA_DEC_DIFF_BYTE(rk_head->u[2], reg2, w1, w2);
+    ARIA_DEC_DIFF_BYTE(rk_head->u[3], reg3, w1, w2);
+
+    ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+    ARIA_DIFF_BYTE(reg0, reg1, reg2, reg3);
+    ARIA_DIFF_WORD(reg0, reg1, reg2, reg3);
+
+    rk_tail->u[0] = reg0;
+    rk_tail->u[1] = reg1;
+    rk_tail->u[2] = reg2;
+    rk_tail->u[3] = reg3;
+
+    return 0;
+}
+
+#else
+
 static const unsigned char sb1[256] = {
     0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5,
     0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
@@ -156,56 +895,57 @@ static const unsigned char sb4[256] = {
     0xf7, 0x4c, 0x11, 0x33, 0x03, 0xa2, 0xac, 0x60
 };
 
-static const ARIA_u128 c1 = {
+static const ARIA_u128 c1 = {{
     0x51, 0x7c, 0xc1, 0xb7, 0x27, 0x22, 0x0a, 0x94,
     0xfe, 0x13, 0xab, 0xe8, 0xfa, 0x9a, 0x6e, 0xe0
-};
+}};
 
-static const ARIA_u128 c2 = {
+static const ARIA_u128 c2 = {{
     0x6d, 0xb1, 0x4a, 0xcc, 0x9e, 0x21, 0xc8, 0x20,
     0xff, 0x28, 0xb1, 0xd5, 0xef, 0x5d, 0xe2, 0xb0
-};
+}};
 
-static const ARIA_u128 c3 = {
+static const ARIA_u128 c3 = {{
     0xdb, 0x92, 0x37, 0x1d, 0x21, 0x26, 0xe9, 0x70,
     0x03, 0x24, 0x97, 0x75, 0x04, 0xe8, 0xc9, 0x0e
-};
+}};
 
 /*
  * Exclusive or two 128 bit values into the result.
  * It is safe for the result to be the same as the either input.
  */
-static void xor128(ARIA_u128 o, const ARIA_u128 x, const ARIA_u128 y)
+static void xor128(ARIA_u128 *o, const ARIA_u128 *x, const ARIA_u128 *y)
 {
     int i;
 
     for (i = 0; i < ARIA_BLOCK_SIZE; i++)
-        o[i] = x[i] ^ y[i];
+        o->c[i] = x->c[i] ^ y->c[i];
 }
 
 /*
  * Generalised circular rotate right and exclusive or function.
  * It is safe for the output to overlap either input.
  */
-static ossl_inline void rotnr(unsigned int n, ARIA_u128 o, const ARIA_u128 xor,
-                              const ARIA_u128 z)
+static ossl_inline void rotnr(unsigned int n, ARIA_u128 *o,
+                              const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     const unsigned int bytes = n / 8, bits = n % 8;
     unsigned int i;
     ARIA_u128 t;
 
     for (i = 0; i < ARIA_BLOCK_SIZE; i++)
-        t[(i + bytes) % ARIA_BLOCK_SIZE] = z[i];
+        t.c[(i + bytes) % ARIA_BLOCK_SIZE] = z->c[i];
     for (i = 0; i < ARIA_BLOCK_SIZE; i++)
-        o[i] = ((t[i] >> bits) |
-                (t[i ? i - 1 : ARIA_BLOCK_SIZE - 1] << (8 - bits))) ^ xor[i];
+        o->c[i] = ((t.c[i] >> bits) |
+                (t.c[i ? i - 1 : ARIA_BLOCK_SIZE - 1] << (8 - bits))) ^
+                xor->c[i];
 }
 
 /*
  * Circular rotate 19 bits right and xor.
  * It is safe for the output to overlap either input.
  */
-static void rot19r(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
+static void rot19r(ARIA_u128 *o, const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     rotnr(19, o, xor, z);
 }
@@ -214,7 +954,7 @@ static void rot19r(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
  * Circular rotate 31 bits right and xor.
  * It is safe for the output to overlap either input.
  */
-static void rot31r(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
+static void rot31r(ARIA_u128 *o, const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     rotnr(31, o, xor, z);
 }
@@ -223,7 +963,7 @@ static void rot31r(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
  * Circular rotate 61 bits left and xor.
  * It is safe for the output to overlap either input.
  */
-static void rot61l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
+static void rot61l(ARIA_u128 *o, const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     rotnr(8 * ARIA_BLOCK_SIZE - 61, o, xor, z);
 }
@@ -232,7 +972,7 @@ static void rot61l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
  * Circular rotate 31 bits left and xor.
  * It is safe for the output to overlap either input.
  */
-static void rot31l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
+static void rot31l(ARIA_u128 *o, const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     rotnr(8 * ARIA_BLOCK_SIZE - 31, o, xor, z);
 }
@@ -241,7 +981,7 @@ static void rot31l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
  * Circular rotate 19 bits left and xor.
  * It is safe for the output to overlap either input.
  */
-static void rot19l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
+static void rot19l(ARIA_u128 *o, const ARIA_u128 *xor, const ARIA_u128 *z)
 {
     rotnr(8 * ARIA_BLOCK_SIZE - 19, o, xor, z);
 }
@@ -250,14 +990,14 @@ static void rot19l(ARIA_u128 o, const ARIA_u128 xor, const ARIA_u128 z)
  * First substitution and xor layer, used for odd steps.
  * It is safe for the input and output to be the same.
  */
-static void sl1(ARIA_u128 o, const ARIA_u128 x, const ARIA_u128 y)
+static void sl1(ARIA_u128 *o, const ARIA_u128 *x, const ARIA_u128 *y)
 {
     unsigned int i;
     for (i = 0; i < ARIA_BLOCK_SIZE; i += 4) {
-        o[i    ] = sb1[x[i    ] ^ y[i    ]];
-        o[i + 1] = sb2[x[i + 1] ^ y[i + 1]];
-        o[i + 2] = sb3[x[i + 2] ^ y[i + 2]];
-        o[i + 3] = sb4[x[i + 3] ^ y[i + 3]];
+        o->c[i    ] = sb1[x->c[i    ] ^ y->c[i    ]];
+        o->c[i + 1] = sb2[x->c[i + 1] ^ y->c[i + 1]];
+        o->c[i + 2] = sb3[x->c[i + 2] ^ y->c[i + 2]];
+        o->c[i + 3] = sb4[x->c[i + 3] ^ y->c[i + 3]];
     }
 }
 
@@ -265,14 +1005,14 @@ static void sl1(ARIA_u128 o, const ARIA_u128 x, const ARIA_u128 y)
  * Second substitution and xor layer, used for even steps.
  * It is safe for the input and output to be the same.
  */
-static void sl2(ARIA_u128 o, const ARIA_u128 x, const ARIA_u128 y)
+static void sl2(ARIA_u128 *o, const ARIA_u128 *x, const ARIA_u128 *y)
 {
     unsigned int i;
     for (i = 0; i < ARIA_BLOCK_SIZE; i += 4) {
-        o[i    ] = sb3[x[i    ] ^ y[i    ]];
-        o[i + 1] = sb4[x[i + 1] ^ y[i + 1]];
-        o[i + 2] = sb1[x[i + 2] ^ y[i + 2]];
-        o[i + 3] = sb2[x[i + 3] ^ y[i + 3]];
+        o->c[i    ] = sb3[x->c[i    ] ^ y->c[i    ]];
+        o->c[i + 1] = sb4[x->c[i + 1] ^ y->c[i + 1]];
+        o->c[i + 2] = sb1[x->c[i + 2] ^ y->c[i + 2]];
+        o->c[i + 3] = sb2[x->c[i + 3] ^ y->c[i + 3]];
     }
 }
 
@@ -280,24 +1020,40 @@ static void sl2(ARIA_u128 o, const ARIA_u128 x, const ARIA_u128 y)
  * Diffusion layer step
  * It is NOT safe for the input and output to overlap.
  */
-static void a(ARIA_u128 y, const ARIA_u128 x)
+static void a(ARIA_u128 *y, const ARIA_u128 *x)
 {
-    y[ 0] = x[3] ^ x[4] ^ x[6] ^ x[ 8] ^ x[ 9] ^ x[13] ^ x[14];
-    y[ 1] = x[2] ^ x[5] ^ x[7] ^ x[ 8] ^ x[ 9] ^ x[12] ^ x[15];
-    y[ 2] = x[1] ^ x[4] ^ x[6] ^ x[10] ^ x[11] ^ x[12] ^ x[15];
-    y[ 3] = x[0] ^ x[5] ^ x[7] ^ x[10] ^ x[11] ^ x[13] ^ x[14];
-    y[ 4] = x[0] ^ x[2] ^ x[5] ^ x[ 8] ^ x[11] ^ x[14] ^ x[15];
-    y[ 5] = x[1] ^ x[3] ^ x[4] ^ x[ 9] ^ x[10] ^ x[14] ^ x[15];
-    y[ 6] = x[0] ^ x[2] ^ x[7] ^ x[ 9] ^ x[10] ^ x[12] ^ x[13];
-    y[ 7] = x[1] ^ x[3] ^ x[6] ^ x[ 8] ^ x[11] ^ x[12] ^ x[13];
-    y[ 8] = x[0] ^ x[1] ^ x[4] ^ x[ 7] ^ x[10] ^ x[13] ^ x[15];
-    y[ 9] = x[0] ^ x[1] ^ x[5] ^ x[ 6] ^ x[11] ^ x[12] ^ x[14];
-    y[10] = x[2] ^ x[3] ^ x[5] ^ x[ 6] ^ x[ 8] ^ x[13] ^ x[15];
-    y[11] = x[2] ^ x[3] ^ x[4] ^ x[ 7] ^ x[ 9] ^ x[12] ^ x[14];
-    y[12] = x[1] ^ x[2] ^ x[6] ^ x[ 7] ^ x[ 9] ^ x[11] ^ x[12];
-    y[13] = x[0] ^ x[3] ^ x[6] ^ x[ 7] ^ x[ 8] ^ x[10] ^ x[13];
-    y[14] = x[0] ^ x[3] ^ x[4] ^ x[ 5] ^ x[ 9] ^ x[11] ^ x[14];
-    y[15] = x[1] ^ x[2] ^ x[4] ^ x[ 5] ^ x[ 8] ^ x[10] ^ x[15];
+    y->c[ 0] = x->c[ 3] ^ x->c[ 4] ^ x->c[ 6] ^ x->c[ 8] ^
+               x->c[ 9] ^ x->c[13] ^ x->c[14];
+    y->c[ 1] = x->c[ 2] ^ x->c[ 5] ^ x->c[ 7] ^ x->c[ 8] ^
+               x->c[ 9] ^ x->c[12] ^ x->c[15];
+    y->c[ 2] = x->c[ 1] ^ x->c[ 4] ^ x->c[ 6] ^ x->c[10] ^
+               x->c[11] ^ x->c[12] ^ x->c[15];
+    y->c[ 3] = x->c[ 0] ^ x->c[ 5] ^ x->c[ 7] ^ x->c[10] ^
+               x->c[11] ^ x->c[13] ^ x->c[14];
+    y->c[ 4] = x->c[ 0] ^ x->c[ 2] ^ x->c[ 5] ^ x->c[ 8] ^
+               x->c[11] ^ x->c[14] ^ x->c[15];
+    y->c[ 5] = x->c[ 1] ^ x->c[ 3] ^ x->c[ 4] ^ x->c[ 9] ^
+               x->c[10] ^ x->c[14] ^ x->c[15];
+    y->c[ 6] = x->c[ 0] ^ x->c[ 2] ^ x->c[ 7] ^ x->c[ 9] ^
+               x->c[10] ^ x->c[12] ^ x->c[13];
+    y->c[ 7] = x->c[ 1] ^ x->c[ 3] ^ x->c[ 6] ^ x->c[ 8] ^
+               x->c[11] ^ x->c[12] ^ x->c[13];
+    y->c[ 8] = x->c[ 0] ^ x->c[ 1] ^ x->c[ 4] ^ x->c[ 7] ^
+               x->c[10] ^ x->c[13] ^ x->c[15];
+    y->c[ 9] = x->c[ 0] ^ x->c[ 1] ^ x->c[ 5] ^ x->c[ 6] ^
+               x->c[11] ^ x->c[12] ^ x->c[14];
+    y->c[10] = x->c[ 2] ^ x->c[ 3] ^ x->c[ 5] ^ x->c[ 6] ^
+               x->c[ 8] ^ x->c[13] ^ x->c[15];
+    y->c[11] = x->c[ 2] ^ x->c[ 3] ^ x->c[ 4] ^ x->c[ 7] ^
+               x->c[ 9] ^ x->c[12] ^ x->c[14];
+    y->c[12] = x->c[ 1] ^ x->c[ 2] ^ x->c[ 6] ^ x->c[ 7] ^
+               x->c[ 9] ^ x->c[11] ^ x->c[12];
+    y->c[13] = x->c[ 0] ^ x->c[ 3] ^ x->c[ 6] ^ x->c[ 7] ^
+               x->c[ 8] ^ x->c[10] ^ x->c[13];
+    y->c[14] = x->c[ 0] ^ x->c[ 3] ^ x->c[ 4] ^ x->c[ 5] ^
+               x->c[ 9] ^ x->c[11] ^ x->c[14];
+    y->c[15] = x->c[ 1] ^ x->c[ 2] ^ x->c[ 4] ^ x->c[ 5] ^
+               x->c[ 8] ^ x->c[10] ^ x->c[15];
 }
 
 /*
@@ -305,12 +1061,13 @@ static void a(ARIA_u128 y, const ARIA_u128 x)
  * Apply the first substitution layer and then a diffusion step.
  * It is safe for the input and output to overlap.
  */
-static ossl_inline void FO(ARIA_u128 o, const ARIA_u128 d, const ARIA_u128 rk)
+static ossl_inline void FO(ARIA_u128 *o, const ARIA_u128 *d,
+                           const ARIA_u128 *rk)
 {
-       ARIA_u128 y;
+    ARIA_u128 y;
 
-       sl1(y, d, rk);
-       a(o, y);
+    sl1(&y, d, rk);
+    a(o, &y);
 }
 
 /*
@@ -318,32 +1075,34 @@ static ossl_inline void FO(ARIA_u128 o, const ARIA_u128 d, const ARIA_u128 rk)
  * Apply the second substitution layer and then a diffusion step.
  * It is safe for the input and output to overlap.
  */
-static ossl_inline void FE(ARIA_u128 o, const ARIA_u128 d, const ARIA_u128 rk)
+static ossl_inline void FE(ARIA_u128 *o, const ARIA_u128 *d,
+                           const ARIA_u128 *rk)
 {
-       ARIA_u128 y;
+    ARIA_u128 y;
 
-       sl2(y, d, rk);
-       a(o, y);
+    sl2(&y, d, rk);
+    a(o, &y);
 }
 
 /*
  * Encrypt or decrypt a single block
  * in and out can overlap
  */
-static void do_encrypt(ARIA_u128 o, const ARIA_u128 pin, unsigned int rounds,
-                       const ARIA_u128 *keys)
+static void do_encrypt(unsigned char *o, const unsigned char *pin,
+                       unsigned int rounds, const ARIA_u128 *keys)
 {
     ARIA_u128 p;
+    ARIA_u128 *o128 = (ARIA_u128 *)o;
     unsigned int i;
 
-    memcpy(p, pin, sizeof(p));
+    memcpy(&p, pin, sizeof(p));
     for (i = 0; i < rounds - 2; i += 2) {
-        FO(p, p, keys[i]);
-        FE(p, p, keys[i + 1]);
+        FO(&p, &p, &keys[i]);
+        FE(&p, &p, &keys[i + 1]);
     }
-    FO(p, p, keys[rounds - 2]);
-    sl2(o, p, keys[rounds - 1]);
-    xor128(o, o, keys[rounds]);
+    FO(&p, &p, &keys[rounds - 2]);
+    sl2(o128, &p, &keys[rounds - 1]);
+    xor128(o128, o128, &keys[rounds]);
 }
 
 /*
@@ -366,68 +1125,68 @@ void aria_encrypt(const unsigned char *in, unsigned char *out,
 int aria_set_encrypt_key(const unsigned char *userKey, const int bits,
                          ARIA_KEY *key)
 {
-    const unsigned char *ck1, *ck2, *ck3;
+    const ARIA_u128 *ck1, *ck2, *ck3;
     ARIA_u128 kr, w0, w1, w2, w3;
 
     if (!userKey || !key)
         return -1;
-    memcpy(w0, userKey, sizeof(w0));
+    memcpy(w0.c, userKey, sizeof(w0));
     switch (bits) {
     default:
         return -2;
     case 128:
         key->rounds = 12;
-        ck1 = c1;
-        ck2 = c2;
-        ck3 = c3;
-        memset(kr, 0, sizeof(kr));
+        ck1 = &c1;
+        ck2 = &c2;
+        ck3 = &c3;
+        memset(kr.c, 0, sizeof(kr));
         break;
 
     case 192:
         key->rounds = 14;
-        ck1 = c2;
-        ck2 = c3;
-        ck3 = c1;
-        memcpy(kr, userKey + ARIA_BLOCK_SIZE, sizeof(kr) / 2);
-        memset(kr + ARIA_BLOCK_SIZE / 2, 0, sizeof(kr) / 2);
+        ck1 = &c2;
+        ck2 = &c3;
+        ck3 = &c1;
+        memcpy(kr.c, userKey + ARIA_BLOCK_SIZE, sizeof(kr) / 2);
+        memset(kr.c + ARIA_BLOCK_SIZE / 2, 0, sizeof(kr) / 2);
         break;
 
     case 256:
         key->rounds = 16;
-        ck1 = c3;
-        ck2 = c1;
-        ck3 = c2;
-        memcpy(kr, userKey + ARIA_BLOCK_SIZE, sizeof(kr));
+        ck1 = &c3;
+        ck2 = &c1;
+        ck3 = &c2;
+        memcpy(kr.c, userKey + ARIA_BLOCK_SIZE, sizeof(kr));
         break;
     }
 
-    FO(w3, w0, ck1);   xor128(w1, w3, kr);
-    FE(w3, w1, ck2);   xor128(w2, w3, w0);
-    FO(kr, w2, ck3);   xor128(w3, kr, w1);
+    FO(&w3, &w0, ck1);    xor128(&w1, &w3, &kr);
+    FE(&w3, &w1, ck2);    xor128(&w2, &w3, &w0);
+    FO(&kr, &w2, ck3);    xor128(&w3, &kr, &w1);
 
-    rot19r(key->rd_key[ 0], w0, w1);
-    rot19r(key->rd_key[ 1], w1, w2);
-    rot19r(key->rd_key[ 2], w2, w3);
-    rot19r(key->rd_key[ 3], w3, w0);
+    rot19r(&key->rd_key[ 0], &w0, &w1);
+    rot19r(&key->rd_key[ 1], &w1, &w2);
+    rot19r(&key->rd_key[ 2], &w2, &w3);
+    rot19r(&key->rd_key[ 3], &w3, &w0);
 
-    rot31r(key->rd_key[ 4], w0, w1);
-    rot31r(key->rd_key[ 5], w1, w2);
-    rot31r(key->rd_key[ 6], w2, w3);
-    rot31r(key->rd_key[ 7], w3, w0);
+    rot31r(&key->rd_key[ 4], &w0, &w1);
+    rot31r(&key->rd_key[ 5], &w1, &w2);
+    rot31r(&key->rd_key[ 6], &w2, &w3);
+    rot31r(&key->rd_key[ 7], &w3, &w0);
 
-    rot61l(key->rd_key[ 8], w0, w1);
-    rot61l(key->rd_key[ 9], w1, w2);
-    rot61l(key->rd_key[10], w2, w3);
-    rot61l(key->rd_key[11], w3, w0);
+    rot61l(&key->rd_key[ 8], &w0, &w1);
+    rot61l(&key->rd_key[ 9], &w1, &w2);
+    rot61l(&key->rd_key[10], &w2, &w3);
+    rot61l(&key->rd_key[11], &w3, &w0);
 
-    rot31l(key->rd_key[12], w0, w1);
+    rot31l(&key->rd_key[12], &w0, &w1);
     if (key->rounds > 12) {
-        rot31l(key->rd_key[13], w1, w2);
-        rot31l(key->rd_key[14], w2, w3);
+        rot31l(&key->rd_key[13], &w1, &w2);
+        rot31l(&key->rd_key[14], &w2, &w3);
 
         if (key->rounds > 14) {
-            rot31l(key->rd_key[15], w3, w0);
-            rot19l(key->rd_key[16], w0, w1);
+            rot31l(&key->rd_key[15], &w3, &w0);
+            rot19l(&key->rd_key[16], &w0, &w1);
         }
     }
     return 0;
@@ -445,10 +1204,12 @@ int aria_set_decrypt_key(const unsigned char *userKey, const int bits,
 
     if (r == 0) {
         key->rounds = rounds;
-        memcpy(key->rd_key[0], ek.rd_key[rounds], sizeof(key->rd_key[0]));
+        memcpy(&key->rd_key[0], &ek.rd_key[rounds], sizeof(key->rd_key[0]));
         for (i = 1; i < rounds; i++)
-            a(key->rd_key[i], ek.rd_key[rounds - i]);
-        memcpy(key->rd_key[rounds], ek.rd_key[0], sizeof(key->rd_key[rounds]));
+            a(&key->rd_key[i], &ek.rd_key[rounds - i]);
+        memcpy(&key->rd_key[rounds], &ek.rd_key[0], sizeof(key->rd_key[rounds]));
     }
     return r;
 }
+
+#endif
index 4d37189..59b6f4f 100644 (file)
@@ -11,6 +11,8 @@
  * Copyright (c) 2017 Oracle and/or its affiliates.  All rights reserved.
  */
 
+ /* Copyright (c) 2017 National Security Resarch Institute.  All rights reserved. */
+
 #ifndef HEADER_ARIA_H
 # define HEADER_ARIA_H
 
@@ -20,8 +22,6 @@
 #  error ARIA is disabled.
 # endif
 
-# include <stddef.h>
-
 # define ARIA_ENCRYPT     1
 # define ARIA_DECRYPT     0
 
 extern "C" {
 # endif
 
-typedef unsigned char ARIA_u128[ARIA_BLOCK_SIZE];
+typedef union {
+    unsigned char c[ARIA_BLOCK_SIZE];
+    unsigned int u[ARIA_BLOCK_SIZE / sizeof(unsigned int)];
+} ARIA_u128;
 
 struct aria_key_st {
-    unsigned int rounds;
     ARIA_u128 rd_key[ARIA_MAX_KEYS];
+    unsigned int rounds;
 };
 typedef struct aria_key_st ARIA_KEY;