d323e265c05ad8c3ec8030db58f8ef780ba038f6
[openssl.git] / crypto / aes / aes_x86core.c
1 /* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2 /**
3  * rijndael-alg-fst.c
4  *
5  * @version 3.0 (December 2000)
6  *
7  * Optimised ANSI C code for the Rijndael cipher (now AES)
8  *
9  * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10  * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11  * @author Paulo Barreto <paulo.barreto@terra.com.br>
12  *
13  * This code is hereby placed in the public domain.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
/*
 * This is an experimental x86[_64] derivative. It assumes little-endian
 * byte order and expects the CPU to tolerate unaligned memory references.
 * It is used as a playground for cache-timing attack mitigations and
 * serves as the reference C implementation for the x86[_64] assembler.
 *
 *                                      <appro@fy.chalmers.se>
 */
36
37
38 #ifndef AES_DEBUG
39 # ifndef NDEBUG
40 #  define NDEBUG
41 # endif
42 #endif
43 #include <assert.h>
44
45 #include <stdlib.h>
46 #include <openssl/aes.h>
47 #include "aes_locl.h"
48
/*
 * These two parameters control which table, the 256-byte one or the
 * 2KB one, is referenced in the outer and in the inner rounds
 * respectively.
 */
53 #define AES_COMPACT_IN_OUTER_ROUNDS
54 #ifdef  AES_COMPACT_IN_OUTER_ROUNDS
/* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
 * adding AES_COMPACT_IN_INNER_ROUNDS reduces the benchmark result
 * *further* by a factor of ~2. */
58 # undef  AES_COMPACT_IN_INNER_ROUNDS
59 #endif
60
#if 1
/*
 * Read one unsigned long out of every 32-byte stretch of the 256 bytes
 * starting at |table|.  Both the reads and the final store go through
 * volatile objects so the compiler has to perform them; the XOR-ed value
 * itself is never used.  The intent is to have the whole table touched
 * before it is indexed with data-dependent offsets.
 */
static void prefetch256(const void *table)
{
        volatile unsigned long *words = (void *)table;
        volatile unsigned long sink;
        unsigned long acc = 0;
        int idx = 0;

        /* 32 bytes is taken as the smallest common cache-line size */
        while (idx < 256/sizeof(words[0])) {
                acc ^= words[idx];
                idx += 32/sizeof(words[0]);
        }

        sink = acc;
}
#else
# define prefetch256(t)
#endif
76
/*
 * Replace any earlier GETU32 with a plain native-order 32-bit load
 * straight from |p| (no byte swapping, no alignment handling).
 */
#undef GETU32
#define GETU32(p) (*(u32 *)(p))
79
/*
 * u64 is the 64-bit unsigned type used for the combined tables and
 * U64(C) attaches the matching literal suffix to the constant C.
 */
#if !defined(__MINGW32__) && (defined(_WIN32) || defined(_WIN64))
/* Windows toolchains other than MinGW: use the __int64 spelling */
typedef unsigned __int64 u64;
# define U64(C) C##UI64
#elif defined(__arch64__)
/* targets defining __arch64__: unsigned long is used as the 64-bit type */
typedef unsigned long u64;
# define U64(C) C##UL
#else
typedef unsigned long long u64;
# define U64(C) C##ULL
#endif
90
/*
 * ROTATE(a,n): rotate the 32-bit value |a| left by |n| bits.
 *
 * The macro is only defined where a native rotate is known to be
 * available; users are expected to test defined(ROTATE) and provide a
 * shift-based fallback otherwise.
 *
 * In the GNU C variant |n| must be a compile-time constant (it is passed
 * with the "I" constraint).  __asm__ is used instead of asm so the macro
 * also builds with -std=c99/-std=c11, and the temporary has a
 * macro-specific name so that an argument expression mentioning a caller
 * variable called `ret' is not captured by it.
 *
 * NOTE(review): _lrotl operates on unsigned long; confirm it still yields
 * a 32-bit rotate on ICC targets where long is 64 bits wide.
 */
#undef ROTATE
#if defined(_MSC_VER) || defined(__ICC)
# define ROTATE(a,n)    _lrotl(a,n)
#elif defined(__GNUC__) && __GNUC__>=2
# if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
#   define ROTATE(a,n)  ({ register unsigned int rotate_ret_;  \
                                __asm__ (                       \
                                "roll %1,%0"                    \
                                : "=r"(rotate_ret_)             \
                                : "I"(n), "0"(a)                \
                                : "cc");                        \
                           rotate_ret_;                         \
                        })
# endif
#endif
/*
 * Every Te entry stores its 4-byte column twice, so a load that starts
 * 0..3 bytes into an entry and is truncated to 32 bits yields the column
 * rotated by whole bytes (little-endian byte order assumed):
 *
 * Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
 * Te0[x] = S [x].[02, 01, 01, 03];
 * Te1[x] = S [x].[03, 02, 01, 01];
 * Te2[x] = S [x].[01, 03, 02, 01];
 * Te3[x] = S [x].[01, 01, 03, 02];
 *
 * The macros are meant to be followed by an index, e.g. Te1[i].  The
 * subscript binds tighter than the leading (u32) cast, so the expression
 * reads a u64 at byte offset 8*i+k from Te and then narrows it to u32.
 *
 * NOTE(review): for a non-zero offset and i == 255 the u64-wide read
 * nominally extends up to 3 bytes beyond the 256-entry table; only the
 * low 32 bits, which lie inside the table, are kept.
 */
#define Te0 (u32)((const u64 *)((const u8 *)Te + 0))
#define Te1 (u32)((const u64 *)((const u8 *)Te + 3))
#define Te2 (u32)((const u64 *)((const u8 *)Te + 2))
#define Te3 (u32)((const u64 *)((const u8 *)Te + 1))
/*
 * Decryption counterpart of the Te0..Te3 views: every Td entry stores its
 * 4-byte column twice, and a load starting 0..3 bytes into an entry,
 * truncated to 32 bits, yields the byte-rotated column (little-endian
 * byte order assumed):
 *
 * Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
 * Td0[x] = Si[x].[0e, 09, 0d, 0b];
 * Td1[x] = Si[x].[0b, 0e, 09, 0d];
 * Td2[x] = Si[x].[0d, 0b, 0e, 09];
 * Td3[x] = Si[x].[09, 0d, 0b, 0e];
 * Td4[x] = Si[x].[01];
 *
 * The macros are meant to be followed by an index, e.g. Td1[i].  The
 * subscript binds tighter than the leading (u32) cast, so the expression
 * reads a u64 at byte offset 8*i+k from Td and then narrows it to u32.
 *
 * NOTE(review): for a non-zero offset and i == 255 the u64-wide read
 * nominally extends up to 3 bytes beyond the 256-entry table; only the
 * low 32 bits, which lie inside the table, are kept.
 */
#define Td0 (u32)((const u64 *)((const u8 *)Td + 0))
#define Td1 (u32)((const u64 *)((const u8 *)Td + 3))
#define Td2 (u32)((const u64 *)((const u8 *)Td + 2))
#define Td3 (u32)((const u64 *)((const u8 *)Td + 1))
129
130 static const u64 Te[256] = {
131     U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
132     U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
133     U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
134     U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
135     U64(0x5030306050303060), U64(0x0301010203010102),
136     U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
137     U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
138     U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
139     U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
140     U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
141     U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
142     U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
143     U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
144     U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
145     U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
146     U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
147     U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
148     U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
149     U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
150     U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
151     U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
152     U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
153     U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
154     U64(0x5331316253313162), U64(0x3f15152a3f15152a),
155     U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
156     U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
157     U64(0x2818183028181830), U64(0xa1969637a1969637),
158     U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
159     U64(0x0907070e0907070e), U64(0x3612122436121224),
160     U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
161     U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
162     U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
163     U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
164     U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
165     U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
166     U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
167     U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
168     U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
169     U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
170     U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
171     U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
172     U64(0x0000000000000000), U64(0x2cededc12cededc1),
173     U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
174     U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
175     U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
176     U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
177     U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
178     U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
179     U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
180     U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
181     U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
182     U64(0x5533336655333366), U64(0x9485851194858511),
183     U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
184     U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
185     U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
186     U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
187     U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
188     U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
189     U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
190     U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
191     U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
192     U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
193     U64(0x3010102030101020), U64(0x1affffe51affffe5),
194     U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
195     U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
196     U64(0x3513132635131326), U64(0x2fececc32fececc3),
197     U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
198     U64(0xcc444488cc444488), U64(0x3917172e3917172e),
199     U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
200     U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
201     U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
202     U64(0x2b1919322b191932), U64(0x957373e6957373e6),
203     U64(0xa06060c0a06060c0), U64(0x9881811998818119),
204     U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
205     U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
206     U64(0xab90903bab90903b), U64(0x8388880b8388880b),
207     U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
208     U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
209     U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
210     U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
211     U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
212     U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
213     U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
214     U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
215     U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
216     U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
217     U64(0xa8919139a8919139), U64(0xa4959531a4959531),
218     U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
219     U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
220     U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
221     U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
222     U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
223     U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
224     U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
225     U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
226     U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
227     U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
228     U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
229     U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
230     U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
231     U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
232     U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
233     U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
234     U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
235     U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
236     U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
237     U64(0xd8484890d8484890), U64(0x0503030605030306),
238     U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
239     U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
240     U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
241     U64(0x9186861791868617), U64(0x58c1c19958c1c199),
242     U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
243     U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
244     U64(0xb398982bb398982b), U64(0x3311112233111122),
245     U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
246     U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
247     U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
248     U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
249     U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
250     U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
251     U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
252     U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
253     U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
254     U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
255     U64(0xc3414182c3414182), U64(0xb0999929b0999929),
256     U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
257     U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
258     U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
259 };
260
261 static const u8 Te4[256] = {
262     0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
263     0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
264     0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
265     0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
266     0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
267     0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
268     0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
269     0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
270     0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
271     0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
272     0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
273     0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
274     0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
275     0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
276     0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
277     0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
278     0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
279     0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
280     0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
281     0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
282     0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
283     0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
284     0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
285     0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
286     0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
287     0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
288     0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
289     0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
290     0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
291     0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
292     0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
293     0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
294 };
295
296 static const u64 Td[256] = {
297     U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
298     U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
299     U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
300     U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
301     U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
302     U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
303     U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
304     U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
305     U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
306     U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
307     U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
308     U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
309     U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
310     U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
311     U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
312     U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
313     U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
314     U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
315     U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
316     U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
317     U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
318     U64(0x6033519760335197), U64(0x457f5362457f5362),
319     U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
320     U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
321     U64(0x5868487058684870), U64(0x19fd458f19fd458f),
322     U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
323     U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
324     U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
325     U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
326     U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
327     U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
328     U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
329     U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
330     U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
331     U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
332     U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
333     U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
334     U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
335     U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
336     U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
337     U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
338     U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
339     U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
340     U64(0x6fd406046fd40604), U64(0xff155060ff155060),
341     U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
342     U64(0xcc434089cc434089), U64(0x779ed967779ed967),
343     U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
344     U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
345     U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
346     U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
347     U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
348     U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
349     U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
350     U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
351     U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
352     U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
353     U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
354     U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
355     U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
356     U64(0x694b775a694b775a), U64(0x161a121c161a121c),
357     U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
358     U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
359     U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
360     U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
361     U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
362     U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
363     U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
364     U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
365     U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
366     U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
367     U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
368     U64(0x4022971340229713), U64(0x2011c6842011c684),
369     U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
370     U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
371     U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
372     U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
373     U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
374     U64(0xfa489411fa489411), U64(0x2264e9472264e947),
375     U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
376     U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
377     U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
378     U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
379     U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
380     U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
381     U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
382     U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
383     U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
384     U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
385     U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
386     U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
387     U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
388     U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
389     U64(0x097826cd097826cd), U64(0xf418596ef418596e),
390     U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
391     U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
392     U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
393     U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
394     U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
395     U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
396     U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
397     U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
398     U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
399     U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
400     U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
401     U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
402     U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
403     U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
404     U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
405     U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
406     U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
407     U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
408     U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
409     U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
410     U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
411     U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
412     U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
413     U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
414     U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
415     U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
416     U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
417     U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
418     U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
419     U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
420     U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
421     U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
422     U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
423     U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
424     U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
425 };
426 static const u8 Td4[256] = {
427     0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
428     0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
429     0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
430     0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
431     0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
432     0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
433     0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
434     0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
435     0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
436     0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
437     0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
438     0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
439     0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
440     0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
441     0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
442     0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
443     0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
444     0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
445     0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
446     0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
447     0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
448     0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
449     0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
450     0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
451     0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
452     0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
453     0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
454     0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
455     0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
456     0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
457     0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
458     0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
459 };
460
461 static const u32 rcon[] = {
462     0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
463     0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
464     0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
465 };
466
467 /**
468  * Expand the cipher key into the encryption key schedule.
469  */
470 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
471                         AES_KEY *key) {
472
473         u32 *rk;
474         int i = 0;
475         u32 temp;
476
477         if (!userKey || !key)
478                 return -1;
479         if (bits != 128 && bits != 192 && bits != 256)
480                 return -2;
481
482         rk = key->rd_key;
483
484         if (bits==128)
485                 key->rounds = 10;
486         else if (bits==192)
487                 key->rounds = 12;
488         else
489                 key->rounds = 14;
490
491         rk[0] = GETU32(userKey     );
492         rk[1] = GETU32(userKey +  4);
493         rk[2] = GETU32(userKey +  8);
494         rk[3] = GETU32(userKey + 12);
495         if (bits == 128) {
496                 while (1) {
497                         temp  = rk[3];
498                         rk[4] = rk[0] ^
499                                 (Te4[(temp >>  8) & 0xff]      ) ^
500                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
501                                 (Te4[(temp >> 24)       ] << 16) ^
502                                 (Te4[(temp      ) & 0xff] << 24) ^
503                                 rcon[i];
504                         rk[5] = rk[1] ^ rk[4];
505                         rk[6] = rk[2] ^ rk[5];
506                         rk[7] = rk[3] ^ rk[6];
507                         if (++i == 10) {
508                                 return 0;
509                         }
510                         rk += 4;
511                 }
512         }
513         rk[4] = GETU32(userKey + 16);
514         rk[5] = GETU32(userKey + 20);
515         if (bits == 192) {
516                 while (1) {
517                         temp = rk[ 5];
518                         rk[ 6] = rk[ 0] ^
519                                 (Te4[(temp >>  8) & 0xff]      ) ^
520                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
521                                 (Te4[(temp >> 24)       ] << 16) ^
522                                 (Te4[(temp      ) & 0xff] << 24) ^
523                                 rcon[i];
524                         rk[ 7] = rk[ 1] ^ rk[ 6];
525                         rk[ 8] = rk[ 2] ^ rk[ 7];
526                         rk[ 9] = rk[ 3] ^ rk[ 8];
527                         if (++i == 8) {
528                                 return 0;
529                         }
530                         rk[10] = rk[ 4] ^ rk[ 9];
531                         rk[11] = rk[ 5] ^ rk[10];
532                         rk += 6;
533                 }
534         }
535         rk[6] = GETU32(userKey + 24);
536         rk[7] = GETU32(userKey + 28);
537         if (bits == 256) {
538                 while (1) {
539                         temp = rk[ 7];
540                         rk[ 8] = rk[ 0] ^
541                                 (Te4[(temp >>  8) & 0xff]      ) ^
542                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
543                                 (Te4[(temp >> 24)       ] << 16) ^
544                                 (Te4[(temp      ) & 0xff] << 24) ^
545                                 rcon[i];
546                         rk[ 9] = rk[ 1] ^ rk[ 8];
547                         rk[10] = rk[ 2] ^ rk[ 9];
548                         rk[11] = rk[ 3] ^ rk[10];
549                         if (++i == 7) {
550                                 return 0;
551                         }
552                         temp = rk[11];
553                         rk[12] = rk[ 4] ^
554                                 (Te4[(temp      ) & 0xff]      ) ^
555                                 (Te4[(temp >>  8) & 0xff] <<  8) ^
556                                 (Te4[(temp >> 16) & 0xff] << 16) ^
557                                 (Te4[(temp >> 24)       ] << 24);
558                         rk[13] = rk[ 5] ^ rk[12];
559                         rk[14] = rk[ 6] ^ rk[13];
560                         rk[15] = rk[ 7] ^ rk[14];
561
562                         rk += 8;
563                 }
564         }
565         return 0;
566 }
567
568 /**
569  * Expand the cipher key into the decryption key schedule.
570  */
571 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
572                          AES_KEY *key) {
573
574         u32 *rk;
575         int i, j, status;
576         u32 temp;
577
578         /* first, start with an encryption schedule */
579         status = AES_set_encrypt_key(userKey, bits, key);
580         if (status < 0)
581                 return status;
582
583         rk = key->rd_key;
584
585         /* invert the order of the round keys: */
586         for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
587                 temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
588                 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
589                 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
590                 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
591         }
592         /* apply the inverse MixColumn transform to all round keys but the first and the last: */
593         for (i = 1; i < (key->rounds); i++) {
594                 rk += 4;
595 #if 1
596                 for (j = 0; j < 4; j++) {
597                         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
598
599                         tp1 = rk[j];
600                         m = tp1 & 0x80808080;
601                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
602                                 ((m - (m >> 7)) & 0x1b1b1b1b);
603                         m = tp2 & 0x80808080;
604                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
605                                 ((m - (m >> 7)) & 0x1b1b1b1b);
606                         m = tp4 & 0x80808080;
607                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
608                                 ((m - (m >> 7)) & 0x1b1b1b1b);
609                         tp9 = tp8 ^ tp1;
610                         tpb = tp9 ^ tp2;
611                         tpd = tp9 ^ tp4;
612                         tpe = tp8 ^ tp4 ^ tp2;
613 #if defined(ROTATE)
614                         rk[j] = tpe ^ ROTATE(tpd,16) ^
615                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
616 #else
617                         rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
618                                 (tp9 >> 24) ^ (tp9 << 8) ^
619                                 (tpb >> 8) ^ (tpb << 24);
620 #endif
621                 }
622 #else
623                 rk[0] =
624                         Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
625                         Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
626                         Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
627                         Td3[Te2[(rk[0] >> 24)       ] & 0xff];
628                 rk[1] =
629                         Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
630                         Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
631                         Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
632                         Td3[Te2[(rk[1] >> 24)       ] & 0xff];
633                 rk[2] =
634                         Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
635                         Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
636                         Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
637                         Td3[Te2[(rk[2] >> 24)       ] & 0xff];
638                 rk[3] =
639                         Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
640                         Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
641                         Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
642                         Td3[Te2[(rk[3] >> 24)       ] & 0xff];
643 #endif
644         }
645         return 0;
646 }
647
648 /*
649  * Encrypt a single block
650  * in and out can overlap
651  */
652 void AES_encrypt(const unsigned char *in, unsigned char *out,
653                  const AES_KEY *key) {
654
655         const u32 *rk;
656         u32 s0, s1, s2, s3, t[4];
657         int r;
658
659         assert(in && out && key);
660         rk = key->rd_key;
661
662         /*
663          * map byte array block to cipher state
664          * and add initial round key:
665          */
666         s0 = GETU32(in     ) ^ rk[0];
667         s1 = GETU32(in +  4) ^ rk[1];
668         s2 = GETU32(in +  8) ^ rk[2];
669         s3 = GETU32(in + 12) ^ rk[3];
670
671 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
672         prefetch256(Te4);
673
674         t[0] =  Te4[(s0      ) & 0xff]       ^
675                 Te4[(s1 >>  8) & 0xff] <<  8 ^
676                 Te4[(s2 >> 16) & 0xff] << 16 ^
677                 Te4[(s3 >> 24)       ] << 24;
678         t[1] =  Te4[(s1      ) & 0xff]       ^
679                 Te4[(s2 >>  8) & 0xff] <<  8 ^
680                 Te4[(s3 >> 16) & 0xff] << 16 ^
681                 Te4[(s0 >> 24)       ] << 24;
682         t[2] =  Te4[(s2      ) & 0xff]       ^
683                 Te4[(s3 >>  8) & 0xff] <<  8 ^
684                 Te4[(s0 >> 16) & 0xff] << 16 ^
685                 Te4[(s1 >> 24)       ] << 24;
686         t[3] =  Te4[(s3      ) & 0xff]       ^
687                 Te4[(s0 >>  8) & 0xff] <<  8 ^
688                 Te4[(s1 >> 16) & 0xff] << 16 ^
689                 Te4[(s2 >> 24)       ] << 24;
690
691         /* now do the linear transform using words */
692         {       int i;
693                 u32 r0, r1, r2;
694
695                 for (i = 0; i < 4; i++) {
696                         r0 = t[i];
697                         r1 = r0 & 0x80808080;
698                         r2 = ((r0 & 0x7f7f7f7f) << 1) ^
699                                 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
700 #if defined(ROTATE)
701                         t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
702                                 ROTATE(r0,16) ^ ROTATE(r0,8);
703 #else
704                         t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
705                                 (r0 << 16) ^ (r0 >> 16) ^
706                                 (r0 << 8) ^ (r0 >> 24);
707 #endif
708                         t[i] ^= rk[4+i];
709                 }
710         }
711 #else
712         t[0] =  Te0[(s0      ) & 0xff] ^
713                 Te1[(s1 >>  8) & 0xff] ^
714                 Te2[(s2 >> 16) & 0xff] ^
715                 Te3[(s3 >> 24)       ] ^
716                 rk[4];
717         t[1] =  Te0[(s1      ) & 0xff] ^
718                 Te1[(s2 >>  8) & 0xff] ^
719                 Te2[(s3 >> 16) & 0xff] ^
720                 Te3[(s0 >> 24)       ] ^
721                 rk[5];
722         t[2] =  Te0[(s2      ) & 0xff] ^
723                 Te1[(s3 >>  8) & 0xff] ^
724                 Te2[(s0 >> 16) & 0xff] ^
725                 Te3[(s1 >> 24)       ] ^
726                 rk[6];
727         t[3] =  Te0[(s3      ) & 0xff] ^
728                 Te1[(s0 >>  8) & 0xff] ^
729                 Te2[(s1 >> 16) & 0xff] ^
730                 Te3[(s2 >> 24)       ] ^
731                 rk[7];
732 #endif
733         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
734
735     /*
736      * Nr - 2 full rounds:
737      */
738     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
739 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
740         t[0] =  Te4[(s0      ) & 0xff]       ^
741                 Te4[(s1 >>  8) & 0xff] <<  8 ^
742                 Te4[(s2 >> 16) & 0xff] << 16 ^
743                 Te4[(s3 >> 24)       ] << 24;
744         t[1] =  Te4[(s1      ) & 0xff]       ^
745                 Te4[(s2 >>  8) & 0xff] <<  8 ^
746                 Te4[(s3 >> 16) & 0xff] << 16 ^
747                 Te4[(s0 >> 24)       ] << 24;
748         t[2] =  Te4[(s2      ) & 0xff]       ^
749                 Te4[(s3 >>  8) & 0xff] <<  8 ^
750                 Te4[(s0 >> 16) & 0xff] << 16 ^
751                 Te4[(s1 >> 24)       ] << 24;
752         t[3] =  Te4[(s3      ) & 0xff]       ^
753                 Te4[(s0 >>  8) & 0xff] <<  8 ^
754                 Te4[(s1 >> 16) & 0xff] << 16 ^
755                 Te4[(s2 >> 24)       ] << 24;
756
757         /* now do the linear transform using words */
758         {       int i;
759                 u32 r0, r1, r2;
760
761                 for (i = 0; i < 4; i++) {
762                         r0 = t[i];
763                         r1 = r0 & 0x80808080;
764                         r2 = ((r0 & 0x7f7f7f7f) << 1) ^
765                                 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
766 #if defined(ROTATE)
767                         t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
768                                 ROTATE(r0,16) ^ ROTATE(r0,8);
769 #else
770                         t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
771                                 (r0 << 16) ^ (r0 >> 16) ^
772                                 (r0 << 8) ^ (r0 >> 24);
773 #endif
774                         t[i] ^= rk[i];
775                 }
776         }
777 #else
778         t[0] =  Te0[(s0      ) & 0xff] ^
779                 Te1[(s1 >>  8) & 0xff] ^
780                 Te2[(s2 >> 16) & 0xff] ^
781                 Te3[(s3 >> 24)       ] ^
782                 rk[0];
783         t[1] =  Te0[(s1      ) & 0xff] ^
784                 Te1[(s2 >>  8) & 0xff] ^
785                 Te2[(s3 >> 16) & 0xff] ^
786                 Te3[(s0 >> 24)       ] ^
787                 rk[1];
788         t[2] =  Te0[(s2      ) & 0xff] ^
789                 Te1[(s3 >>  8) & 0xff] ^
790                 Te2[(s0 >> 16) & 0xff] ^
791                 Te3[(s1 >> 24)       ] ^
792                 rk[2];
793         t[3] =  Te0[(s3      ) & 0xff] ^
794                 Te1[(s0 >>  8) & 0xff] ^
795                 Te2[(s1 >> 16) & 0xff] ^
796                 Te3[(s2 >> 24)       ] ^
797                 rk[3];
798 #endif
799         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
800     }
801     /*
802          * apply last round and
803          * map cipher state to byte array block:
804          */
805 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
806         prefetch256(Te4);
807
808         *(u32*)(out+0) =
809                 Te4[(s0      ) & 0xff]       ^
810                 Te4[(s1 >>  8) & 0xff] <<  8 ^
811                 Te4[(s2 >> 16) & 0xff] << 16 ^
812                 Te4[(s3 >> 24)       ] << 24 ^
813                 rk[0];
814         *(u32*)(out+4) =
815                 Te4[(s1      ) & 0xff]       ^
816                 Te4[(s2 >>  8) & 0xff] <<  8 ^
817                 Te4[(s3 >> 16) & 0xff] << 16 ^
818                 Te4[(s0 >> 24)       ] << 24 ^
819                 rk[1];
820         *(u32*)(out+8) =
821                 Te4[(s2      ) & 0xff]       ^
822                 Te4[(s3 >>  8) & 0xff] <<  8 ^
823                 Te4[(s0 >> 16) & 0xff] << 16 ^
824                 Te4[(s1 >> 24)       ] << 24 ^
825                 rk[2];
826         *(u32*)(out+12) =
827                 Te4[(s3      ) & 0xff]       ^
828                 Te4[(s0 >>  8) & 0xff] <<  8 ^
829                 Te4[(s1 >> 16) & 0xff] << 16 ^
830                 Te4[(s2 >> 24)       ] << 24 ^
831                 rk[3];
832 #else
833         *(u32*)(out+0) =
834                 (Te2[(s0      ) & 0xff] & 0x000000ffU) ^
835                 (Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
836                 (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
837                 (Te1[(s3 >> 24)       ] & 0xff000000U) ^
838                 rk[0];
839         *(u32*)(out+4) =
840                 (Te2[(s1      ) & 0xff] & 0x000000ffU) ^
841                 (Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
842                 (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
843                 (Te1[(s0 >> 24)       ] & 0xff000000U) ^
844                 rk[1];
845         *(u32*)(out+8) =
846                 (Te2[(s2      ) & 0xff] & 0x000000ffU) ^
847                 (Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
848                 (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
849                 (Te1[(s1 >> 24)       ] & 0xff000000U) ^
850                 rk[2];
851         *(u32*)(out+12) =
852                 (Te2[(s3      ) & 0xff] & 0x000000ffU) ^
853                 (Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
854                 (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
855                 (Te1[(s2 >> 24)       ] & 0xff000000U) ^
856                 rk[3];
857 #endif
858 }
859
860 /*
861  * Decrypt a single block
862  * in and out can overlap
863  */
864 void AES_decrypt(const unsigned char *in, unsigned char *out,
865                  const AES_KEY *key) {
866
867         const u32 *rk;
868         u32 s0, s1, s2, s3, t[4];
869         int r;
870
871         assert(in && out && key);
872         rk = key->rd_key;
873
874         /*
875          * map byte array block to cipher state
876          * and add initial round key:
877          */
878         s0 = GETU32(in     ) ^ rk[0];
879         s1 = GETU32(in +  4) ^ rk[1];
880         s2 = GETU32(in +  8) ^ rk[2];
881         s3 = GETU32(in + 12) ^ rk[3];
882
883 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
884         prefetch256(Td4);
885
886         t[0] =  Td4[(s0      ) & 0xff]       ^
887                 Td4[(s3 >>  8) & 0xff] <<  8 ^
888                 Td4[(s2 >> 16) & 0xff] << 16 ^
889                 Td4[(s1 >> 24)       ] << 24;
890         t[1] =  Td4[(s1      ) & 0xff]       ^
891                 Td4[(s0 >>  8) & 0xff] <<  8 ^
892                 Td4[(s3 >> 16) & 0xff] << 16 ^
893                 Td4[(s2 >> 24)       ] << 24;
894         t[2] =  Td4[(s2      ) & 0xff]       ^
895                 Td4[(s1 >>  8) & 0xff] <<  8 ^
896                 Td4[(s0 >> 16) & 0xff] << 16 ^
897                 Td4[(s3 >> 24)       ] << 24;
898         t[3] =  Td4[(s3      ) & 0xff]       ^
899                 Td4[(s2 >>  8) & 0xff] <<  8 ^
900                 Td4[(s1 >> 16) & 0xff] << 16 ^
901                 Td4[(s0 >> 24)       ] << 24;
902
903         /* now do the linear transform using words */ 
904         {       int i;
905                 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
906
907                 for (i = 0; i < 4; i++) {
908                         tp1 = t[i];
909                         m = tp1 & 0x80808080;
910                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
911                                 ((m - (m >> 7)) & 0x1b1b1b1b);
912                         m = tp2 & 0x80808080;
913                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
914                                 ((m - (m >> 7)) & 0x1b1b1b1b);
915                         m = tp4 & 0x80808080;
916                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
917                                 ((m - (m >> 7)) & 0x1b1b1b1b);
918                         tp9 = tp8 ^ tp1;
919                         tpb = tp9 ^ tp2;
920                         tpd = tp9 ^ tp4;
921                         tpe = tp8 ^ tp4 ^ tp2;
922 #if defined(ROTATE)
923                         t[i] = tpe ^ ROTATE(tpd,16) ^
924                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
925 #else
926                         t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
927                                 (tp9 >> 24) ^ (tp9 << 8) ^
928                                 (tpb >> 8) ^ (tpb << 24);
929 #endif
930                         t[i] ^= rk[4+i];
931                 }
932         }
933 #else
934         t[0] =  Td0[(s0      ) & 0xff] ^
935                 Td1[(s3 >>  8) & 0xff] ^
936                 Td2[(s2 >> 16) & 0xff] ^
937                 Td3[(s1 >> 24)       ] ^
938                 rk[4];
939         t[1] =  Td0[(s1      ) & 0xff] ^
940                 Td1[(s0 >>  8) & 0xff] ^
941                 Td2[(s3 >> 16) & 0xff] ^
942                 Td3[(s2 >> 24)       ] ^
943                 rk[5];
944         t[2] =  Td0[(s2      ) & 0xff] ^
945                 Td1[(s1 >>  8) & 0xff] ^
946                 Td2[(s0 >> 16) & 0xff] ^
947                 Td3[(s3 >> 24)       ] ^
948                 rk[6];
949         t[3] =  Td0[(s3      ) & 0xff] ^
950                 Td1[(s2 >>  8) & 0xff] ^
951                 Td2[(s1 >> 16) & 0xff] ^
952                 Td3[(s0 >> 24)       ] ^
953                 rk[7];
954 #endif
955         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
956
957     /*
958      * Nr - 2 full rounds:
959      */
960     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
961 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
962         t[0] =  Td4[(s0      ) & 0xff]       ^
963                 Td4[(s3 >>  8) & 0xff] <<  8 ^
964                 Td4[(s2 >> 16) & 0xff] << 16 ^
965                 Td4[(s1 >> 24)       ] << 24;
966         t[1] =  Td4[(s1      ) & 0xff]       ^
967                 Td4[(s0 >>  8) & 0xff] <<  8 ^
968                 Td4[(s3 >> 16) & 0xff] << 16 ^
969                 Td4[(s2 >> 24)       ] << 24;
970         t[2] =  Td4[(s2      ) & 0xff]       ^
971                 Td4[(s1 >>  8) & 0xff] <<  8 ^
972                 Td4[(s0 >> 16) & 0xff] << 16 ^
973                 Td4[(s3 >> 24)       ] << 24;
974         t[3] =  Td4[(s3      ) & 0xff]       ^
975                 Td4[(s2 >>  8) & 0xff] <<  8 ^
976                 Td4[(s1 >> 16) & 0xff] << 16 ^
977                 Td4[(s0 >> 24)       ] << 24;
978
979         /* now do the linear transform using words */ 
980         {       int i;
981                 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
982
983                 for (i = 0; i < 4; i++) {
984                         tp1 = t[i];
985                         m = tp1 & 0x80808080;
986                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
987                                 ((m - (m >> 7)) & 0x1b1b1b1b);
988                         m = tp2 & 0x80808080;
989                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
990                                 ((m - (m >> 7)) & 0x1b1b1b1b);
991                         m = tp4 & 0x80808080;
992                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
993                                 ((m - (m >> 7)) & 0x1b1b1b1b);
994                         tp9 = tp8 ^ tp1;
995                         tpb = tp9 ^ tp2;
996                         tpd = tp9 ^ tp4;
997                         tpe = tp8 ^ tp4 ^ tp2;
998 #if defined(ROTATE)
999                         t[i] = tpe ^ ROTATE(tpd,16) ^
1000                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
1001 #else
1002                         t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
1003                                 (tp9 >> 24) ^ (tp9 << 8) ^
1004                                 (tpb >> 8) ^ (tpb << 24);
1005 #endif
1006                         t[i] ^= rk[i];
1007                 }
1008         }
1009 #else
1010         t[0] =  Td0[(s0      ) & 0xff] ^
1011                 Td1[(s3 >>  8) & 0xff] ^
1012                 Td2[(s2 >> 16) & 0xff] ^
1013                 Td3[(s1 >> 24)       ] ^
1014                 rk[0];
1015         t[1] =  Td0[(s1      ) & 0xff] ^
1016                 Td1[(s0 >>  8) & 0xff] ^
1017                 Td2[(s3 >> 16) & 0xff] ^
1018                 Td3[(s2 >> 24)       ] ^
1019                 rk[1];
1020         t[2] =  Td0[(s2      ) & 0xff] ^
1021                 Td1[(s1 >>  8) & 0xff] ^
1022                 Td2[(s0 >> 16) & 0xff] ^
1023                 Td3[(s3 >> 24)       ] ^
1024                 rk[2];
1025         t[3] =  Td0[(s3      ) & 0xff] ^
1026                 Td1[(s2 >>  8) & 0xff] ^
1027                 Td2[(s1 >> 16) & 0xff] ^
1028                 Td3[(s0 >> 24)       ] ^
1029                 rk[3];
1030 #endif
1031         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1032     }
1033     /*
1034          * apply last round and
1035          * map cipher state to byte array block:
1036          */
1037         prefetch256(Td4);
1038
1039         *(u32*)(out+0) =
1040                 (Td4[(s0      ) & 0xff])        ^
1041                 (Td4[(s3 >>  8) & 0xff] <<  8) ^
1042                 (Td4[(s2 >> 16) & 0xff] << 16) ^
1043                 (Td4[(s1 >> 24)       ] << 24) ^
1044                 rk[0];
1045         *(u32*)(out+4) =
1046                 (Td4[(s1      ) & 0xff])         ^
1047                 (Td4[(s0 >>  8) & 0xff] <<  8) ^
1048                 (Td4[(s3 >> 16) & 0xff] << 16) ^
1049                 (Td4[(s2 >> 24)       ] << 24) ^
1050                 rk[1];
1051         *(u32*)(out+8) =
1052                 (Td4[(s2      ) & 0xff])         ^
1053                 (Td4[(s1 >>  8) & 0xff] <<  8) ^
1054                 (Td4[(s0 >> 16) & 0xff] << 16) ^
1055                 (Td4[(s3 >> 24)       ] << 24) ^
1056                 rk[2];
1057         *(u32*)(out+12) =
1058                 (Td4[(s3      ) & 0xff])         ^
1059                 (Td4[(s2 >>  8) & 0xff] <<  8) ^
1060                 (Td4[(s1 >> 16) & 0xff] << 16) ^
1061                 (Td4[(s0 >> 24)       ] << 24) ^
1062                 rk[3];
1063 }