vpaes-x86[_64]*.pl: fix typo.
[openssl.git] / crypto / aes / aes_x86core.c
1 /* crypto/aes/aes_core.c -*- mode:C; c-file-style: "eay" -*- */
2 /**
3  * rijndael-alg-fst.c
4  *
5  * @version 3.0 (December 2000)
6  *
7  * Optimised ANSI C code for the Rijndael cipher (now AES)
8  *
9  * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10  * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11  * @author Paulo Barreto <paulo.barreto@terra.com.br>
12  *
13  * This code is hereby placed in the public domain.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /*
29  * This is an experimental x86[_64] derivative. It assumes little-endian
30  * byte order and expects the CPU to sustain unaligned memory references.
31  * It is used as a playground for cache-timing attack mitigations and
32  * serves as the reference C implementation for the x86[_64] assembler.
33  *
34  *                                      <appro@fy.chalmers.se>
35  */
36
37
38 #ifndef AES_DEBUG
39 # ifndef NDEBUG
40 #  define NDEBUG
41 # endif
42 #endif
43 #include <assert.h>
44
45 #include <stdlib.h>
46 #include <crypto/aes.h>
47 #include <openssl/aes.h>
48 #include "aes_locl.h"
49
50 /*
51  * These two parameters control which table, 256-byte or 2KB, is
52  * referenced in outer and respectively inner rounds.
53  */
/* Compile-time switch: when defined, the outer rounds use the 256-byte table. */
54 #define AES_COMPACT_IN_OUTER_ROUNDS
55 #ifdef  AES_COMPACT_IN_OUTER_ROUNDS
56 /* AES_COMPACT_IN_OUTER_ROUNDS costs ~30% in performance, while
57  * adding AES_COMPACT_IN_INNER_ROUNDS reduces benchmark *further*
58  * by factor of ~2. */
/* Deliberately left undefined: the inner rounds keep using the 2KB table. */
59 # undef  AES_COMPACT_IN_INNER_ROUNDS
60 #endif
61
62 #if 1
/*
 * Touch a 256-byte table once per 32 bytes so that its cache lines are
 * loaded before the table is indexed with data-dependent offsets.
 *
 * table: start of the 256-byte table; it is only read, never written.
 *
 * Both the loads and the final store go through volatile-qualified
 * objects so the compiler has to perform them even though the XOR-ed
 * value is never used afterwards.
 */
static void prefetch256(const void *table)
{
        const volatile unsigned long *words = (const volatile unsigned long *)table;
        volatile unsigned long sink;
        /* 32 is common least cache-line size */
        const size_t stride = 32 / sizeof(words[0]);
        const size_t limit = 256 / sizeof(words[0]);
        unsigned long acc = 0;
        size_t pos;

        for (pos = 0; pos < limit; pos += stride)
                acc ^= words[pos];

        sink = acc;
}
74 #else
75 # define prefetch256(t)
76 #endif
77
/*
 * GETU32(p): read a u32 straight through a cast pointer, i.e. in the
 * host's native byte order and with no alignment fix-up.  This relies on
 * the little-endian, unaligned-access-tolerant target described in the
 * file header.  The #undef drops any earlier definition of the name
 * (presumably one from aes_locl.h -- TODO confirm).
 */
78 #undef GETU32
79 #define GETU32(p) (*((u32*)(p)))
80
/*
 * u64 is the unsigned type used for the 8-byte Te/Td table entries, and
 * U64(C) pastes the matching literal suffix onto the constant C:
 *   - Windows compilers other than MinGW: unsigned __int64 / UI64
 *   - __arch64__ defined:                 unsigned long    / UL
 *     (presumably targets where long is 64 bits wide -- TODO confirm)
 *   - everything else:                    unsigned long long / ULL
 */
81 #if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
82 typedef unsigned __int64 u64;
83 #define U64(C)  C##UI64
84 #elif defined(__arch64__)
85 typedef unsigned long u64;
86 #define U64(C)  C##UL
87 #else
88 typedef unsigned long long u64;
89 #define U64(C)  C##ULL
90 #endif
91
/*
 * ROTATE(a,n): rotate the 32-bit value a left by n bits.
 *   - MSVC / ICC: the _lrotl intrinsic.
 *   - GCC on x86 / x86_64: a single "roll" instruction via inline asm.
 *     n goes through an "I" (immediate) constraint, so it has to be a
 *     compile-time constant.  The statement expression declares its own
 *     local named ret, so do not pass an argument that is itself called
 *     ret.
 * On any other compiler or target ROTATE stays undefined; the call sites
 * test defined(ROTATE) and fall back to plain shifts.
 */
92 #undef ROTATE
93 #if defined(_MSC_VER) || defined(__ICC)
94 # define ROTATE(a,n)    _lrotl(a,n)
95 #elif defined(__GNUC__) && __GNUC__>=2
96 # if defined(__i386) || defined(__i386__) || defined(__x86_64) || defined(__x86_64__)
97 #   define ROTATE(a,n)  ({ register unsigned int ret;   \
98                                 asm (                   \
99                                 "roll %1,%0"            \
100                                 : "=r"(ret)             \
101                                 : "I"(n), "0"(a)        \
102                                 : "cc");                \
103                            ret;                         \
104                         })
105 # endif
106 #endif
107 /*
108 Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
109 Te0[x] = S [x].[02, 01, 01, 03];
110 Te1[x] = S [x].[03, 02, 01, 01];
111 Te2[x] = S [x].[01, 03, 02, 01];
112 Te3[x] = S [x].[01, 01, 03, 02];
113 */
/*
 * Each TeN is meant to be used as TeN[x].  The subscript binds tighter
 * than the (u32) cast, so TeN[x] indexes Te as a u64 array whose base is
 * shifted by 0, 3, 2 or 1 bytes and then truncates the loaded value to
 * u32.  Because every entry stores its 4-byte pattern twice, those byte
 * offsets yield the four rotations listed above (little-endian layout).
 *
 * NOTE(review): as written the load is 64 bits wide, so with a non-zero
 * offset and x == 255 it formally extends a few bytes past the end of
 * Te; only the low 32 bits are kept.  Confirm this is acceptable for the
 * compilers in use.
 */
114 #define Te0 (u32)((u64*)((u8*)Te+0))
115 #define Te1 (u32)((u64*)((u8*)Te+3))
116 #define Te2 (u32)((u64*)((u8*)Te+2))
117 #define Te3 (u32)((u64*)((u8*)Te+1))
118 /*
119 Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
120 Td0[x] = Si[x].[0e, 09, 0d, 0b];
121 Td1[x] = Si[x].[0b, 0e, 09, 0d];
122 Td2[x] = Si[x].[0d, 0b, 0e, 09];
123 Td3[x] = Si[x].[09, 0d, 0b, 0e];
124 Td4[x] = Si[x].[01];
125 */
/*
 * Each TdN is meant to be used as TdN[x].  The subscript binds tighter
 * than the (u32) cast, so TdN[x] indexes Td as a u64 array whose base is
 * shifted by 0, 3, 2 or 1 bytes and then truncates the loaded value to
 * u32; the doubled 4-byte pattern in every entry turns those offsets
 * into the rotations listed above (little-endian layout).
 *
 * NOTE(review): the load is 64 bits wide, so with a non-zero offset and
 * x == 255 it formally extends a few bytes past the end of Td; only the
 * low 32 bits are kept.  Confirm this is acceptable for the compilers in
 * use.
 */
126 #define Td0 (u32)((u64*)((u8*)Td+0))
127 #define Td1 (u32)((u64*)((u8*)Td+3))
128 #define Td2 (u32)((u64*)((u8*)Td+2))
129 #define Td3 (u32)((u64*)((u8*)Td+1))
130
/*
 * Te: 2KB encryption table.  Entry x holds the four bytes
 * S[x].[02, 01, 01, 03] twice in a row (in little-endian memory order),
 * which is what lets the Te0..Te3 macros obtain all four byte rotations
 * from a single table by reading at byte offsets 0..3.
 */
131 static const u64 Te[256] = {
132     U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
133     U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
134     U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
135     U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
136     U64(0x5030306050303060), U64(0x0301010203010102),
137     U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
138     U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
139     U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
140     U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
141     U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
142     U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
143     U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
144     U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
145     U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
146     U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
147     U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
148     U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
149     U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
150     U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
151     U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
152     U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
153     U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
154     U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
155     U64(0x5331316253313162), U64(0x3f15152a3f15152a),
156     U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
157     U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
158     U64(0x2818183028181830), U64(0xa1969637a1969637),
159     U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
160     U64(0x0907070e0907070e), U64(0x3612122436121224),
161     U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
162     U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
163     U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
164     U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
165     U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
166     U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
167     U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
168     U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
169     U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
170     U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
171     U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
172     U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
173     U64(0x0000000000000000), U64(0x2cededc12cededc1),
174     U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
175     U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
176     U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
177     U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
178     U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
179     U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
180     U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
181     U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
182     U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
183     U64(0x5533336655333366), U64(0x9485851194858511),
184     U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
185     U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
186     U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
187     U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
188     U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
189     U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
190     U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
191     U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
192     U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
193     U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
194     U64(0x3010102030101020), U64(0x1affffe51affffe5),
195     U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
196     U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
197     U64(0x3513132635131326), U64(0x2fececc32fececc3),
198     U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
199     U64(0xcc444488cc444488), U64(0x3917172e3917172e),
200     U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
201     U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
202     U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
203     U64(0x2b1919322b191932), U64(0x957373e6957373e6),
204     U64(0xa06060c0a06060c0), U64(0x9881811998818119),
205     U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
206     U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
207     U64(0xab90903bab90903b), U64(0x8388880b8388880b),
208     U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
209     U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
210     U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
211     U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
212     U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
213     U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
214     U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
215     U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
216     U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
217     U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
218     U64(0xa8919139a8919139), U64(0xa4959531a4959531),
219     U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
220     U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
221     U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
222     U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
223     U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
224     U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
225     U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
226     U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
227     U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
228     U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
229     U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
230     U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
231     U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
232     U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
233     U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
234     U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
235     U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
236     U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
237     U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
238     U64(0xd8484890d8484890), U64(0x0503030605030306),
239     U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
240     U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
241     U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
242     U64(0x9186861791868617), U64(0x58c1c19958c1c199),
243     U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
244     U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
245     U64(0xb398982bb398982b), U64(0x3311112233111122),
246     U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
247     U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
248     U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
249     U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
250     U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
251     U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
252     U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
253     U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
254     U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
255     U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
256     U64(0xc3414182c3414182), U64(0xb0999929b0999929),
257     U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
258     U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
259     U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
260 };
261
/*
 * Te4: 256-byte substitution table, one byte per input value (the
 * 0x63, 0x7c, 0x77, ... sequence is the AES S-box).  In this file it is
 * indexed by the key schedule in AES_set_encrypt_key and, when
 * AES_COMPACT_IN_OUTER_ROUNDS is defined, by AES_encrypt.
 */
262 static const u8 Te4[256] = {
263     0x63U, 0x7cU, 0x77U, 0x7bU, 0xf2U, 0x6bU, 0x6fU, 0xc5U,
264     0x30U, 0x01U, 0x67U, 0x2bU, 0xfeU, 0xd7U, 0xabU, 0x76U,
265     0xcaU, 0x82U, 0xc9U, 0x7dU, 0xfaU, 0x59U, 0x47U, 0xf0U,
266     0xadU, 0xd4U, 0xa2U, 0xafU, 0x9cU, 0xa4U, 0x72U, 0xc0U,
267     0xb7U, 0xfdU, 0x93U, 0x26U, 0x36U, 0x3fU, 0xf7U, 0xccU,
268     0x34U, 0xa5U, 0xe5U, 0xf1U, 0x71U, 0xd8U, 0x31U, 0x15U,
269     0x04U, 0xc7U, 0x23U, 0xc3U, 0x18U, 0x96U, 0x05U, 0x9aU,
270     0x07U, 0x12U, 0x80U, 0xe2U, 0xebU, 0x27U, 0xb2U, 0x75U,
271     0x09U, 0x83U, 0x2cU, 0x1aU, 0x1bU, 0x6eU, 0x5aU, 0xa0U,
272     0x52U, 0x3bU, 0xd6U, 0xb3U, 0x29U, 0xe3U, 0x2fU, 0x84U,
273     0x53U, 0xd1U, 0x00U, 0xedU, 0x20U, 0xfcU, 0xb1U, 0x5bU,
274     0x6aU, 0xcbU, 0xbeU, 0x39U, 0x4aU, 0x4cU, 0x58U, 0xcfU,
275     0xd0U, 0xefU, 0xaaU, 0xfbU, 0x43U, 0x4dU, 0x33U, 0x85U,
276     0x45U, 0xf9U, 0x02U, 0x7fU, 0x50U, 0x3cU, 0x9fU, 0xa8U,
277     0x51U, 0xa3U, 0x40U, 0x8fU, 0x92U, 0x9dU, 0x38U, 0xf5U,
278     0xbcU, 0xb6U, 0xdaU, 0x21U, 0x10U, 0xffU, 0xf3U, 0xd2U,
279     0xcdU, 0x0cU, 0x13U, 0xecU, 0x5fU, 0x97U, 0x44U, 0x17U,
280     0xc4U, 0xa7U, 0x7eU, 0x3dU, 0x64U, 0x5dU, 0x19U, 0x73U,
281     0x60U, 0x81U, 0x4fU, 0xdcU, 0x22U, 0x2aU, 0x90U, 0x88U,
282     0x46U, 0xeeU, 0xb8U, 0x14U, 0xdeU, 0x5eU, 0x0bU, 0xdbU,
283     0xe0U, 0x32U, 0x3aU, 0x0aU, 0x49U, 0x06U, 0x24U, 0x5cU,
284     0xc2U, 0xd3U, 0xacU, 0x62U, 0x91U, 0x95U, 0xe4U, 0x79U,
285     0xe7U, 0xc8U, 0x37U, 0x6dU, 0x8dU, 0xd5U, 0x4eU, 0xa9U,
286     0x6cU, 0x56U, 0xf4U, 0xeaU, 0x65U, 0x7aU, 0xaeU, 0x08U,
287     0xbaU, 0x78U, 0x25U, 0x2eU, 0x1cU, 0xa6U, 0xb4U, 0xc6U,
288     0xe8U, 0xddU, 0x74U, 0x1fU, 0x4bU, 0xbdU, 0x8bU, 0x8aU,
289     0x70U, 0x3eU, 0xb5U, 0x66U, 0x48U, 0x03U, 0xf6U, 0x0eU,
290     0x61U, 0x35U, 0x57U, 0xb9U, 0x86U, 0xc1U, 0x1dU, 0x9eU,
291     0xe1U, 0xf8U, 0x98U, 0x11U, 0x69U, 0xd9U, 0x8eU, 0x94U,
292     0x9bU, 0x1eU, 0x87U, 0xe9U, 0xceU, 0x55U, 0x28U, 0xdfU,
293     0x8cU, 0xa1U, 0x89U, 0x0dU, 0xbfU, 0xe6U, 0x42U, 0x68U,
294     0x41U, 0x99U, 0x2dU, 0x0fU, 0xb0U, 0x54U, 0xbbU, 0x16U
295 };
296
/*
 * Td: 2KB decryption table.  Entry x holds the four bytes
 * Si[x].[0e, 09, 0d, 0b] twice in a row (in little-endian memory order),
 * which is what lets the Td0..Td3 macros obtain all four byte rotations
 * from a single table by reading at byte offsets 0..3.
 */
297 static const u64 Td[256] = {
298     U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
299     U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
300     U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
301     U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
302     U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
303     U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
304     U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
305     U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
306     U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
307     U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
308     U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
309     U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
310     U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
311     U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
312     U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
313     U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
314     U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
315     U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
316     U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
317     U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
318     U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
319     U64(0x6033519760335197), U64(0x457f5362457f5362),
320     U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
321     U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
322     U64(0x5868487058684870), U64(0x19fd458f19fd458f),
323     U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
324     U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
325     U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
326     U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
327     U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
328     U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
329     U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
330     U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
331     U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
332     U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
333     U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
334     U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
335     U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
336     U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
337     U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
338     U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
339     U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
340     U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
341     U64(0x6fd406046fd40604), U64(0xff155060ff155060),
342     U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
343     U64(0xcc434089cc434089), U64(0x779ed967779ed967),
344     U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
345     U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
346     U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
347     U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
348     U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
349     U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
350     U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
351     U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
352     U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
353     U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
354     U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
355     U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
356     U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
357     U64(0x694b775a694b775a), U64(0x161a121c161a121c),
358     U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
359     U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
360     U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
361     U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
362     U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
363     U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
364     U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
365     U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
366     U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
367     U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
368     U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
369     U64(0x4022971340229713), U64(0x2011c6842011c684),
370     U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
371     U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
372     U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
373     U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
374     U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
375     U64(0xfa489411fa489411), U64(0x2264e9472264e947),
376     U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
377     U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
378     U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
379     U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
380     U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
381     U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
382     U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
383     U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
384     U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
385     U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
386     U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
387     U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
388     U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
389     U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
390     U64(0x097826cd097826cd), U64(0xf418596ef418596e),
391     U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
392     U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
393     U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
394     U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
395     U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
396     U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
397     U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
398     U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
399     U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
400     U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
401     U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
402     U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
403     U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
404     U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
405     U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
406     U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
407     U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
408     U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
409     U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
410     U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
411     U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
412     U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
413     U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
414     U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
415     U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
416     U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
417     U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
418     U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
419     U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
420     U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
421     U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
422     U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
423     U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
424     U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
425     U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
426 };
/*
 * Td4: 256-byte inverse substitution table, Td4[x] = Si[x], one byte per
 * input value.  No use of it appears in this part of the file;
 * presumably the compact decryption rounds index it -- TODO confirm.
 */
427 static const u8 Td4[256] = {
428     0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
429     0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
430     0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
431     0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
432     0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
433     0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
434     0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
435     0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
436     0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
437     0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
438     0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
439     0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
440     0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
441     0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
442     0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
443     0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
444     0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
445     0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
446     0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
447     0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
448     0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
449     0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
450     0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
451     0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
452     0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
453     0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
454     0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
455     0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
456     0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
457     0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
458     0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
459     0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
460 };
461
/*
 * Round constants for the key schedule: AES_set_encrypt_key XORs rcon[i]
 * into the first word produced by iteration i.  Each constant sits in the
 * low byte of the word, matching the little-endian word layout used
 * throughout this file.
 */
462 static const u32 rcon[] = {
463     0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
464     0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
465     0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
466 };
467
468 /**
469  * Expand the cipher key into the encryption key schedule.
470  */
471 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
472                         AES_KEY *key) {
473
474         u32 *rk;
475         int i = 0;
476         u32 temp;
477
478         if (!userKey || !key)
479                 return -1;
480         if (bits != 128 && bits != 192 && bits != 256)
481                 return -2;
482
483         rk = key->rd_key;
484
485         if (bits==128)
486                 key->rounds = 10;
487         else if (bits==192)
488                 key->rounds = 12;
489         else
490                 key->rounds = 14;
491
492         rk[0] = GETU32(userKey     );
493         rk[1] = GETU32(userKey +  4);
494         rk[2] = GETU32(userKey +  8);
495         rk[3] = GETU32(userKey + 12);
496         if (bits == 128) {
497                 while (1) {
498                         temp  = rk[3];
499                         rk[4] = rk[0] ^
500                                 (Te4[(temp >>  8) & 0xff]      ) ^
501                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
502                                 (Te4[(temp >> 24)       ] << 16) ^
503                                 (Te4[(temp      ) & 0xff] << 24) ^
504                                 rcon[i];
505                         rk[5] = rk[1] ^ rk[4];
506                         rk[6] = rk[2] ^ rk[5];
507                         rk[7] = rk[3] ^ rk[6];
508                         if (++i == 10) {
509                                 return 0;
510                         }
511                         rk += 4;
512                 }
513         }
514         rk[4] = GETU32(userKey + 16);
515         rk[5] = GETU32(userKey + 20);
516         if (bits == 192) {
517                 while (1) {
518                         temp = rk[ 5];
519                         rk[ 6] = rk[ 0] ^
520                                 (Te4[(temp >>  8) & 0xff]      ) ^
521                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
522                                 (Te4[(temp >> 24)       ] << 16) ^
523                                 (Te4[(temp      ) & 0xff] << 24) ^
524                                 rcon[i];
525                         rk[ 7] = rk[ 1] ^ rk[ 6];
526                         rk[ 8] = rk[ 2] ^ rk[ 7];
527                         rk[ 9] = rk[ 3] ^ rk[ 8];
528                         if (++i == 8) {
529                                 return 0;
530                         }
531                         rk[10] = rk[ 4] ^ rk[ 9];
532                         rk[11] = rk[ 5] ^ rk[10];
533                         rk += 6;
534                 }
535         }
536         rk[6] = GETU32(userKey + 24);
537         rk[7] = GETU32(userKey + 28);
538         if (bits == 256) {
539                 while (1) {
540                         temp = rk[ 7];
541                         rk[ 8] = rk[ 0] ^
542                                 (Te4[(temp >>  8) & 0xff]      ) ^
543                                 (Te4[(temp >> 16) & 0xff] <<  8) ^
544                                 (Te4[(temp >> 24)       ] << 16) ^
545                                 (Te4[(temp      ) & 0xff] << 24) ^
546                                 rcon[i];
547                         rk[ 9] = rk[ 1] ^ rk[ 8];
548                         rk[10] = rk[ 2] ^ rk[ 9];
549                         rk[11] = rk[ 3] ^ rk[10];
550                         if (++i == 7) {
551                                 return 0;
552                         }
553                         temp = rk[11];
554                         rk[12] = rk[ 4] ^
555                                 (Te4[(temp      ) & 0xff]      ) ^
556                                 (Te4[(temp >>  8) & 0xff] <<  8) ^
557                                 (Te4[(temp >> 16) & 0xff] << 16) ^
558                                 (Te4[(temp >> 24)       ] << 24);
559                         rk[13] = rk[ 5] ^ rk[12];
560                         rk[14] = rk[ 6] ^ rk[13];
561                         rk[15] = rk[ 7] ^ rk[14];
562
563                         rk += 8;
564                 }
565         }
566         return 0;
567 }
568
569 /**
570  * Expand the cipher key into the decryption key schedule.
571  */
572 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
573                          AES_KEY *key) {
574
575         u32 *rk;
576         int i, j, status;
577         u32 temp;
578
579         /* first, start with an encryption schedule */
580         status = AES_set_encrypt_key(userKey, bits, key);
581         if (status < 0)
582                 return status;
583
584         rk = key->rd_key;
585
586         /* invert the order of the round keys: */
587         for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
588                 temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
589                 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
590                 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
591                 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
592         }
593         /* apply the inverse MixColumn transform to all round keys but the first and the last: */
594         for (i = 1; i < (key->rounds); i++) {
595                 rk += 4;
596 #if 1
597                 for (j = 0; j < 4; j++) {
598                         u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
599
600                         tp1 = rk[j];
601                         m = tp1 & 0x80808080;
602                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
603                                 ((m - (m >> 7)) & 0x1b1b1b1b);
604                         m = tp2 & 0x80808080;
605                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
606                                 ((m - (m >> 7)) & 0x1b1b1b1b);
607                         m = tp4 & 0x80808080;
608                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
609                                 ((m - (m >> 7)) & 0x1b1b1b1b);
610                         tp9 = tp8 ^ tp1;
611                         tpb = tp9 ^ tp2;
612                         tpd = tp9 ^ tp4;
613                         tpe = tp8 ^ tp4 ^ tp2;
614 #if defined(ROTATE)
615                         rk[j] = tpe ^ ROTATE(tpd,16) ^
616                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
617 #else
618                         rk[j] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
619                                 (tp9 >> 24) ^ (tp9 << 8) ^
620                                 (tpb >> 8) ^ (tpb << 24);
621 #endif
622                 }
623 #else
624                 rk[0] =
625                         Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
626                         Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
627                         Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
628                         Td3[Te2[(rk[0] >> 24)       ] & 0xff];
629                 rk[1] =
630                         Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
631                         Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
632                         Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
633                         Td3[Te2[(rk[1] >> 24)       ] & 0xff];
634                 rk[2] =
635                         Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
636                         Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
637                         Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
638                         Td3[Te2[(rk[2] >> 24)       ] & 0xff];
639                 rk[3] =
640                         Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
641                         Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
642                         Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
643                         Td3[Te2[(rk[3] >> 24)       ] & 0xff];
644 #endif
645         }
646         return 0;
647 }
648
649 /*
650  * Encrypt a single block
651  * in and out can overlap
652  */
653 void AES_encrypt(const unsigned char *in, unsigned char *out,
654                  const AES_KEY *key) {
655
656         const u32 *rk;
657         u32 s0, s1, s2, s3, t[4];
658         int r;
659
660         assert(in && out && key);
661         rk = key->rd_key;
662
663         /*
664          * map byte array block to cipher state
665          * and add initial round key:
666          */
667         s0 = GETU32(in     ) ^ rk[0];
668         s1 = GETU32(in +  4) ^ rk[1];
669         s2 = GETU32(in +  8) ^ rk[2];
670         s3 = GETU32(in + 12) ^ rk[3];
671
672 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
673         prefetch256(Te4);
674
675         t[0] =  Te4[(s0      ) & 0xff]       ^
676                 Te4[(s1 >>  8) & 0xff] <<  8 ^
677                 Te4[(s2 >> 16) & 0xff] << 16 ^
678                 Te4[(s3 >> 24)       ] << 24;
679         t[1] =  Te4[(s1      ) & 0xff]       ^
680                 Te4[(s2 >>  8) & 0xff] <<  8 ^
681                 Te4[(s3 >> 16) & 0xff] << 16 ^
682                 Te4[(s0 >> 24)       ] << 24;
683         t[2] =  Te4[(s2      ) & 0xff]       ^
684                 Te4[(s3 >>  8) & 0xff] <<  8 ^
685                 Te4[(s0 >> 16) & 0xff] << 16 ^
686                 Te4[(s1 >> 24)       ] << 24;
687         t[3] =  Te4[(s3      ) & 0xff]       ^
688                 Te4[(s0 >>  8) & 0xff] <<  8 ^
689                 Te4[(s1 >> 16) & 0xff] << 16 ^
690                 Te4[(s2 >> 24)       ] << 24;
691
692         /* now do the linear transform using words */
693         {       int i;
694                 u32 r0, r1, r2;
695
696                 for (i = 0; i < 4; i++) {
697                         r0 = t[i];
698                         r1 = r0 & 0x80808080;
699                         r2 = ((r0 & 0x7f7f7f7f) << 1) ^
700                                 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
701 #if defined(ROTATE)
702                         t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
703                                 ROTATE(r0,16) ^ ROTATE(r0,8);
704 #else
705                         t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
706                                 (r0 << 16) ^ (r0 >> 16) ^
707                                 (r0 << 8) ^ (r0 >> 24);
708 #endif
709                         t[i] ^= rk[4+i];
710                 }
711         }
712 #else
713         t[0] =  Te0[(s0      ) & 0xff] ^
714                 Te1[(s1 >>  8) & 0xff] ^
715                 Te2[(s2 >> 16) & 0xff] ^
716                 Te3[(s3 >> 24)       ] ^
717                 rk[4];
718         t[1] =  Te0[(s1      ) & 0xff] ^
719                 Te1[(s2 >>  8) & 0xff] ^
720                 Te2[(s3 >> 16) & 0xff] ^
721                 Te3[(s0 >> 24)       ] ^
722                 rk[5];
723         t[2] =  Te0[(s2      ) & 0xff] ^
724                 Te1[(s3 >>  8) & 0xff] ^
725                 Te2[(s0 >> 16) & 0xff] ^
726                 Te3[(s1 >> 24)       ] ^
727                 rk[6];
728         t[3] =  Te0[(s3      ) & 0xff] ^
729                 Te1[(s0 >>  8) & 0xff] ^
730                 Te2[(s1 >> 16) & 0xff] ^
731                 Te3[(s2 >> 24)       ] ^
732                 rk[7];
733 #endif
734         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
735
736     /*
737      * Nr - 2 full rounds:
738      */
739     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
740 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
741         t[0] =  Te4[(s0      ) & 0xff]       ^
742                 Te4[(s1 >>  8) & 0xff] <<  8 ^
743                 Te4[(s2 >> 16) & 0xff] << 16 ^
744                 Te4[(s3 >> 24)       ] << 24;
745         t[1] =  Te4[(s1      ) & 0xff]       ^
746                 Te4[(s2 >>  8) & 0xff] <<  8 ^
747                 Te4[(s3 >> 16) & 0xff] << 16 ^
748                 Te4[(s0 >> 24)       ] << 24;
749         t[2] =  Te4[(s2      ) & 0xff]       ^
750                 Te4[(s3 >>  8) & 0xff] <<  8 ^
751                 Te4[(s0 >> 16) & 0xff] << 16 ^
752                 Te4[(s1 >> 24)       ] << 24;
753         t[3] =  Te4[(s3      ) & 0xff]       ^
754                 Te4[(s0 >>  8) & 0xff] <<  8 ^
755                 Te4[(s1 >> 16) & 0xff] << 16 ^
756                 Te4[(s2 >> 24)       ] << 24;
757
758         /* now do the linear transform using words */
759         {       int i;
760                 u32 r0, r1, r2;
761
762                 for (i = 0; i < 4; i++) {
763                         r0 = t[i];
764                         r1 = r0 & 0x80808080;
765                         r2 = ((r0 & 0x7f7f7f7f) << 1) ^
766                                 ((r1 - (r1 >> 7)) & 0x1b1b1b1b);
767 #if defined(ROTATE)
768                         t[i] = r2 ^ ROTATE(r2,24) ^ ROTATE(r0,24) ^
769                                 ROTATE(r0,16) ^ ROTATE(r0,8);
770 #else
771                         t[i] = r2 ^ ((r2 ^ r0) << 24) ^ ((r2 ^ r0) >> 8) ^
772                                 (r0 << 16) ^ (r0 >> 16) ^
773                                 (r0 << 8) ^ (r0 >> 24);
774 #endif
775                         t[i] ^= rk[i];
776                 }
777         }
778 #else
779         t[0] =  Te0[(s0      ) & 0xff] ^
780                 Te1[(s1 >>  8) & 0xff] ^
781                 Te2[(s2 >> 16) & 0xff] ^
782                 Te3[(s3 >> 24)       ] ^
783                 rk[0];
784         t[1] =  Te0[(s1      ) & 0xff] ^
785                 Te1[(s2 >>  8) & 0xff] ^
786                 Te2[(s3 >> 16) & 0xff] ^
787                 Te3[(s0 >> 24)       ] ^
788                 rk[1];
789         t[2] =  Te0[(s2      ) & 0xff] ^
790                 Te1[(s3 >>  8) & 0xff] ^
791                 Te2[(s0 >> 16) & 0xff] ^
792                 Te3[(s1 >> 24)       ] ^
793                 rk[2];
794         t[3] =  Te0[(s3      ) & 0xff] ^
795                 Te1[(s0 >>  8) & 0xff] ^
796                 Te2[(s1 >> 16) & 0xff] ^
797                 Te3[(s2 >> 24)       ] ^
798                 rk[3];
799 #endif
800         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
801     }
802     /*
803          * apply last round and
804          * map cipher state to byte array block:
805          */
806 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
807         prefetch256(Te4);
808
809         *(u32*)(out+0) =
810                 Te4[(s0      ) & 0xff]       ^
811                 Te4[(s1 >>  8) & 0xff] <<  8 ^
812                 Te4[(s2 >> 16) & 0xff] << 16 ^
813                 Te4[(s3 >> 24)       ] << 24 ^
814                 rk[0];
815         *(u32*)(out+4) =
816                 Te4[(s1      ) & 0xff]       ^
817                 Te4[(s2 >>  8) & 0xff] <<  8 ^
818                 Te4[(s3 >> 16) & 0xff] << 16 ^
819                 Te4[(s0 >> 24)       ] << 24 ^
820                 rk[1];
821         *(u32*)(out+8) =
822                 Te4[(s2      ) & 0xff]       ^
823                 Te4[(s3 >>  8) & 0xff] <<  8 ^
824                 Te4[(s0 >> 16) & 0xff] << 16 ^
825                 Te4[(s1 >> 24)       ] << 24 ^
826                 rk[2];
827         *(u32*)(out+12) =
828                 Te4[(s3      ) & 0xff]       ^
829                 Te4[(s0 >>  8) & 0xff] <<  8 ^
830                 Te4[(s1 >> 16) & 0xff] << 16 ^
831                 Te4[(s2 >> 24)       ] << 24 ^
832                 rk[3];
833 #else
834         *(u32*)(out+0) =
835                 (Te2[(s0      ) & 0xff] & 0x000000ffU) ^
836                 (Te3[(s1 >>  8) & 0xff] & 0x0000ff00U) ^
837                 (Te0[(s2 >> 16) & 0xff] & 0x00ff0000U) ^
838                 (Te1[(s3 >> 24)       ] & 0xff000000U) ^
839                 rk[0];
840         *(u32*)(out+4) =
841                 (Te2[(s1      ) & 0xff] & 0x000000ffU) ^
842                 (Te3[(s2 >>  8) & 0xff] & 0x0000ff00U) ^
843                 (Te0[(s3 >> 16) & 0xff] & 0x00ff0000U) ^
844                 (Te1[(s0 >> 24)       ] & 0xff000000U) ^
845                 rk[1];
846         *(u32*)(out+8) =
847                 (Te2[(s2      ) & 0xff] & 0x000000ffU) ^
848                 (Te3[(s3 >>  8) & 0xff] & 0x0000ff00U) ^
849                 (Te0[(s0 >> 16) & 0xff] & 0x00ff0000U) ^
850                 (Te1[(s1 >> 24)       ] & 0xff000000U) ^
851                 rk[2];
852         *(u32*)(out+12) =
853                 (Te2[(s3      ) & 0xff] & 0x000000ffU) ^
854                 (Te3[(s0 >>  8) & 0xff] & 0x0000ff00U) ^
855                 (Te0[(s1 >> 16) & 0xff] & 0x00ff0000U) ^
856                 (Te1[(s2 >> 24)       ] & 0xff000000U) ^
857                 rk[3];
858 #endif
859 }
860
861 /*
862  * Decrypt a single block
863  * in and out can overlap
864  */
865 void AES_decrypt(const unsigned char *in, unsigned char *out,
866                  const AES_KEY *key) {
867
868         const u32 *rk;
869         u32 s0, s1, s2, s3, t[4];
870         int r;
871
872         assert(in && out && key);
873         rk = key->rd_key;
874
875         /*
876          * map byte array block to cipher state
877          * and add initial round key:
878          */
879         s0 = GETU32(in     ) ^ rk[0];
880         s1 = GETU32(in +  4) ^ rk[1];
881         s2 = GETU32(in +  8) ^ rk[2];
882         s3 = GETU32(in + 12) ^ rk[3];
883
884 #if defined(AES_COMPACT_IN_OUTER_ROUNDS)
885         prefetch256(Td4);
886
887         t[0] =  Td4[(s0      ) & 0xff]       ^
888                 Td4[(s3 >>  8) & 0xff] <<  8 ^
889                 Td4[(s2 >> 16) & 0xff] << 16 ^
890                 Td4[(s1 >> 24)       ] << 24;
891         t[1] =  Td4[(s1      ) & 0xff]       ^
892                 Td4[(s0 >>  8) & 0xff] <<  8 ^
893                 Td4[(s3 >> 16) & 0xff] << 16 ^
894                 Td4[(s2 >> 24)       ] << 24;
895         t[2] =  Td4[(s2      ) & 0xff]       ^
896                 Td4[(s1 >>  8) & 0xff] <<  8 ^
897                 Td4[(s0 >> 16) & 0xff] << 16 ^
898                 Td4[(s3 >> 24)       ] << 24;
899         t[3] =  Td4[(s3      ) & 0xff]       ^
900                 Td4[(s2 >>  8) & 0xff] <<  8 ^
901                 Td4[(s1 >> 16) & 0xff] << 16 ^
902                 Td4[(s0 >> 24)       ] << 24;
903
904         /* now do the linear transform using words */ 
905         {       int i;
906                 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
907
908                 for (i = 0; i < 4; i++) {
909                         tp1 = t[i];
910                         m = tp1 & 0x80808080;
911                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
912                                 ((m - (m >> 7)) & 0x1b1b1b1b);
913                         m = tp2 & 0x80808080;
914                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
915                                 ((m - (m >> 7)) & 0x1b1b1b1b);
916                         m = tp4 & 0x80808080;
917                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
918                                 ((m - (m >> 7)) & 0x1b1b1b1b);
919                         tp9 = tp8 ^ tp1;
920                         tpb = tp9 ^ tp2;
921                         tpd = tp9 ^ tp4;
922                         tpe = tp8 ^ tp4 ^ tp2;
923 #if defined(ROTATE)
924                         t[i] = tpe ^ ROTATE(tpd,16) ^
925                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
926 #else
927                         t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
928                                 (tp9 >> 24) ^ (tp9 << 8) ^
929                                 (tpb >> 8) ^ (tpb << 24);
930 #endif
931                         t[i] ^= rk[4+i];
932                 }
933         }
934 #else
935         t[0] =  Td0[(s0      ) & 0xff] ^
936                 Td1[(s3 >>  8) & 0xff] ^
937                 Td2[(s2 >> 16) & 0xff] ^
938                 Td3[(s1 >> 24)       ] ^
939                 rk[4];
940         t[1] =  Td0[(s1      ) & 0xff] ^
941                 Td1[(s0 >>  8) & 0xff] ^
942                 Td2[(s3 >> 16) & 0xff] ^
943                 Td3[(s2 >> 24)       ] ^
944                 rk[5];
945         t[2] =  Td0[(s2      ) & 0xff] ^
946                 Td1[(s1 >>  8) & 0xff] ^
947                 Td2[(s0 >> 16) & 0xff] ^
948                 Td3[(s3 >> 24)       ] ^
949                 rk[6];
950         t[3] =  Td0[(s3      ) & 0xff] ^
951                 Td1[(s2 >>  8) & 0xff] ^
952                 Td2[(s1 >> 16) & 0xff] ^
953                 Td3[(s0 >> 24)       ] ^
954                 rk[7];
955 #endif
956         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
957
958     /*
959      * Nr - 2 full rounds:
960      */
961     for (rk+=8,r=key->rounds-2; r>0; rk+=4,r--) {
962 #if defined(AES_COMPACT_IN_INNER_ROUNDS)
963         t[0] =  Td4[(s0      ) & 0xff]       ^
964                 Td4[(s3 >>  8) & 0xff] <<  8 ^
965                 Td4[(s2 >> 16) & 0xff] << 16 ^
966                 Td4[(s1 >> 24)       ] << 24;
967         t[1] =  Td4[(s1      ) & 0xff]       ^
968                 Td4[(s0 >>  8) & 0xff] <<  8 ^
969                 Td4[(s3 >> 16) & 0xff] << 16 ^
970                 Td4[(s2 >> 24)       ] << 24;
971         t[2] =  Td4[(s2      ) & 0xff]       ^
972                 Td4[(s1 >>  8) & 0xff] <<  8 ^
973                 Td4[(s0 >> 16) & 0xff] << 16 ^
974                 Td4[(s3 >> 24)       ] << 24;
975         t[3] =  Td4[(s3      ) & 0xff]       ^
976                 Td4[(s2 >>  8) & 0xff] <<  8 ^
977                 Td4[(s1 >> 16) & 0xff] << 16 ^
978                 Td4[(s0 >> 24)       ] << 24;
979
980         /* now do the linear transform using words */ 
981         {       int i;
982                 u32 tp1, tp2, tp4, tp8, tp9, tpb, tpd, tpe, m;
983
984                 for (i = 0; i < 4; i++) {
985                         tp1 = t[i];
986                         m = tp1 & 0x80808080;
987                         tp2 = ((tp1 & 0x7f7f7f7f) << 1) ^
988                                 ((m - (m >> 7)) & 0x1b1b1b1b);
989                         m = tp2 & 0x80808080;
990                         tp4 = ((tp2 & 0x7f7f7f7f) << 1) ^
991                                 ((m - (m >> 7)) & 0x1b1b1b1b);
992                         m = tp4 & 0x80808080;
993                         tp8 = ((tp4 & 0x7f7f7f7f) << 1) ^
994                                 ((m - (m >> 7)) & 0x1b1b1b1b);
995                         tp9 = tp8 ^ tp1;
996                         tpb = tp9 ^ tp2;
997                         tpd = tp9 ^ tp4;
998                         tpe = tp8 ^ tp4 ^ tp2;
999 #if defined(ROTATE)
1000                         t[i] = tpe ^ ROTATE(tpd,16) ^
1001                                 ROTATE(tp9,8) ^ ROTATE(tpb,24);
1002 #else
1003                         t[i] = tpe ^ (tpd >> 16) ^ (tpd << 16) ^ 
1004                                 (tp9 >> 24) ^ (tp9 << 8) ^
1005                                 (tpb >> 8) ^ (tpb << 24);
1006 #endif
1007                         t[i] ^= rk[i];
1008                 }
1009         }
1010 #else
1011         t[0] =  Td0[(s0      ) & 0xff] ^
1012                 Td1[(s3 >>  8) & 0xff] ^
1013                 Td2[(s2 >> 16) & 0xff] ^
1014                 Td3[(s1 >> 24)       ] ^
1015                 rk[0];
1016         t[1] =  Td0[(s1      ) & 0xff] ^
1017                 Td1[(s0 >>  8) & 0xff] ^
1018                 Td2[(s3 >> 16) & 0xff] ^
1019                 Td3[(s2 >> 24)       ] ^
1020                 rk[1];
1021         t[2] =  Td0[(s2      ) & 0xff] ^
1022                 Td1[(s1 >>  8) & 0xff] ^
1023                 Td2[(s0 >> 16) & 0xff] ^
1024                 Td3[(s3 >> 24)       ] ^
1025                 rk[2];
1026         t[3] =  Td0[(s3      ) & 0xff] ^
1027                 Td1[(s2 >>  8) & 0xff] ^
1028                 Td2[(s1 >> 16) & 0xff] ^
1029                 Td3[(s0 >> 24)       ] ^
1030                 rk[3];
1031 #endif
1032         s0 = t[0]; s1 = t[1]; s2 = t[2]; s3 = t[3];
1033     }
1034     /*
1035          * apply last round and
1036          * map cipher state to byte array block:
1037          */
1038         prefetch256(Td4);
1039
1040         *(u32*)(out+0) =
1041                 (Td4[(s0      ) & 0xff])        ^
1042                 (Td4[(s3 >>  8) & 0xff] <<  8) ^
1043                 (Td4[(s2 >> 16) & 0xff] << 16) ^
1044                 (Td4[(s1 >> 24)       ] << 24) ^
1045                 rk[0];
1046         *(u32*)(out+4) =
1047                 (Td4[(s1      ) & 0xff])         ^
1048                 (Td4[(s0 >>  8) & 0xff] <<  8) ^
1049                 (Td4[(s3 >> 16) & 0xff] << 16) ^
1050                 (Td4[(s2 >> 24)       ] << 24) ^
1051                 rk[1];
1052         *(u32*)(out+8) =
1053                 (Td4[(s2      ) & 0xff])         ^
1054                 (Td4[(s1 >>  8) & 0xff] <<  8) ^
1055                 (Td4[(s0 >> 16) & 0xff] << 16) ^
1056                 (Td4[(s3 >> 24)       ] << 24) ^
1057                 rk[2];
1058         *(u32*)(out+12) =
1059                 (Td4[(s3      ) & 0xff])         ^
1060                 (Td4[(s2 >>  8) & 0xff] <<  8) ^
1061                 (Td4[(s1 >> 16) & 0xff] << 16) ^
1062                 (Td4[(s0 >> 24)       ] << 24) ^
1063                 rk[3];
1064 }