Prepare playground for AES experimental code.
[openssl.git] / crypto / aes / aes_x86core.c
1 /* crypto/aes/aes_x86core.c -*- mode:C; c-file-style: "eay" -*- */
2 /**
3  * rijndael-alg-fst.c
4  *
5  * @version 3.0 (December 2000)
6  *
7  * Optimised ANSI C code for the Rijndael cipher (now AES)
8  *
9  * @author Vincent Rijmen <vincent.rijmen@esat.kuleuven.ac.be>
10  * @author Antoon Bosselaers <antoon.bosselaers@esat.kuleuven.ac.be>
11  * @author Paulo Barreto <paulo.barreto@terra.com.br>
12  *
13  * This code is hereby placed in the public domain.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
19  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
20  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
21  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
22  * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
23  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
24  * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
25  * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26  */
27
28 /*
29  * This is an experimental x86[_64] derivative. It assumes little-endian
30  * byte order and expects the CPU to tolerate unaligned memory references.
31  * It is used as a playground for cache-timing attack mitigations and
32  * serves as the reference C implementation for the x86[_64] assembler.
33  *
34  *                                      <appro@fy.chalmers.se>
35  */
36
37
38 #ifndef AES_DEBUG
39 # ifndef NDEBUG
40 #  define NDEBUG
41 # endif
42 #endif
43 #include <assert.h>
44
45 #include <stdlib.h>
46 #include <openssl/aes.h>
47 #include "aes_locl.h"
48
/*
 * Native-byte-order 32-bit load/store through an arbitrary byte pointer.
 * Any earlier definitions of these names are discarded first.  The pointer
 * is dereferenced directly as a u32, so the target must tolerate unaligned
 * word accesses (see the file header).
 *
 * GETU32(p)      - read a u32 from p; never writes through p.
 * PUTU32(ct, st) - store the u32 value st at ct.  Expands to a single
 *                  statement, so it is safe in unbraced if/else bodies;
 *                  terminate the invocation with ';'.
 */
#undef GETU32
#define GETU32(p) (*((const u32*)(p)))
#undef PUTU32
#define PUTU32(ct,st) do { *((u32*)(ct)) = (st); } while (0)
53
/*
 * u64    - unsigned 64-bit integer type used for the combined lookup tables.
 * U64(C) - appends the matching unsigned 64-bit literal suffix to constant C.
 *
 * Windows toolchains other than MinGW use the __int64 spelling; targets that
 * define __arch64__ use unsigned long; everything else uses unsigned long long.
 */
#if !defined(__MINGW32__) && (defined(_WIN32) || defined(_WIN64))
typedef unsigned __int64 u64;
# define U64(C) C##UI64
#elif defined(__arch64__)
typedef unsigned long u64;
# define U64(C) C##UL
#else
typedef unsigned long long u64;
# define U64(C) C##ULL
#endif
64
/*
 * Each Te/Td entry stores the four-byte column twice in a row, so the three
 * rotated variants of a column can be fetched from the same entry simply by
 * starting the load 1, 2 or 3 bytes into it (little-endian byte order).
 *
 * TeN[x] / TdN[x] evaluate to a u64 loaded from an unaligned address; only
 * the low 32 bits are the table value documented below.  The tables are
 * read-only, hence the const-qualified casts.
 *
 * NOTE: the load is 8 bytes wide, so for x == 255 the +1/+2/+3 views touch
 * up to 3 bytes beyond entry 255; the table object must extend that far.
 */
/*
Te [x] = S [x].[02, 01, 01, 03, 02, 01, 01, 03];
Te0[x] = S [x].[02, 01, 01, 03];
Te1[x] = S [x].[03, 02, 01, 01];
Te2[x] = S [x].[01, 03, 02, 01];
Te3[x] = S [x].[01, 01, 03, 02];
*/
#define Te0 ((const u64*)((const u8*)Te+0))
#define Te1 ((const u64*)((const u8*)Te+3))
#define Te2 ((const u64*)((const u8*)Te+2))
#define Te3 ((const u64*)((const u8*)Te+1))
/*
Td [x] = Si[x].[0e, 09, 0d, 0b, 0e, 09, 0d, 0b];
Td0[x] = Si[x].[0e, 09, 0d, 0b];
Td1[x] = Si[x].[0b, 0e, 09, 0d];
Td2[x] = Si[x].[0d, 0b, 0e, 09];
Td3[x] = Si[x].[09, 0d, 0b, 0e];
Td4[x] = Si[x].[01];
*/
#define Td0 ((const u64*)((const u8*)Td+0))
#define Td1 ((const u64*)((const u8*)Td+3))
#define Td2 ((const u64*)((const u8*)Td+2))
#define Td3 ((const u64*)((const u8*)Td+1))
88
89 static const u64 Te[256] = {
90     U64(0xa56363c6a56363c6), U64(0x847c7cf8847c7cf8),
91     U64(0x997777ee997777ee), U64(0x8d7b7bf68d7b7bf6),
92     U64(0x0df2f2ff0df2f2ff), U64(0xbd6b6bd6bd6b6bd6),
93     U64(0xb16f6fdeb16f6fde), U64(0x54c5c59154c5c591),
94     U64(0x5030306050303060), U64(0x0301010203010102),
95     U64(0xa96767cea96767ce), U64(0x7d2b2b567d2b2b56),
96     U64(0x19fefee719fefee7), U64(0x62d7d7b562d7d7b5),
97     U64(0xe6abab4de6abab4d), U64(0x9a7676ec9a7676ec),
98     U64(0x45caca8f45caca8f), U64(0x9d82821f9d82821f),
99     U64(0x40c9c98940c9c989), U64(0x877d7dfa877d7dfa),
100     U64(0x15fafaef15fafaef), U64(0xeb5959b2eb5959b2),
101     U64(0xc947478ec947478e), U64(0x0bf0f0fb0bf0f0fb),
102     U64(0xecadad41ecadad41), U64(0x67d4d4b367d4d4b3),
103     U64(0xfda2a25ffda2a25f), U64(0xeaafaf45eaafaf45),
104     U64(0xbf9c9c23bf9c9c23), U64(0xf7a4a453f7a4a453),
105     U64(0x967272e4967272e4), U64(0x5bc0c09b5bc0c09b),
106     U64(0xc2b7b775c2b7b775), U64(0x1cfdfde11cfdfde1),
107     U64(0xae93933dae93933d), U64(0x6a26264c6a26264c),
108     U64(0x5a36366c5a36366c), U64(0x413f3f7e413f3f7e),
109     U64(0x02f7f7f502f7f7f5), U64(0x4fcccc834fcccc83),
110     U64(0x5c3434685c343468), U64(0xf4a5a551f4a5a551),
111     U64(0x34e5e5d134e5e5d1), U64(0x08f1f1f908f1f1f9),
112     U64(0x937171e2937171e2), U64(0x73d8d8ab73d8d8ab),
113     U64(0x5331316253313162), U64(0x3f15152a3f15152a),
114     U64(0x0c0404080c040408), U64(0x52c7c79552c7c795),
115     U64(0x6523234665232346), U64(0x5ec3c39d5ec3c39d),
116     U64(0x2818183028181830), U64(0xa1969637a1969637),
117     U64(0x0f05050a0f05050a), U64(0xb59a9a2fb59a9a2f),
118     U64(0x0907070e0907070e), U64(0x3612122436121224),
119     U64(0x9b80801b9b80801b), U64(0x3de2e2df3de2e2df),
120     U64(0x26ebebcd26ebebcd), U64(0x6927274e6927274e),
121     U64(0xcdb2b27fcdb2b27f), U64(0x9f7575ea9f7575ea),
122     U64(0x1b0909121b090912), U64(0x9e83831d9e83831d),
123     U64(0x742c2c58742c2c58), U64(0x2e1a1a342e1a1a34),
124     U64(0x2d1b1b362d1b1b36), U64(0xb26e6edcb26e6edc),
125     U64(0xee5a5ab4ee5a5ab4), U64(0xfba0a05bfba0a05b),
126     U64(0xf65252a4f65252a4), U64(0x4d3b3b764d3b3b76),
127     U64(0x61d6d6b761d6d6b7), U64(0xceb3b37dceb3b37d),
128     U64(0x7b2929527b292952), U64(0x3ee3e3dd3ee3e3dd),
129     U64(0x712f2f5e712f2f5e), U64(0x9784841397848413),
130     U64(0xf55353a6f55353a6), U64(0x68d1d1b968d1d1b9),
131     U64(0x0000000000000000), U64(0x2cededc12cededc1),
132     U64(0x6020204060202040), U64(0x1ffcfce31ffcfce3),
133     U64(0xc8b1b179c8b1b179), U64(0xed5b5bb6ed5b5bb6),
134     U64(0xbe6a6ad4be6a6ad4), U64(0x46cbcb8d46cbcb8d),
135     U64(0xd9bebe67d9bebe67), U64(0x4b3939724b393972),
136     U64(0xde4a4a94de4a4a94), U64(0xd44c4c98d44c4c98),
137     U64(0xe85858b0e85858b0), U64(0x4acfcf854acfcf85),
138     U64(0x6bd0d0bb6bd0d0bb), U64(0x2aefefc52aefefc5),
139     U64(0xe5aaaa4fe5aaaa4f), U64(0x16fbfbed16fbfbed),
140     U64(0xc5434386c5434386), U64(0xd74d4d9ad74d4d9a),
141     U64(0x5533336655333366), U64(0x9485851194858511),
142     U64(0xcf45458acf45458a), U64(0x10f9f9e910f9f9e9),
143     U64(0x0602020406020204), U64(0x817f7ffe817f7ffe),
144     U64(0xf05050a0f05050a0), U64(0x443c3c78443c3c78),
145     U64(0xba9f9f25ba9f9f25), U64(0xe3a8a84be3a8a84b),
146     U64(0xf35151a2f35151a2), U64(0xfea3a35dfea3a35d),
147     U64(0xc0404080c0404080), U64(0x8a8f8f058a8f8f05),
148     U64(0xad92923fad92923f), U64(0xbc9d9d21bc9d9d21),
149     U64(0x4838387048383870), U64(0x04f5f5f104f5f5f1),
150     U64(0xdfbcbc63dfbcbc63), U64(0xc1b6b677c1b6b677),
151     U64(0x75dadaaf75dadaaf), U64(0x6321214263212142),
152     U64(0x3010102030101020), U64(0x1affffe51affffe5),
153     U64(0x0ef3f3fd0ef3f3fd), U64(0x6dd2d2bf6dd2d2bf),
154     U64(0x4ccdcd814ccdcd81), U64(0x140c0c18140c0c18),
155     U64(0x3513132635131326), U64(0x2fececc32fececc3),
156     U64(0xe15f5fbee15f5fbe), U64(0xa2979735a2979735),
157     U64(0xcc444488cc444488), U64(0x3917172e3917172e),
158     U64(0x57c4c49357c4c493), U64(0xf2a7a755f2a7a755),
159     U64(0x827e7efc827e7efc), U64(0x473d3d7a473d3d7a),
160     U64(0xac6464c8ac6464c8), U64(0xe75d5dbae75d5dba),
161     U64(0x2b1919322b191932), U64(0x957373e6957373e6),
162     U64(0xa06060c0a06060c0), U64(0x9881811998818119),
163     U64(0xd14f4f9ed14f4f9e), U64(0x7fdcdca37fdcdca3),
164     U64(0x6622224466222244), U64(0x7e2a2a547e2a2a54),
165     U64(0xab90903bab90903b), U64(0x8388880b8388880b),
166     U64(0xca46468cca46468c), U64(0x29eeeec729eeeec7),
167     U64(0xd3b8b86bd3b8b86b), U64(0x3c1414283c141428),
168     U64(0x79dedea779dedea7), U64(0xe25e5ebce25e5ebc),
169     U64(0x1d0b0b161d0b0b16), U64(0x76dbdbad76dbdbad),
170     U64(0x3be0e0db3be0e0db), U64(0x5632326456323264),
171     U64(0x4e3a3a744e3a3a74), U64(0x1e0a0a141e0a0a14),
172     U64(0xdb494992db494992), U64(0x0a06060c0a06060c),
173     U64(0x6c2424486c242448), U64(0xe45c5cb8e45c5cb8),
174     U64(0x5dc2c29f5dc2c29f), U64(0x6ed3d3bd6ed3d3bd),
175     U64(0xefacac43efacac43), U64(0xa66262c4a66262c4),
176     U64(0xa8919139a8919139), U64(0xa4959531a4959531),
177     U64(0x37e4e4d337e4e4d3), U64(0x8b7979f28b7979f2),
178     U64(0x32e7e7d532e7e7d5), U64(0x43c8c88b43c8c88b),
179     U64(0x5937376e5937376e), U64(0xb76d6ddab76d6dda),
180     U64(0x8c8d8d018c8d8d01), U64(0x64d5d5b164d5d5b1),
181     U64(0xd24e4e9cd24e4e9c), U64(0xe0a9a949e0a9a949),
182     U64(0xb46c6cd8b46c6cd8), U64(0xfa5656acfa5656ac),
183     U64(0x07f4f4f307f4f4f3), U64(0x25eaeacf25eaeacf),
184     U64(0xaf6565caaf6565ca), U64(0x8e7a7af48e7a7af4),
185     U64(0xe9aeae47e9aeae47), U64(0x1808081018080810),
186     U64(0xd5baba6fd5baba6f), U64(0x887878f0887878f0),
187     U64(0x6f25254a6f25254a), U64(0x722e2e5c722e2e5c),
188     U64(0x241c1c38241c1c38), U64(0xf1a6a657f1a6a657),
189     U64(0xc7b4b473c7b4b473), U64(0x51c6c69751c6c697),
190     U64(0x23e8e8cb23e8e8cb), U64(0x7cdddda17cdddda1),
191     U64(0x9c7474e89c7474e8), U64(0x211f1f3e211f1f3e),
192     U64(0xdd4b4b96dd4b4b96), U64(0xdcbdbd61dcbdbd61),
193     U64(0x868b8b0d868b8b0d), U64(0x858a8a0f858a8a0f),
194     U64(0x907070e0907070e0), U64(0x423e3e7c423e3e7c),
195     U64(0xc4b5b571c4b5b571), U64(0xaa6666ccaa6666cc),
196     U64(0xd8484890d8484890), U64(0x0503030605030306),
197     U64(0x01f6f6f701f6f6f7), U64(0x120e0e1c120e0e1c),
198     U64(0xa36161c2a36161c2), U64(0x5f35356a5f35356a),
199     U64(0xf95757aef95757ae), U64(0xd0b9b969d0b9b969),
200     U64(0x9186861791868617), U64(0x58c1c19958c1c199),
201     U64(0x271d1d3a271d1d3a), U64(0xb99e9e27b99e9e27),
202     U64(0x38e1e1d938e1e1d9), U64(0x13f8f8eb13f8f8eb),
203     U64(0xb398982bb398982b), U64(0x3311112233111122),
204     U64(0xbb6969d2bb6969d2), U64(0x70d9d9a970d9d9a9),
205     U64(0x898e8e07898e8e07), U64(0xa7949433a7949433),
206     U64(0xb69b9b2db69b9b2d), U64(0x221e1e3c221e1e3c),
207     U64(0x9287871592878715), U64(0x20e9e9c920e9e9c9),
208     U64(0x49cece8749cece87), U64(0xff5555aaff5555aa),
209     U64(0x7828285078282850), U64(0x7adfdfa57adfdfa5),
210     U64(0x8f8c8c038f8c8c03), U64(0xf8a1a159f8a1a159),
211     U64(0x8089890980898909), U64(0x170d0d1a170d0d1a),
212     U64(0xdabfbf65dabfbf65), U64(0x31e6e6d731e6e6d7),
213     U64(0xc6424284c6424284), U64(0xb86868d0b86868d0),
214     U64(0xc3414182c3414182), U64(0xb0999929b0999929),
215     U64(0x772d2d5a772d2d5a), U64(0x110f0f1e110f0f1e),
216     U64(0xcbb0b07bcbb0b07b), U64(0xfc5454a8fc5454a8),
217     U64(0xd6bbbb6dd6bbbb6d), U64(0x3a16162c3a16162c)
218 };
219
220 static const u64 Td[256] = {
221     U64(0x50a7f45150a7f451), U64(0x5365417e5365417e),
222     U64(0xc3a4171ac3a4171a), U64(0x965e273a965e273a),
223     U64(0xcb6bab3bcb6bab3b), U64(0xf1459d1ff1459d1f),
224     U64(0xab58faacab58faac), U64(0x9303e34b9303e34b),
225     U64(0x55fa302055fa3020), U64(0xf66d76adf66d76ad),
226     U64(0x9176cc889176cc88), U64(0x254c02f5254c02f5),
227     U64(0xfcd7e54ffcd7e54f), U64(0xd7cb2ac5d7cb2ac5),
228     U64(0x8044352680443526), U64(0x8fa362b58fa362b5),
229     U64(0x495ab1de495ab1de), U64(0x671bba25671bba25),
230     U64(0x980eea45980eea45), U64(0xe1c0fe5de1c0fe5d),
231     U64(0x02752fc302752fc3), U64(0x12f04c8112f04c81),
232     U64(0xa397468da397468d), U64(0xc6f9d36bc6f9d36b),
233     U64(0xe75f8f03e75f8f03), U64(0x959c9215959c9215),
234     U64(0xeb7a6dbfeb7a6dbf), U64(0xda595295da595295),
235     U64(0x2d83bed42d83bed4), U64(0xd3217458d3217458),
236     U64(0x2969e0492969e049), U64(0x44c8c98e44c8c98e),
237     U64(0x6a89c2756a89c275), U64(0x78798ef478798ef4),
238     U64(0x6b3e58996b3e5899), U64(0xdd71b927dd71b927),
239     U64(0xb64fe1beb64fe1be), U64(0x17ad88f017ad88f0),
240     U64(0x66ac20c966ac20c9), U64(0xb43ace7db43ace7d),
241     U64(0x184adf63184adf63), U64(0x82311ae582311ae5),
242     U64(0x6033519760335197), U64(0x457f5362457f5362),
243     U64(0xe07764b1e07764b1), U64(0x84ae6bbb84ae6bbb),
244     U64(0x1ca081fe1ca081fe), U64(0x942b08f9942b08f9),
245     U64(0x5868487058684870), U64(0x19fd458f19fd458f),
246     U64(0x876cde94876cde94), U64(0xb7f87b52b7f87b52),
247     U64(0x23d373ab23d373ab), U64(0xe2024b72e2024b72),
248     U64(0x578f1fe3578f1fe3), U64(0x2aab55662aab5566),
249     U64(0x0728ebb20728ebb2), U64(0x03c2b52f03c2b52f),
250     U64(0x9a7bc5869a7bc586), U64(0xa50837d3a50837d3),
251     U64(0xf2872830f2872830), U64(0xb2a5bf23b2a5bf23),
252     U64(0xba6a0302ba6a0302), U64(0x5c8216ed5c8216ed),
253     U64(0x2b1ccf8a2b1ccf8a), U64(0x92b479a792b479a7),
254     U64(0xf0f207f3f0f207f3), U64(0xa1e2694ea1e2694e),
255     U64(0xcdf4da65cdf4da65), U64(0xd5be0506d5be0506),
256     U64(0x1f6234d11f6234d1), U64(0x8afea6c48afea6c4),
257     U64(0x9d532e349d532e34), U64(0xa055f3a2a055f3a2),
258     U64(0x32e18a0532e18a05), U64(0x75ebf6a475ebf6a4),
259     U64(0x39ec830b39ec830b), U64(0xaaef6040aaef6040),
260     U64(0x069f715e069f715e), U64(0x51106ebd51106ebd),
261     U64(0xf98a213ef98a213e), U64(0x3d06dd963d06dd96),
262     U64(0xae053eddae053edd), U64(0x46bde64d46bde64d),
263     U64(0xb58d5491b58d5491), U64(0x055dc471055dc471),
264     U64(0x6fd406046fd40604), U64(0xff155060ff155060),
265     U64(0x24fb981924fb9819), U64(0x97e9bdd697e9bdd6),
266     U64(0xcc434089cc434089), U64(0x779ed967779ed967),
267     U64(0xbd42e8b0bd42e8b0), U64(0x888b8907888b8907),
268     U64(0x385b19e7385b19e7), U64(0xdbeec879dbeec879),
269     U64(0x470a7ca1470a7ca1), U64(0xe90f427ce90f427c),
270     U64(0xc91e84f8c91e84f8), U64(0x0000000000000000),
271     U64(0x8386800983868009), U64(0x48ed2b3248ed2b32),
272     U64(0xac70111eac70111e), U64(0x4e725a6c4e725a6c),
273     U64(0xfbff0efdfbff0efd), U64(0x5638850f5638850f),
274     U64(0x1ed5ae3d1ed5ae3d), U64(0x27392d3627392d36),
275     U64(0x64d90f0a64d90f0a), U64(0x21a65c6821a65c68),
276     U64(0xd1545b9bd1545b9b), U64(0x3a2e36243a2e3624),
277     U64(0xb1670a0cb1670a0c), U64(0x0fe757930fe75793),
278     U64(0xd296eeb4d296eeb4), U64(0x9e919b1b9e919b1b),
279     U64(0x4fc5c0804fc5c080), U64(0xa220dc61a220dc61),
280     U64(0x694b775a694b775a), U64(0x161a121c161a121c),
281     U64(0x0aba93e20aba93e2), U64(0xe52aa0c0e52aa0c0),
282     U64(0x43e0223c43e0223c), U64(0x1d171b121d171b12),
283     U64(0x0b0d090e0b0d090e), U64(0xadc78bf2adc78bf2),
284     U64(0xb9a8b62db9a8b62d), U64(0xc8a91e14c8a91e14),
285     U64(0x8519f1578519f157), U64(0x4c0775af4c0775af),
286     U64(0xbbdd99eebbdd99ee), U64(0xfd607fa3fd607fa3),
287     U64(0x9f2601f79f2601f7), U64(0xbcf5725cbcf5725c),
288     U64(0xc53b6644c53b6644), U64(0x347efb5b347efb5b),
289     U64(0x7629438b7629438b), U64(0xdcc623cbdcc623cb),
290     U64(0x68fcedb668fcedb6), U64(0x63f1e4b863f1e4b8),
291     U64(0xcadc31d7cadc31d7), U64(0x1085634210856342),
292     U64(0x4022971340229713), U64(0x2011c6842011c684),
293     U64(0x7d244a857d244a85), U64(0xf83dbbd2f83dbbd2),
294     U64(0x1132f9ae1132f9ae), U64(0x6da129c76da129c7),
295     U64(0x4b2f9e1d4b2f9e1d), U64(0xf330b2dcf330b2dc),
296     U64(0xec52860dec52860d), U64(0xd0e3c177d0e3c177),
297     U64(0x6c16b32b6c16b32b), U64(0x99b970a999b970a9),
298     U64(0xfa489411fa489411), U64(0x2264e9472264e947),
299     U64(0xc48cfca8c48cfca8), U64(0x1a3ff0a01a3ff0a0),
300     U64(0xd82c7d56d82c7d56), U64(0xef903322ef903322),
301     U64(0xc74e4987c74e4987), U64(0xc1d138d9c1d138d9),
302     U64(0xfea2ca8cfea2ca8c), U64(0x360bd498360bd498),
303     U64(0xcf81f5a6cf81f5a6), U64(0x28de7aa528de7aa5),
304     U64(0x268eb7da268eb7da), U64(0xa4bfad3fa4bfad3f),
305     U64(0xe49d3a2ce49d3a2c), U64(0x0d9278500d927850),
306     U64(0x9bcc5f6a9bcc5f6a), U64(0x62467e5462467e54),
307     U64(0xc2138df6c2138df6), U64(0xe8b8d890e8b8d890),
308     U64(0x5ef7392e5ef7392e), U64(0xf5afc382f5afc382),
309     U64(0xbe805d9fbe805d9f), U64(0x7c93d0697c93d069),
310     U64(0xa92dd56fa92dd56f), U64(0xb31225cfb31225cf),
311     U64(0x3b99acc83b99acc8), U64(0xa77d1810a77d1810),
312     U64(0x6e639ce86e639ce8), U64(0x7bbb3bdb7bbb3bdb),
313     U64(0x097826cd097826cd), U64(0xf418596ef418596e),
314     U64(0x01b79aec01b79aec), U64(0xa89a4f83a89a4f83),
315     U64(0x656e95e6656e95e6), U64(0x7ee6ffaa7ee6ffaa),
316     U64(0x08cfbc2108cfbc21), U64(0xe6e815efe6e815ef),
317     U64(0xd99be7bad99be7ba), U64(0xce366f4ace366f4a),
318     U64(0xd4099fead4099fea), U64(0xd67cb029d67cb029),
319     U64(0xafb2a431afb2a431), U64(0x31233f2a31233f2a),
320     U64(0x3094a5c63094a5c6), U64(0xc066a235c066a235),
321     U64(0x37bc4e7437bc4e74), U64(0xa6ca82fca6ca82fc),
322     U64(0xb0d090e0b0d090e0), U64(0x15d8a73315d8a733),
323     U64(0x4a9804f14a9804f1), U64(0xf7daec41f7daec41),
324     U64(0x0e50cd7f0e50cd7f), U64(0x2ff691172ff69117),
325     U64(0x8dd64d768dd64d76), U64(0x4db0ef434db0ef43),
326     U64(0x544daacc544daacc), U64(0xdf0496e4df0496e4),
327     U64(0xe3b5d19ee3b5d19e), U64(0x1b886a4c1b886a4c),
328     U64(0xb81f2cc1b81f2cc1), U64(0x7f5165467f516546),
329     U64(0x04ea5e9d04ea5e9d), U64(0x5d358c015d358c01),
330     U64(0x737487fa737487fa), U64(0x2e410bfb2e410bfb),
331     U64(0x5a1d67b35a1d67b3), U64(0x52d2db9252d2db92),
332     U64(0x335610e9335610e9), U64(0x1347d66d1347d66d),
333     U64(0x8c61d79a8c61d79a), U64(0x7a0ca1377a0ca137),
334     U64(0x8e14f8598e14f859), U64(0x893c13eb893c13eb),
335     U64(0xee27a9ceee27a9ce), U64(0x35c961b735c961b7),
336     U64(0xede51ce1ede51ce1), U64(0x3cb1477a3cb1477a),
337     U64(0x59dfd29c59dfd29c), U64(0x3f73f2553f73f255),
338     U64(0x79ce141879ce1418), U64(0xbf37c773bf37c773),
339     U64(0xeacdf753eacdf753), U64(0x5baafd5f5baafd5f),
340     U64(0x146f3ddf146f3ddf), U64(0x86db447886db4478),
341     U64(0x81f3afca81f3afca), U64(0x3ec468b93ec468b9),
342     U64(0x2c3424382c342438), U64(0x5f40a3c25f40a3c2),
343     U64(0x72c31d1672c31d16), U64(0x0c25e2bc0c25e2bc),
344     U64(0x8b493c288b493c28), U64(0x41950dff41950dff),
345     U64(0x7101a8397101a839), U64(0xdeb30c08deb30c08),
346     U64(0x9ce4b4d89ce4b4d8), U64(0x90c1566490c15664),
347     U64(0x6184cb7b6184cb7b), U64(0x70b632d570b632d5),
348     U64(0x745c6c48745c6c48), U64(0x4257b8d04257b8d0)
349 };
350 static const u8 Td4[256] = {
351     0x52U, 0x09U, 0x6aU, 0xd5U, 0x30U, 0x36U, 0xa5U, 0x38U,
352     0xbfU, 0x40U, 0xa3U, 0x9eU, 0x81U, 0xf3U, 0xd7U, 0xfbU,
353     0x7cU, 0xe3U, 0x39U, 0x82U, 0x9bU, 0x2fU, 0xffU, 0x87U,
354     0x34U, 0x8eU, 0x43U, 0x44U, 0xc4U, 0xdeU, 0xe9U, 0xcbU,
355     0x54U, 0x7bU, 0x94U, 0x32U, 0xa6U, 0xc2U, 0x23U, 0x3dU,
356     0xeeU, 0x4cU, 0x95U, 0x0bU, 0x42U, 0xfaU, 0xc3U, 0x4eU,
357     0x08U, 0x2eU, 0xa1U, 0x66U, 0x28U, 0xd9U, 0x24U, 0xb2U,
358     0x76U, 0x5bU, 0xa2U, 0x49U, 0x6dU, 0x8bU, 0xd1U, 0x25U,
359     0x72U, 0xf8U, 0xf6U, 0x64U, 0x86U, 0x68U, 0x98U, 0x16U,
360     0xd4U, 0xa4U, 0x5cU, 0xccU, 0x5dU, 0x65U, 0xb6U, 0x92U,
361     0x6cU, 0x70U, 0x48U, 0x50U, 0xfdU, 0xedU, 0xb9U, 0xdaU,
362     0x5eU, 0x15U, 0x46U, 0x57U, 0xa7U, 0x8dU, 0x9dU, 0x84U,
363     0x90U, 0xd8U, 0xabU, 0x00U, 0x8cU, 0xbcU, 0xd3U, 0x0aU,
364     0xf7U, 0xe4U, 0x58U, 0x05U, 0xb8U, 0xb3U, 0x45U, 0x06U,
365     0xd0U, 0x2cU, 0x1eU, 0x8fU, 0xcaU, 0x3fU, 0x0fU, 0x02U,
366     0xc1U, 0xafU, 0xbdU, 0x03U, 0x01U, 0x13U, 0x8aU, 0x6bU,
367     0x3aU, 0x91U, 0x11U, 0x41U, 0x4fU, 0x67U, 0xdcU, 0xeaU,
368     0x97U, 0xf2U, 0xcfU, 0xceU, 0xf0U, 0xb4U, 0xe6U, 0x73U,
369     0x96U, 0xacU, 0x74U, 0x22U, 0xe7U, 0xadU, 0x35U, 0x85U,
370     0xe2U, 0xf9U, 0x37U, 0xe8U, 0x1cU, 0x75U, 0xdfU, 0x6eU,
371     0x47U, 0xf1U, 0x1aU, 0x71U, 0x1dU, 0x29U, 0xc5U, 0x89U,
372     0x6fU, 0xb7U, 0x62U, 0x0eU, 0xaaU, 0x18U, 0xbeU, 0x1bU,
373     0xfcU, 0x56U, 0x3eU, 0x4bU, 0xc6U, 0xd2U, 0x79U, 0x20U,
374     0x9aU, 0xdbU, 0xc0U, 0xfeU, 0x78U, 0xcdU, 0x5aU, 0xf4U,
375     0x1fU, 0xddU, 0xa8U, 0x33U, 0x88U, 0x07U, 0xc7U, 0x31U,
376     0xb1U, 0x12U, 0x10U, 0x59U, 0x27U, 0x80U, 0xecU, 0x5fU,
377     0x60U, 0x51U, 0x7fU, 0xa9U, 0x19U, 0xb5U, 0x4aU, 0x0dU,
378     0x2dU, 0xe5U, 0x7aU, 0x9fU, 0x93U, 0xc9U, 0x9cU, 0xefU,
379     0xa0U, 0xe0U, 0x3bU, 0x4dU, 0xaeU, 0x2aU, 0xf5U, 0xb0U,
380     0xc8U, 0xebU, 0xbbU, 0x3cU, 0x83U, 0x53U, 0x99U, 0x61U,
381     0x17U, 0x2bU, 0x04U, 0x7eU, 0xbaU, 0x77U, 0xd6U, 0x26U,
382     0xe1U, 0x69U, 0x14U, 0x63U, 0x55U, 0x21U, 0x0cU, 0x7dU
383 };
384
385 static const u32 rcon[] = {
386     0x00000001U, 0x00000002U, 0x00000004U, 0x00000008U,
387     0x00000010U, 0x00000020U, 0x00000040U, 0x00000080U,
388     0x0000001bU, 0x00000036U, /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */
389 };
390
391 /**
392  * Expand the cipher key into the encryption key schedule.
393  */
394 int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
395                         AES_KEY *key) {
396
397         u32 *rk;
398         int i = 0;
399         u32 temp;
400
401         if (!userKey || !key)
402                 return -1;
403         if (bits != 128 && bits != 192 && bits != 256)
404                 return -2;
405
406         rk = key->rd_key;
407
408         if (bits==128)
409                 key->rounds = 10;
410         else if (bits==192)
411                 key->rounds = 12;
412         else
413                 key->rounds = 14;
414
415         rk[0] = GETU32(userKey     );
416         rk[1] = GETU32(userKey +  4);
417         rk[2] = GETU32(userKey +  8);
418         rk[3] = GETU32(userKey + 12);
419         if (bits == 128) {
420                 while (1) {
421                         temp  = rk[3];
422                         rk[4] = rk[0] ^
423                                 (Te2[(temp >>  8) & 0xff] & 0x000000ffU) ^
424                                 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
425                                 (Te0[(temp >> 24)       ] & 0x00ff0000U) ^
426                                 (Te1[(temp      ) & 0xff] & 0xff000000U) ^
427                                 rcon[i];
428                         rk[5] = rk[1] ^ rk[4];
429                         rk[6] = rk[2] ^ rk[5];
430                         rk[7] = rk[3] ^ rk[6];
431                         if (++i == 10) {
432                                 return 0;
433                         }
434                         rk += 4;
435                 }
436         }
437         rk[4] = GETU32(userKey + 16);
438         rk[5] = GETU32(userKey + 20);
439         if (bits == 192) {
440                 while (1) {
441                         temp = rk[ 5];
442                         rk[ 6] = rk[ 0] ^
443                                 (Te2[(temp >>  8) & 0xff] & 0x000000ffU) ^
444                                 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
445                                 (Te0[(temp >> 24)       ] & 0x00ff0000U) ^
446                                 (Te1[(temp      ) & 0xff] & 0xff000000U) ^
447                                 rcon[i];
448                         rk[ 7] = rk[ 1] ^ rk[ 6];
449                         rk[ 8] = rk[ 2] ^ rk[ 7];
450                         rk[ 9] = rk[ 3] ^ rk[ 8];
451                         if (++i == 8) {
452                                 return 0;
453                         }
454                         rk[10] = rk[ 4] ^ rk[ 9];
455                         rk[11] = rk[ 5] ^ rk[10];
456                         rk += 6;
457                 }
458         }
459         rk[6] = GETU32(userKey + 24);
460         rk[7] = GETU32(userKey + 28);
461         if (bits == 256) {
462                 while (1) {
463                         temp = rk[ 7];
464                         rk[ 8] = rk[ 0] ^
465                                 (Te2[(temp >>  8) & 0xff] & 0x000000ffU) ^
466                                 (Te3[(temp >> 16) & 0xff] & 0x0000ff00U) ^
467                                 (Te0[(temp >> 24)       ] & 0x00ff0000U) ^
468                                 (Te1[(temp      ) & 0xff] & 0xff000000U) ^
469                                 rcon[i];
470                         rk[ 9] = rk[ 1] ^ rk[ 8];
471                         rk[10] = rk[ 2] ^ rk[ 9];
472                         rk[11] = rk[ 3] ^ rk[10];
473                         if (++i == 7) {
474                                 return 0;
475                         }
476                         temp = rk[11];
477                         rk[12] = rk[ 4] ^
478                                 (Te2[(temp      ) & 0xff] & 0x000000ffU) ^
479                                 (Te3[(temp >>  8) & 0xff] & 0x0000ff00U) ^
480                                 (Te0[(temp >> 16) & 0xff] & 0x00ff0000U) ^
481                                 (Te1[(temp >> 24)       ] & 0xff000000U);
482                         rk[13] = rk[ 5] ^ rk[12];
483                         rk[14] = rk[ 6] ^ rk[13];
484                         rk[15] = rk[ 7] ^ rk[14];
485
486                         rk += 8;
487                 }
488         }
489         return 0;
490 }
491
492 /**
493  * Expand the cipher key into the decryption key schedule.
494  */
495 int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
496                          AES_KEY *key) {
497
498         u32 *rk;
499         int i, j, status;
500         u32 temp;
501
502         /* first, start with an encryption schedule */
503         status = AES_set_encrypt_key(userKey, bits, key);
504         if (status < 0)
505                 return status;
506
507         rk = key->rd_key;
508
509         /* invert the order of the round keys: */
510         for (i = 0, j = 4*(key->rounds); i < j; i += 4, j -= 4) {
511                 temp = rk[i    ]; rk[i    ] = rk[j    ]; rk[j    ] = temp;
512                 temp = rk[i + 1]; rk[i + 1] = rk[j + 1]; rk[j + 1] = temp;
513                 temp = rk[i + 2]; rk[i + 2] = rk[j + 2]; rk[j + 2] = temp;
514                 temp = rk[i + 3]; rk[i + 3] = rk[j + 3]; rk[j + 3] = temp;
515         }
516         /* apply the inverse MixColumn transform to all round keys but the first and the last: */
517         for (i = 1; i < (key->rounds); i++) {
518                 rk += 4;
519                 rk[0] =
520                         Td0[Te2[(rk[0]      ) & 0xff] & 0xff] ^
521                         Td1[Te2[(rk[0] >>  8) & 0xff] & 0xff] ^
522                         Td2[Te2[(rk[0] >> 16) & 0xff] & 0xff] ^
523                         Td3[Te2[(rk[0] >> 24)       ] & 0xff];
524                 rk[1] =
525                         Td0[Te2[(rk[1]      ) & 0xff] & 0xff] ^
526                         Td1[Te2[(rk[1] >>  8) & 0xff] & 0xff] ^
527                         Td2[Te2[(rk[1] >> 16) & 0xff] & 0xff] ^
528                         Td3[Te2[(rk[1] >> 24)       ] & 0xff];
529                 rk[2] =
530                         Td0[Te2[(rk[2]      ) & 0xff] & 0xff] ^
531                         Td1[Te2[(rk[2] >>  8) & 0xff] & 0xff] ^
532                         Td2[Te2[(rk[2] >> 16) & 0xff] & 0xff] ^
533                         Td3[Te2[(rk[2] >> 24)       ] & 0xff];
534                 rk[3] =
535                         Td0[Te2[(rk[3]      ) & 0xff] & 0xff] ^
536                         Td1[Te2[(rk[3] >>  8) & 0xff] & 0xff] ^
537                         Td2[Te2[(rk[3] >> 16) & 0xff] & 0xff] ^
538                         Td3[Te2[(rk[3] >> 24)       ] & 0xff];
539         }
540         return 0;
541 }
542
543 /*
544  * Encrypt a single block
545  * in and out can overlap
546  */
547 void AES_encrypt(const unsigned char *in, unsigned char *out,
548                  const AES_KEY *key) {
549
550         const u32 *rk;
551         u32 s0, s1, s2, s3, t0, t1, t2, t3;
552         int r;
553
554         assert(in && out && key);
555         rk = key->rd_key;
556
557         /*
558          * map byte array block to cipher state
559          * and add initial round key:
560          */
561         s0 = GETU32(in     ) ^ rk[0];
562         s1 = GETU32(in +  4) ^ rk[1];
563         s2 = GETU32(in +  8) ^ rk[2];
564         s3 = GETU32(in + 12) ^ rk[3];
565
566         t0 =
567             Te0[(s0      ) & 0xff] ^
568             Te1[(s1 >>  8) & 0xff] ^
569             Te2[(s2 >> 16) & 0xff] ^
570             Te3[(s3 >> 24)       ] ^
571             rk[4];
572         t1 =
573             Te0[(s1      ) & 0xff] ^
574             Te1[(s2 >>  8) & 0xff] ^
575             Te2[(s3 >> 16) & 0xff] ^
576             Te3[(s0 >> 24)       ] ^
577             rk[5];
578         t2 =
579             Te0[(s2      ) & 0xff] ^
580             Te1[(s3 >>  8) & 0xff] ^
581             Te2[(s0 >> 16) & 0xff] ^
582             Te3[(s1 >> 24)       ] ^
583             rk[6];
584         t3 =
585             Te0[(s3      ) & 0xff] ^
586             Te1[(s0 >>  8) & 0xff] ^
587             Te2[(s1 >> 16) & 0xff] ^
588             Te3[(s2 >> 24)       ] ^
589             rk[7];
590
591     /*
592      * Nr - 2 full rounds:
593      */
594     for (rk+=8,r=(key->rounds-2)>>1; r>0; rk+=8,r--) {
595         s0 =
596             Te0[(t0      ) & 0xff] ^
597             Te1[(t1 >>  8) & 0xff] ^
598             Te2[(t2 >> 16) & 0xff] ^
599             Te3[(t3 >> 24)       ] ^
600             rk[0];
601         s1 =
602             Te0[(t1      ) & 0xff] ^
603             Te1[(t2 >>  8) & 0xff] ^
604             Te2[(t3 >> 16) & 0xff] ^
605             Te3[(t0 >> 24)       ] ^
606             rk[1];
607         s2 =
608             Te0[(t2      ) & 0xff] ^
609             Te1[(t3 >>  8) & 0xff] ^
610             Te2[(t0 >> 16) & 0xff] ^
611             Te3[(t1 >> 24)       ] ^
612             rk[2];
613         s3 =
614             Te0[(t3      ) & 0xff] ^
615             Te1[(t0 >>  8) & 0xff] ^
616             Te2[(t1 >> 16) & 0xff] ^
617             Te3[(t2 >> 24)       ] ^
618             rk[3];
619  
620         t0 =
621             Te0[(s0      ) & 0xff] ^
622             Te1[(s1 >>  8) & 0xff] ^
623             Te2[(s2 >> 16) & 0xff] ^
624             Te3[(s3 >> 24)       ] ^
625             rk[4];
626         t1 =
627             Te0[(s1      ) & 0xff] ^
628             Te1[(s2 >>  8) & 0xff] ^
629             Te2[(s3 >> 16) & 0xff] ^
630             Te3[(s0 >> 24)       ] ^
631             rk[5];
632         t2 =
633             Te0[(s2      ) & 0xff] ^
634             Te1[(s3 >>  8) & 0xff] ^
635             Te2[(s0 >> 16) & 0xff] ^
636             Te3[(s1 >> 24)       ] ^
637             rk[6];
638         t3 =
639             Te0[(s3      ) & 0xff] ^
640             Te1[(s0 >>  8) & 0xff] ^
641             Te2[(s1 >> 16) & 0xff] ^
642             Te3[(s2 >> 24)       ] ^
643             rk[7];
644     }
645     /*
646          * apply last round and
647          * map cipher state to byte array block:
648          */
649         s0 =
650                 (Te2[(t0      ) & 0xff] & 0x000000ffU) ^
651                 (Te3[(t1 >>  8) & 0xff] & 0x0000ff00U) ^
652                 (Te0[(t2 >> 16) & 0xff] & 0x00ff0000U) ^
653                 (Te1[(t3 >> 24)       ] & 0xff000000U) ^
654                 rk[0];
655         PUTU32(out     , s0);
656         s1 =
657                 (Te2[(t1      ) & 0xff] & 0x000000ffU) ^
658                 (Te3[(t2 >>  8) & 0xff] & 0x0000ff00U) ^
659                 (Te0[(t3 >> 16) & 0xff] & 0x00ff0000U) ^
660                 (Te1[(t0 >> 24)       ] & 0xff000000U) ^
661                 rk[1];
662         PUTU32(out +  4, s1);
663         s2 =
664                 (Te2[(t2      ) & 0xff] & 0x000000ffU) ^
665                 (Te3[(t3 >>  8) & 0xff] & 0x0000ff00U) ^
666                 (Te0[(t0 >> 16) & 0xff] & 0x00ff0000U) ^
667                 (Te1[(t1 >> 24)       ] & 0xff000000U) ^
668                 rk[2];
669         PUTU32(out +  8, s2);
670         s3 =
671                 (Te2[(t3      ) & 0xff] & 0x000000ffU) ^
672                 (Te3[(t0 >>  8) & 0xff] & 0x0000ff00U) ^
673                 (Te0[(t1 >> 16) & 0xff] & 0x00ff0000U) ^
674                 (Te1[(t2 >> 24)       ] & 0xff000000U) ^
675                 rk[3];
676         PUTU32(out + 12, s3);
677 }
678
679 /*
680  * Decrypt a single block
681  * in and out can overlap
682  */
683 void AES_decrypt(const unsigned char *in, unsigned char *out,
684                  const AES_KEY *key) {
685
686         const u32 *rk;
687         u32 s0, s1, s2, s3, t0, t1, t2, t3;
688         int r;
689
690         assert(in && out && key);
691         rk = key->rd_key;
692
693         /*
694          * map byte array block to cipher state
695          * and add initial round key:
696          */
697         s0 = GETU32(in     ) ^ rk[0];
698         s1 = GETU32(in +  4) ^ rk[1];
699         s2 = GETU32(in +  8) ^ rk[2];
700         s3 = GETU32(in + 12) ^ rk[3];
701
702         t0 =
703             Td0[(s0      ) & 0xff] ^
704             Td1[(s3 >>  8) & 0xff] ^
705             Td2[(s2 >> 16) & 0xff] ^
706             Td3[(s1 >> 24)       ] ^
707             rk[4];
708         t1 =
709             Td0[(s1      ) & 0xff] ^
710             Td1[(s0 >>  8) & 0xff] ^
711             Td2[(s3 >> 16) & 0xff] ^
712             Td3[(s2 >> 24)       ] ^
713             rk[5];
714         t2 =
715             Td0[(s2      ) & 0xff] ^
716             Td1[(s1 >>  8) & 0xff] ^
717             Td2[(s0 >> 16) & 0xff] ^
718             Td3[(s3 >> 24) & 0xff] ^
719             rk[6];
720         t3 =
721             Td0[(s3      ) & 0xff] ^
722             Td1[(s2 >>  8) & 0xff] ^
723             Td2[(s1 >> 16) & 0xff] ^
724             Td3[(s0 >> 24)       ] ^
725             rk[7];
726
727     /*
728      * Nr - 2 full rounds:
729      */
730     for (rk+=8,r=(key->rounds-2)>>1; r>0; rk+=8,r--) {
731         s0 =
732             Td0[(t0      ) & 0xff] ^
733             Td1[(t3 >>  8) & 0xff] ^
734             Td2[(t2 >> 16) & 0xff] ^
735             Td3[(t1 >> 24)       ] ^
736             rk[0];
737         s1 =
738             Td0[(t1      ) & 0xff] ^
739             Td1[(t0 >>  8) & 0xff] ^
740             Td2[(t3 >> 16) & 0xff] ^
741             Td3[(t2 >> 24)       ] ^
742             rk[1];
743         s2 =
744             Td0[(t2      ) & 0xff] ^
745             Td1[(t1 >>  8) & 0xff] ^
746             Td2[(t0 >> 16) & 0xff] ^
747             Td3[(t3 >> 24)       ] ^
748             rk[2];
749         s3 =
750             Td0[(t3      ) & 0xff] ^
751             Td1[(t2 >>  8) & 0xff] ^
752             Td2[(t1 >> 16) & 0xff] ^
753             Td3[(t0 >> 24)       ] ^
754             rk[3];
755
756         t0 =
757             Td0[(s0      ) & 0xff] ^
758             Td1[(s3 >>  8) & 0xff] ^
759             Td2[(s2 >> 16) & 0xff] ^
760             Td3[(s1 >> 24)       ] ^
761             rk[4];
762         t1 =
763             Td0[(s1      ) & 0xff] ^
764             Td1[(s0 >>  8) & 0xff] ^
765             Td2[(s3 >> 16) & 0xff] ^
766             Td3[(s2 >> 24)       ] ^
767             rk[5];
768         t2 =
769             Td0[(s2      ) & 0xff] ^
770             Td1[(s1 >>  8) & 0xff] ^
771             Td2[(s0 >> 16) & 0xff] ^
772             Td3[(s3 >> 24) & 0xff] ^
773             rk[6];
774         t3 =
775             Td0[(s3      ) & 0xff] ^
776             Td1[(s2 >>  8) & 0xff] ^
777             Td2[(s1 >> 16) & 0xff] ^
778             Td3[(s0 >> 24)       ] ^
779             rk[7];
780     }
781     /*
782          * apply last round and
783          * map cipher state to byte array block:
784          */
785         s0 =
786                 (Td4[(t0      ) & 0xff])       ^
787                 (Td4[(t3 >>  8) & 0xff] <<  8) ^
788                 (Td4[(t2 >> 16) & 0xff] << 16) ^
789                 (Td4[(t1 >> 24)       ] << 24) ^
790                 rk[0];
791         PUTU32(out     , s0);
792         s1 =
793                 (Td4[(t1      ) & 0xff])       ^
794                 (Td4[(t0 >>  8) & 0xff] <<  8) ^
795                 (Td4[(t3 >> 16) & 0xff] << 16) ^
796                 (Td4[(t2 >> 24)       ] << 24) ^
797                 rk[1];
798         PUTU32(out +  4, s1);
799         s2 =
800                 (Td4[(t2      ) & 0xff])       ^
801                 (Td4[(t1 >>  8) & 0xff] <<  8) ^
802                 (Td4[(t0 >> 16) & 0xff] << 16) ^
803                 (Td4[(t3 >> 24)       ] << 24) ^
804                 rk[2];
805         PUTU32(out +  8, s2);
806         s3 =
807                 (Td4[(t3      ) & 0xff])       ^
808                 (Td4[(t2 >>  8) & 0xff] <<  8) ^
809                 (Td4[(t1 >> 16) & 0xff] << 16) ^
810                 (Td4[(t0 >> 24)       ] << 24) ^
811                 rk[3];
812         PUTU32(out + 12, s3);
813 }