bn/asm/armv4-gf2m.pl, modes/asm/ghash-armv4.pl: faster multiplication
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32 as a direct u32 load/store combined with
 * BSWAP4, because alignment is ensured.
 *
 * Both expansions are wrapped in parentheses so that each macro is a
 * single expression no matter where it is used (operand of a cast,
 * branch of ?:, body of an unbraced if, ...).
 */
#undef  GETU32
#define GETU32(p)       (BSWAP4(*(const u32 *)(p)))
#undef  PUTU32
#define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
70
/*
 * PACK(s) moves the 16-bit constant s into the top 16 bits of a size_t.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))

/*
 * REDUCE1BIT(V) shifts the 128-bit value V (a u128 lvalue with .hi/.lo
 * halves) right by one bit. If the bit shifted out of V.lo was set, the
 * constant 0xe1 is XOR-ed into the most significant byte of V.hi.
 *
 * The branch on sizeof(size_t) only selects whether the mask is built
 * in 64 or in 32 bits; both branches produce the same V.
 *
 * V is evaluated several times, so it must be free of side effects.
 * Every use is parenthesized, which makes arguments such as *ptr work.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
/*
 * GCM_MUL(ctx,Xi): multiply the named field of ctx by the hash key in
 * place. The second argument is substituted for the field name, and
 * ctx is parenthesized so any pointer expression (e.g. &obj) works.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit((ctx)->Xi.u,(ctx)->Htable)
253
254 #elif   TABLE_BITS==4
255
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
327
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * Extra 256+16 bytes per-key plus 512 bytes shared tables
458      * [should] give ~50% improvement... One could have PACK()-ed
459      * the rem_8bit even here, but the priority is to minimize
460      * cache footprint...
461      */ 
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows down procedure by approximately
499      * same time as it makes each loop spin faster. In other words
500      * single block performance is approximately same as straightforward
501      * "4-bit" implementation, and then it goes only faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
/*
 * GCM_MUL(ctx,Xi): multiply the named field of ctx by the hash key in
 * place. The second argument is substituted for the field name, and
 * ctx is parenthesized so any pointer expression (e.g. &obj) works.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* GHASH(ctx,in,len): hash len bytes at in into ctx->Xi. */
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it
 * is still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
582
583 #else   /* TABLE_BITS */
584
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
/*
 * GCM_MUL(ctx,Xi): multiply the named field of ctx by the hash key in
 * place. The second argument is substituted for the field name, and
 * ctx is parenthesized so any pointer expression (e.g. &obj) works.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit((ctx)->Xi.u,(ctx)->H.u)
642
643 #endif
644
/*
 * Prototypes and feature macros for the assembler implementations that
 * can stand in for the C 4-bit routines. GCM_FUNCREF_4BIT is defined
 * for every platform on which the routine is reached through a
 * function pointer.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
/* x86 and x86_64 */
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
/* 32-bit x86: the *_avx names are aliases for the *_clmul routines */
#   define GHASH_ASM_X86
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul

void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  else
void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif  defined(__arm__) || defined(__arm)
/* ARM: the NEON routines are declared for ARMv7 and later only */
#  include "arm_arch.h"
#  if   __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif  defined(__sparc__) || defined(__sparc)
/* SPARC */
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
#endif
695
#ifdef GCM_FUNCREF_4BIT
/*
 * With GCM_FUNCREF_4BIT the multiplication and hashing routines are
 * called through the local function pointers gcm_gmult_p and
 * gcm_ghash_p, which the using function must have in scope.
 * ctx is parenthesized so any pointer expression works.
 */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)((ctx)->Xi.u,(ctx)->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)((ctx)->Xi.u,(ctx)->Htable,in,len)
# endif
#endif
704
705 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
706 {
707         const union { long one; char little; } is_endian = {1};
708
709         memset(ctx,0,sizeof(*ctx));
710         ctx->block = block;
711         ctx->key   = key;
712
713         (*block)(ctx->H.c,ctx->H.c,key);
714
715         if (is_endian.little) {
716                 /* H is stored in host byte order */
717 #ifdef BSWAP8
718                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
719                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
720 #else
721                 u8 *p = ctx->H.c;
722                 u64 hi,lo;
723                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
724                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
725                 ctx->H.u[0] = hi;
726                 ctx->H.u[1] = lo;
727 #endif
728         }
729
730 #if     TABLE_BITS==8
731         gcm_init_8bit(ctx->Htable,ctx->H.u);
732 #elif   TABLE_BITS==4
733 # if    defined(GHASH_ASM_X86_OR_64)
734 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
735         if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
736             OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
737                 if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
738                         gcm_init_avx(ctx->Htable,ctx->H.u);
739                         ctx->gmult = gcm_gmult_avx;
740                         ctx->ghash = gcm_ghash_avx;
741                 } else {
742                         gcm_init_clmul(ctx->Htable,ctx->H.u);
743                         ctx->gmult = gcm_gmult_clmul;
744                         ctx->ghash = gcm_ghash_clmul;
745                 }
746                 return;
747         }
748 #  endif
749         gcm_init_4bit(ctx->Htable,ctx->H.u);
750 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
751 #   if  defined(OPENSSL_IA32_SSE2)
752         if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
753 #   else
754         if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
755 #   endif
756                 ctx->gmult = gcm_gmult_4bit_mmx;
757                 ctx->ghash = gcm_ghash_4bit_mmx;
758         } else {
759                 ctx->gmult = gcm_gmult_4bit_x86;
760                 ctx->ghash = gcm_ghash_4bit_x86;
761         }
762 #  else
763         ctx->gmult = gcm_gmult_4bit;
764         ctx->ghash = gcm_ghash_4bit;
765 #  endif
766 # elif  defined(GHASH_ASM_ARM)
767         if (OPENSSL_armcap_P & ARMV7_NEON) {
768                 gcm_init_neon(ctx->Htable,ctx->H.u);
769                 ctx->gmult = gcm_gmult_neon;
770                 ctx->ghash = gcm_ghash_neon;
771         } else {
772                 gcm_init_4bit(ctx->Htable,ctx->H.u);
773                 ctx->gmult = gcm_gmult_4bit;
774                 ctx->ghash = gcm_ghash_4bit;
775         }
776 # elif  defined(GHASH_ASM_SPARC)
777         if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
778                 gcm_init_vis3(ctx->Htable,ctx->H.u);
779                 ctx->gmult = gcm_gmult_vis3;
780                 ctx->ghash = gcm_ghash_vis3;
781         } else {
782                 gcm_init_4bit(ctx->Htable,ctx->H.u);
783                 ctx->gmult = gcm_gmult_4bit;
784                 ctx->ghash = gcm_ghash_4bit;
785         }
786 # else
787         gcm_init_4bit(ctx->Htable,ctx->H.u);
788 # endif
789 #endif
790 }
791
792 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
793 {
794         const union { long one; char little; } is_endian = {1};
795         unsigned int ctr;
796 #ifdef GCM_FUNCREF_4BIT
797         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
798 #endif
799
800         ctx->Yi.u[0]  = 0;
801         ctx->Yi.u[1]  = 0;
802         ctx->Xi.u[0]  = 0;
803         ctx->Xi.u[1]  = 0;
804         ctx->len.u[0] = 0;      /* AAD length */
805         ctx->len.u[1] = 0;      /* message length */
806         ctx->ares = 0;
807         ctx->mres = 0;
808
809         if (len==12) {
810                 memcpy(ctx->Yi.c,iv,12);
811                 ctx->Yi.c[15]=1;
812                 ctr=1;
813         }
814         else {
815                 size_t i;
816                 u64 len0 = len;
817
818                 while (len>=16) {
819                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
820                         GCM_MUL(ctx,Yi);
821                         iv += 16;
822                         len -= 16;
823                 }
824                 if (len) {
825                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
826                         GCM_MUL(ctx,Yi);
827                 }
828                 len0 <<= 3;
829                 if (is_endian.little) {
830 #ifdef BSWAP8
831                         ctx->Yi.u[1]  ^= BSWAP8(len0);
832 #else
833                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
834                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
835                         ctx->Yi.c[10] ^= (u8)(len0>>40);
836                         ctx->Yi.c[11] ^= (u8)(len0>>32);
837                         ctx->Yi.c[12] ^= (u8)(len0>>24);
838                         ctx->Yi.c[13] ^= (u8)(len0>>16);
839                         ctx->Yi.c[14] ^= (u8)(len0>>8);
840                         ctx->Yi.c[15] ^= (u8)(len0);
841 #endif
842                 }
843                 else
844                         ctx->Yi.u[1]  ^= len0;
845
846                 GCM_MUL(ctx,Yi);
847
848                 if (is_endian.little)
849 #ifdef BSWAP4
850                         ctr = BSWAP4(ctx->Yi.d[3]);
851 #else
852                         ctr = GETU32(ctx->Yi.c+12);
853 #endif
854                 else
855                         ctr = ctx->Yi.d[3];
856         }
857
858         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
859         ++ctr;
860         if (is_endian.little)
861 #ifdef BSWAP4
862                 ctx->Yi.d[3] = BSWAP4(ctr);
863 #else
864                 PUTU32(ctx->Yi.c+12,ctr);
865 #endif
866         else
867                 ctx->Yi.d[3] = ctr;
868 }
869
870 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
871 {
872         size_t i;
873         unsigned int n;
874         u64 alen = ctx->len.u[0];
875 #ifdef GCM_FUNCREF_4BIT
876         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
877 # ifdef GHASH
878         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
879                                 const u8 *inp,size_t len)       = ctx->ghash;
880 # endif
881 #endif
882
883         if (ctx->len.u[1]) return -2;
884
885         alen += len;
886         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
887                 return -1;
888         ctx->len.u[0] = alen;
889
890         n = ctx->ares;
891         if (n) {
892                 while (n && len) {
893                         ctx->Xi.c[n] ^= *(aad++);
894                         --len;
895                         n = (n+1)%16;
896                 }
897                 if (n==0) GCM_MUL(ctx,Xi);
898                 else {
899                         ctx->ares = n;
900                         return 0;
901                 }
902         }
903
904 #ifdef GHASH
905         if ((i = (len&(size_t)-16))) {
906                 GHASH(ctx,aad,i);
907                 aad += i;
908                 len -= i;
909         }
910 #else
911         while (len>=16) {
912                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
913                 GCM_MUL(ctx,Xi);
914                 aad += 16;
915                 len -= 16;
916         }
917 #endif
918         if (len) {
919                 n = (unsigned int)len;
920                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
921         }
922
923         ctx->ares = n;
924         return 0;
925 }
926
927 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
928                 const unsigned char *in, unsigned char *out,
929                 size_t len)
930 {
931         const union { long one; char little; } is_endian = {1};
932         unsigned int n, ctr;
933         size_t i;
934         u64        mlen  = ctx->len.u[1];
935         block128_f block = ctx->block;
936         void      *key   = ctx->key;
937 #ifdef GCM_FUNCREF_4BIT
938         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
939 # ifdef GHASH
940         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
941                                 const u8 *inp,size_t len)       = ctx->ghash;
942 # endif
943 #endif
944
945 #if 0
946         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
947 #endif
948         mlen += len;
949         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
950                 return -1;
951         ctx->len.u[1] = mlen;
952
953         if (ctx->ares) {
954                 /* First call to encrypt finalizes GHASH(AAD) */
955                 GCM_MUL(ctx,Xi);
956                 ctx->ares = 0;
957         }
958
959         if (is_endian.little)
960 #ifdef BSWAP4
961                 ctr = BSWAP4(ctx->Yi.d[3]);
962 #else
963                 ctr = GETU32(ctx->Yi.c+12);
964 #endif
965         else
966                 ctr = ctx->Yi.d[3];
967
968         n = ctx->mres;
969 #if !defined(OPENSSL_SMALL_FOOTPRINT)
970         if (16%sizeof(size_t) == 0) do {        /* always true actually */
971                 if (n) {
972                         while (n && len) {
973                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
974                                 --len;
975                                 n = (n+1)%16;
976                         }
977                         if (n==0) GCM_MUL(ctx,Xi);
978                         else {
979                                 ctx->mres = n;
980                                 return 0;
981                         }
982                 }
983 #if defined(STRICT_ALIGNMENT)
984                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
985                         break;
986 #endif
987 #if defined(GHASH) && defined(GHASH_CHUNK)
988                 while (len>=GHASH_CHUNK) {
989                     size_t j=GHASH_CHUNK;
990
991                     while (j) {
992                         size_t *out_t=(size_t *)out;
993                         const size_t *in_t=(const size_t *)in;
994
995                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
996                         ++ctr;
997                         if (is_endian.little)
998 #ifdef BSWAP4
999                                 ctx->Yi.d[3] = BSWAP4(ctr);
1000 #else
1001                                 PUTU32(ctx->Yi.c+12,ctr);
1002 #endif
1003                         else
1004                                 ctx->Yi.d[3] = ctr;
1005                         for (i=0; i<16/sizeof(size_t); ++i)
1006                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1007                         out += 16;
1008                         in  += 16;
1009                         j   -= 16;
1010                     }
1011                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
1012                     len -= GHASH_CHUNK;
1013                 }
1014                 if ((i = (len&(size_t)-16))) {
1015                     size_t j=i;
1016
1017                     while (len>=16) {
1018                         size_t *out_t=(size_t *)out;
1019                         const size_t *in_t=(const size_t *)in;
1020
1021                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1022                         ++ctr;
1023                         if (is_endian.little)
1024 #ifdef BSWAP4
1025                                 ctx->Yi.d[3] = BSWAP4(ctr);
1026 #else
1027                                 PUTU32(ctx->Yi.c+12,ctr);
1028 #endif
1029                         else
1030                                 ctx->Yi.d[3] = ctr;
1031                         for (i=0; i<16/sizeof(size_t); ++i)
1032                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1033                         out += 16;
1034                         in  += 16;
1035                         len -= 16;
1036                     }
1037                     GHASH(ctx,out-j,j);
1038                 }
1039 #else
1040                 while (len>=16) {
1041                         size_t *out_t=(size_t *)out;
1042                         const size_t *in_t=(const size_t *)in;
1043
1044                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1045                         ++ctr;
1046                         if (is_endian.little)
1047 #ifdef BSWAP4
1048                                 ctx->Yi.d[3] = BSWAP4(ctr);
1049 #else
1050                                 PUTU32(ctx->Yi.c+12,ctr);
1051 #endif
1052                         else
1053                                 ctx->Yi.d[3] = ctr;
1054                         for (i=0; i<16/sizeof(size_t); ++i)
1055                                 ctx->Xi.t[i] ^=
1056                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1057                         GCM_MUL(ctx,Xi);
1058                         out += 16;
1059                         in  += 16;
1060                         len -= 16;
1061                 }
1062 #endif
1063                 if (len) {
1064                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1065                         ++ctr;
1066                         if (is_endian.little)
1067 #ifdef BSWAP4
1068                                 ctx->Yi.d[3] = BSWAP4(ctr);
1069 #else
1070                                 PUTU32(ctx->Yi.c+12,ctr);
1071 #endif
1072                         else
1073                                 ctx->Yi.d[3] = ctr;
1074                         while (len--) {
1075                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1076                                 ++n;
1077                         }
1078                 }
1079
1080                 ctx->mres = n;
1081                 return 0;
1082         } while(0);
1083 #endif
1084         for (i=0;i<len;++i) {
1085                 if (n==0) {
1086                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1087                         ++ctr;
1088                         if (is_endian.little)
1089 #ifdef BSWAP4
1090                                 ctx->Yi.d[3] = BSWAP4(ctr);
1091 #else
1092                                 PUTU32(ctx->Yi.c+12,ctr);
1093 #endif
1094                         else
1095                                 ctx->Yi.d[3] = ctr;
1096                 }
1097                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1098                 n = (n+1)%16;
1099                 if (n==0)
1100                         GCM_MUL(ctx,Xi);
1101         }
1102
1103         ctx->mres = n;
1104         return 0;
1105 }
1106
1107 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1108                 const unsigned char *in, unsigned char *out,
1109                 size_t len)
1110 {
1111         const union { long one; char little; } is_endian = {1};
1112         unsigned int n, ctr;
1113         size_t i;
1114         u64        mlen  = ctx->len.u[1];
1115         block128_f block = ctx->block;
1116         void      *key   = ctx->key;
1117 #ifdef GCM_FUNCREF_4BIT
1118         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1119 # ifdef GHASH
1120         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1121                                 const u8 *inp,size_t len)       = ctx->ghash;
1122 # endif
1123 #endif
1124
1125         mlen += len;
1126         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1127                 return -1;
1128         ctx->len.u[1] = mlen;
1129
1130         if (ctx->ares) {
1131                 /* First call to decrypt finalizes GHASH(AAD) */
1132                 GCM_MUL(ctx,Xi);
1133                 ctx->ares = 0;
1134         }
1135
1136         if (is_endian.little)
1137 #ifdef BSWAP4
1138                 ctr = BSWAP4(ctx->Yi.d[3]);
1139 #else
1140                 ctr = GETU32(ctx->Yi.c+12);
1141 #endif
1142         else
1143                 ctr = ctx->Yi.d[3];
1144
1145         n = ctx->mres;
1146 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1147         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1148                 if (n) {
1149                         while (n && len) {
1150                                 u8 c = *(in++);
1151                                 *(out++) = c^ctx->EKi.c[n];
1152                                 ctx->Xi.c[n] ^= c;
1153                                 --len;
1154                                 n = (n+1)%16;
1155                         }
1156                         if (n==0) GCM_MUL (ctx,Xi);
1157                         else {
1158                                 ctx->mres = n;
1159                                 return 0;
1160                         }
1161                 }
1162 #if defined(STRICT_ALIGNMENT)
1163                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1164                         break;
1165 #endif
1166 #if defined(GHASH) && defined(GHASH_CHUNK)
1167                 while (len>=GHASH_CHUNK) {
1168                     size_t j=GHASH_CHUNK;
1169
1170                     GHASH(ctx,in,GHASH_CHUNK);
1171                     while (j) {
1172                         size_t *out_t=(size_t *)out;
1173                         const size_t *in_t=(const size_t *)in;
1174
1175                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1176                         ++ctr;
1177                         if (is_endian.little)
1178 #ifdef BSWAP4
1179                                 ctx->Yi.d[3] = BSWAP4(ctr);
1180 #else
1181                                 PUTU32(ctx->Yi.c+12,ctr);
1182 #endif
1183                         else
1184                                 ctx->Yi.d[3] = ctr;
1185                         for (i=0; i<16/sizeof(size_t); ++i)
1186                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1187                         out += 16;
1188                         in  += 16;
1189                         j   -= 16;
1190                     }
1191                     len -= GHASH_CHUNK;
1192                 }
1193                 if ((i = (len&(size_t)-16))) {
1194                     GHASH(ctx,in,i);
1195                     while (len>=16) {
1196                         size_t *out_t=(size_t *)out;
1197                         const size_t *in_t=(const size_t *)in;
1198
1199                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1200                         ++ctr;
1201                         if (is_endian.little)
1202 #ifdef BSWAP4
1203                                 ctx->Yi.d[3] = BSWAP4(ctr);
1204 #else
1205                                 PUTU32(ctx->Yi.c+12,ctr);
1206 #endif
1207                         else
1208                                 ctx->Yi.d[3] = ctr;
1209                         for (i=0; i<16/sizeof(size_t); ++i)
1210                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1211                         out += 16;
1212                         in  += 16;
1213                         len -= 16;
1214                     }
1215                 }
1216 #else
1217                 while (len>=16) {
1218                         size_t *out_t=(size_t *)out;
1219                         const size_t *in_t=(const size_t *)in;
1220
1221                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1222                         ++ctr;
1223                         if (is_endian.little)
1224 #ifdef BSWAP4
1225                                 ctx->Yi.d[3] = BSWAP4(ctr);
1226 #else
1227                                 PUTU32(ctx->Yi.c+12,ctr);
1228 #endif
1229                         else
1230                                 ctx->Yi.d[3] = ctr;
1231                         for (i=0; i<16/sizeof(size_t); ++i) {
1232                                 size_t c = in[i];
1233                                 out[i] = c^ctx->EKi.t[i];
1234                                 ctx->Xi.t[i] ^= c;
1235                         }
1236                         GCM_MUL(ctx,Xi);
1237                         out += 16;
1238                         in  += 16;
1239                         len -= 16;
1240                 }
1241 #endif
1242                 if (len) {
1243                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1244                         ++ctr;
1245                         if (is_endian.little)
1246 #ifdef BSWAP4
1247                                 ctx->Yi.d[3] = BSWAP4(ctr);
1248 #else
1249                                 PUTU32(ctx->Yi.c+12,ctr);
1250 #endif
1251                         else
1252                                 ctx->Yi.d[3] = ctr;
1253                         while (len--) {
1254                                 u8 c = in[n];
1255                                 ctx->Xi.c[n] ^= c;
1256                                 out[n] = c^ctx->EKi.c[n];
1257                                 ++n;
1258                         }
1259                 }
1260
1261                 ctx->mres = n;
1262                 return 0;
1263         } while(0);
1264 #endif
1265         for (i=0;i<len;++i) {
1266                 u8 c;
1267                 if (n==0) {
1268                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1269                         ++ctr;
1270                         if (is_endian.little)
1271 #ifdef BSWAP4
1272                                 ctx->Yi.d[3] = BSWAP4(ctr);
1273 #else
1274                                 PUTU32(ctx->Yi.c+12,ctr);
1275 #endif
1276                         else
1277                                 ctx->Yi.d[3] = ctr;
1278                 }
1279                 c = in[i];
1280                 out[i] = c^ctx->EKi.c[n];
1281                 ctx->Xi.c[n] ^= c;
1282                 n = (n+1)%16;
1283                 if (n==0)
1284                         GCM_MUL(ctx,Xi);
1285         }
1286
1287         ctx->mres = n;
1288         return 0;
1289 }
1290
1291 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1292                 const unsigned char *in, unsigned char *out,
1293                 size_t len, ctr128_f stream)
1294 {
1295         const union { long one; char little; } is_endian = {1};
1296         unsigned int n, ctr;
1297         size_t i;
1298         u64   mlen = ctx->len.u[1];
1299         void *key  = ctx->key;
1300 #ifdef GCM_FUNCREF_4BIT
1301         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1302 # ifdef GHASH
1303         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1304                                 const u8 *inp,size_t len)       = ctx->ghash;
1305 # endif
1306 #endif
1307
1308         mlen += len;
1309         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1310                 return -1;
1311         ctx->len.u[1] = mlen;
1312
1313         if (ctx->ares) {
1314                 /* First call to encrypt finalizes GHASH(AAD) */
1315                 GCM_MUL(ctx,Xi);
1316                 ctx->ares = 0;
1317         }
1318
1319         if (is_endian.little)
1320 #ifdef BSWAP4
1321                 ctr = BSWAP4(ctx->Yi.d[3]);
1322 #else
1323                 ctr = GETU32(ctx->Yi.c+12);
1324 #endif
1325         else
1326                 ctr = ctx->Yi.d[3];
1327
1328         n = ctx->mres;
1329         if (n) {
1330                 while (n && len) {
1331                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1332                         --len;
1333                         n = (n+1)%16;
1334                 }
1335                 if (n==0) GCM_MUL(ctx,Xi);
1336                 else {
1337                         ctx->mres = n;
1338                         return 0;
1339                 }
1340         }
1341 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1342         while (len>=GHASH_CHUNK) {
1343                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1344                 ctr += GHASH_CHUNK/16;
1345                 if (is_endian.little)
1346 #ifdef BSWAP4
1347                         ctx->Yi.d[3] = BSWAP4(ctr);
1348 #else
1349                         PUTU32(ctx->Yi.c+12,ctr);
1350 #endif
1351                 else
1352                         ctx->Yi.d[3] = ctr;
1353                 GHASH(ctx,out,GHASH_CHUNK);
1354                 out += GHASH_CHUNK;
1355                 in  += GHASH_CHUNK;
1356                 len -= GHASH_CHUNK;
1357         }
1358 #endif
1359         if ((i = (len&(size_t)-16))) {
1360                 size_t j=i/16;
1361
1362                 (*stream)(in,out,j,key,ctx->Yi.c);
1363                 ctr += (unsigned int)j;
1364                 if (is_endian.little)
1365 #ifdef BSWAP4
1366                         ctx->Yi.d[3] = BSWAP4(ctr);
1367 #else
1368                         PUTU32(ctx->Yi.c+12,ctr);
1369 #endif
1370                 else
1371                         ctx->Yi.d[3] = ctr;
1372                 in  += i;
1373                 len -= i;
1374 #if defined(GHASH)
1375                 GHASH(ctx,out,i);
1376                 out += i;
1377 #else
1378                 while (j--) {
1379                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1380                         GCM_MUL(ctx,Xi);
1381                         out += 16;
1382                 }
1383 #endif
1384         }
1385         if (len) {
1386                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1387                 ++ctr;
1388                 if (is_endian.little)
1389 #ifdef BSWAP4
1390                         ctx->Yi.d[3] = BSWAP4(ctr);
1391 #else
1392                         PUTU32(ctx->Yi.c+12,ctr);
1393 #endif
1394                 else
1395                         ctx->Yi.d[3] = ctr;
1396                 while (len--) {
1397                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1398                         ++n;
1399                 }
1400         }
1401
1402         ctx->mres = n;
1403         return 0;
1404 }
1405
1406 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1407                 const unsigned char *in, unsigned char *out,
1408                 size_t len,ctr128_f stream)
1409 {
1410         const union { long one; char little; } is_endian = {1};
1411         unsigned int n, ctr;
1412         size_t i;
1413         u64   mlen = ctx->len.u[1];
1414         void *key  = ctx->key;
1415 #ifdef GCM_FUNCREF_4BIT
1416         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1417 # ifdef GHASH
1418         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1419                                 const u8 *inp,size_t len)       = ctx->ghash;
1420 # endif
1421 #endif
1422
1423         mlen += len;
1424         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1425                 return -1;
1426         ctx->len.u[1] = mlen;
1427
1428         if (ctx->ares) {
1429                 /* First call to decrypt finalizes GHASH(AAD) */
1430                 GCM_MUL(ctx,Xi);
1431                 ctx->ares = 0;
1432         }
1433
1434         if (is_endian.little)
1435 #ifdef BSWAP4
1436                 ctr = BSWAP4(ctx->Yi.d[3]);
1437 #else
1438                 ctr = GETU32(ctx->Yi.c+12);
1439 #endif
1440         else
1441                 ctr = ctx->Yi.d[3];
1442
1443         n = ctx->mres;
1444         if (n) {
1445                 while (n && len) {
1446                         u8 c = *(in++);
1447                         *(out++) = c^ctx->EKi.c[n];
1448                         ctx->Xi.c[n] ^= c;
1449                         --len;
1450                         n = (n+1)%16;
1451                 }
1452                 if (n==0) GCM_MUL (ctx,Xi);
1453                 else {
1454                         ctx->mres = n;
1455                         return 0;
1456                 }
1457         }
1458 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1459         while (len>=GHASH_CHUNK) {
1460                 GHASH(ctx,in,GHASH_CHUNK);
1461                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1462                 ctr += GHASH_CHUNK/16;
1463                 if (is_endian.little)
1464 #ifdef BSWAP4
1465                         ctx->Yi.d[3] = BSWAP4(ctr);
1466 #else
1467                         PUTU32(ctx->Yi.c+12,ctr);
1468 #endif
1469                 else
1470                         ctx->Yi.d[3] = ctr;
1471                 out += GHASH_CHUNK;
1472                 in  += GHASH_CHUNK;
1473                 len -= GHASH_CHUNK;
1474         }
1475 #endif
1476         if ((i = (len&(size_t)-16))) {
1477                 size_t j=i/16;
1478
1479 #if defined(GHASH)
1480                 GHASH(ctx,in,i);
1481 #else
1482                 while (j--) {
1483                         size_t k;
1484                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1485                         GCM_MUL(ctx,Xi);
1486                         in += 16;
1487                 }
1488                 j   = i/16;
1489                 in -= i;
1490 #endif
1491                 (*stream)(in,out,j,key,ctx->Yi.c);
1492                 ctr += (unsigned int)j;
1493                 if (is_endian.little)
1494 #ifdef BSWAP4
1495                         ctx->Yi.d[3] = BSWAP4(ctr);
1496 #else
1497                         PUTU32(ctx->Yi.c+12,ctr);
1498 #endif
1499                 else
1500                         ctx->Yi.d[3] = ctr;
1501                 out += i;
1502                 in  += i;
1503                 len -= i;
1504         }
1505         if (len) {
1506                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1507                 ++ctr;
1508                 if (is_endian.little)
1509 #ifdef BSWAP4
1510                         ctx->Yi.d[3] = BSWAP4(ctr);
1511 #else
1512                         PUTU32(ctx->Yi.c+12,ctr);
1513 #endif
1514                 else
1515                         ctx->Yi.d[3] = ctr;
1516                 while (len--) {
1517                         u8 c = in[n];
1518                         ctx->Xi.c[n] ^= c;
1519                         out[n] = c^ctx->EKi.c[n];
1520                         ++n;
1521                 }
1522         }
1523
1524         ctx->mres = n;
1525         return 0;
1526 }
1527
1528 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1529                         size_t len)
1530 {
1531         const union { long one; char little; } is_endian = {1};
1532         u64 alen = ctx->len.u[0]<<3;
1533         u64 clen = ctx->len.u[1]<<3;
1534 #ifdef GCM_FUNCREF_4BIT
1535         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1536 #endif
1537
1538         if (ctx->mres || ctx->ares)
1539                 GCM_MUL(ctx,Xi);
1540
1541         if (is_endian.little) {
1542 #ifdef BSWAP8
1543                 alen = BSWAP8(alen);
1544                 clen = BSWAP8(clen);
1545 #else
1546                 u8 *p = ctx->len.c;
1547
1548                 ctx->len.u[0] = alen;
1549                 ctx->len.u[1] = clen;
1550
1551                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1552                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1553 #endif
1554         }
1555
1556         ctx->Xi.u[0] ^= alen;
1557         ctx->Xi.u[1] ^= clen;
1558         GCM_MUL(ctx,Xi);
1559
1560         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1561         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1562
1563         if (tag && len<=sizeof(ctx->Xi))
1564                 return memcmp(ctx->Xi.c,tag,len);
1565         else
1566                 return -1;
1567 }
1568
1569 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1570 {
1571         CRYPTO_gcm128_finish(ctx, NULL, 0);
1572         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1573 }
1574
1575 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1576 {
1577         GCM128_CONTEXT *ret;
1578
1579         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1580                 CRYPTO_gcm128_init(ret,key,block);
1581
1582         return ret;
1583 }
1584
1585 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1586 {
1587         if (ctx) {
1588                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1589                 OPENSSL_free(ctx);
1590         }
1591 }
1592
1593 #if defined(SELFTEST)
1594 #include <stdio.h>
1595 #include <openssl/aes.h>
1596
1597 /* Test Case 1 */
1598 static const u8 K1[16],
1599                 *P1=NULL,
1600                 *A1=NULL,
1601                 IV1[12],
1602                 *C1=NULL,
1603                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1604
1605 /* Test Case 2 */
1606 #define K2 K1
1607 #define A2 A1
1608 #define IV2 IV1
1609 static const u8 P2[16],
1610                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1611                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1612
1613 /* Test Case 3 */
1614 #define A3 A2
1615 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1616                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1617                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1618                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1619                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1620                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1621                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1622                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1623                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1624                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1625                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1626
1627 /* Test Case 4 */
1628 #define K4 K3
1629 #define IV4 IV3
1630 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1631                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1632                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1633                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1634                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1635                         0xab,0xad,0xda,0xd2},
1636                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1637                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1638                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1639                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1640                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1641
1642 /* Test Case 5 */
1643 #define K5 K4
1644 #define P5 P4
1645 #define A5 A4
1646 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1647                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1648                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1649                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1650                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1651                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1652
1653 /* Test Case 6 */
1654 #define K6 K5
1655 #define P6 P5
1656 #define A6 A5
1657 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1658                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1659                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1660                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1661                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1662                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1663                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1664                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1665                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1666
1667 /* Test Case 7 */
1668 static const u8 K7[24],
1669                 *P7=NULL,
1670                 *A7=NULL,
1671                 IV7[12],
1672                 *C7=NULL,
1673                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1674
1675 /* Test Case 8 */
1676 #define K8 K7
1677 #define IV8 IV7
1678 #define A8 A7
1679 static const u8 P8[16],
1680                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1681                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1682
1683 /* Test Case 9 */
1684 #define A9 A8
1685 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1686                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1687                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1688                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1689                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1690                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1691                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1692                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1693                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1694                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1695                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1696                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1697
1698 /* Test Case 10 */
1699 #define K10 K9
1700 #define IV10 IV9
1701 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1702                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1703                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1704                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1705                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1706                         0xab,0xad,0xda,0xd2},
1707                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1708                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1709                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1710                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1711                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1712
1713 /* Test Case 11 */
1714 #define K11 K10
1715 #define P11 P10
1716 #define A11 A10
1717 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1718                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1719                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1720                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1721                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1722                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1723
1724 /* Test Case 12 */
1725 #define K12 K11
1726 #define P12 P11
1727 #define A12 A11
1728 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1729                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1730                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1731                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1732                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1733                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1734                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1735                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1736                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1737
1738 /* Test Case 13 */
1739 static const u8 K13[32],
1740                 *P13=NULL,
1741                 *A13=NULL,
1742                 IV13[12],
1743                 *C13=NULL,
1744                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1745
1746 /* Test Case 14 */
1747 #define K14 K13
1748 #define A14 A13
1749 static const u8 P14[16],
1750                 IV14[12],
1751                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1752                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1753
1754 /* Test Case 15 */
1755 #define A15 A14
1756 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1757                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1758                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1759                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1760                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1761                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1762                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1763                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1764                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1765                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1766                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1767                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1768
1769 /* Test Case 16 */
1770 #define K16 K15
1771 #define IV16 IV15
1772 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1773                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1774                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1775                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1776                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1777                         0xab,0xad,0xda,0xd2},
1778                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1779                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1780                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1781                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1782                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1783
1784 /* Test Case 17 */
1785 #define K17 K16
1786 #define P17 P16
1787 #define A17 A16
1788 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1789                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1790                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1791                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1792                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1793                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1794
1795 /* Test Case 18 */
1796 #define K18 K17
1797 #define P18 P17
1798 #define A18 A17
1799 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1800                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1801                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1802                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1803                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1804                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1805                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1806                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1807                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1808
1809 /* Test Case 19 */
1810 #define K19 K1
1811 #define P19 P1
1812 #define IV19 IV1
1813 #define C19 C1
1814 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1815                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1816                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1817                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1818                         0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1819                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1820                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1821                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1822                 T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1823
1824 /* Test Case 20 */
1825 #define K20 K1
1826 #define A20 A1
1827 static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
1828                 P20[288],
1829                 C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
1830                         0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
1831                         0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
1832                         0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
1833                         0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
1834                         0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
1835                         0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
1836                         0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
1837                         0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
1838                         0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
1839                         0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
1840                         0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
1841                         0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
1842                         0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
1843                         0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
1844                         0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
1845                         0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
1846                         0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
1847                 T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1848
/*
 * TEST_CASE(n): run known-answer vector n through the GCM code in both
 * directions.  Expands in a scope that provides `ctx` (GCM128_CONTEXT),
 * `key` (AES_KEY) and the failure counter `ret`.
 *
 *  - The AES key schedule is built from K<n>; the key size in bits is
 *    sizeof(K<n>)*8.
 *  - Encrypt pass: set IV<n>, feed A<n> and P<n> when they are non-NULL,
 *    then call CRYPTO_gcm128_finish() with T<n>; the output is compared
 *    with C<n> when C<n> is non-NULL.
 *  - Decrypt pass: same sequence with C<n> as input and P<n> as the
 *    expected output.
 *
 * A non-zero return from CRYPTO_gcm128_finish() or a data mismatch bumps
 * `ret` and prints a message.  `out` is sized from sizeof(P<n>); for
 * vectors where P<n> is a NULL pointer that is just pointer-sized scratch
 * space which is never compared.
 */
#define TEST_CASE(n)    do {                                            \
        u8 out[sizeof(P##n)];                                           \
                                                                        \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
                                                                        \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        memset(out,0,sizeof(out));                                      \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (P##n) {                                                     \
                CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));       \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (C##n && memcmp(out,C##n,sizeof(out)))) {                   \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n",n);                 \
        }                                                               \
                                                                        \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        memset(out,0,sizeof(out));                                      \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (C##n) {                                                     \
                CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));       \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (P##n && memcmp(out,P##n,sizeof(out)))) {                   \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n",n);                 \
        }                                                               \
        } while(0)
1868
1869 int main()
1870 {
1871         GCM128_CONTEXT ctx;
1872         AES_KEY key;
1873         int ret=0;
1874
1875         TEST_CASE(1);
1876         TEST_CASE(2);
1877         TEST_CASE(3);
1878         TEST_CASE(4);
1879         TEST_CASE(5);
1880         TEST_CASE(6);
1881         TEST_CASE(7);
1882         TEST_CASE(8);
1883         TEST_CASE(9);
1884         TEST_CASE(10);
1885         TEST_CASE(11);
1886         TEST_CASE(12);
1887         TEST_CASE(13);
1888         TEST_CASE(14);
1889         TEST_CASE(15);
1890         TEST_CASE(16);
1891         TEST_CASE(17);
1892         TEST_CASE(18);
1893         TEST_CASE(19);
1894         TEST_CASE(20);
1895
1896 #ifdef OPENSSL_CPUID_OBJ
1897         {
1898         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1899         union { u64 u; u8 c[1024]; } buf;
1900         int i;
1901
1902         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1903         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1904         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1905
1906         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1907         start = OPENSSL_rdtsc();
1908         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1909         gcm_t = OPENSSL_rdtsc() - start;
1910
1911         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1912                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1913                         (block128_f)AES_encrypt);
1914         start = OPENSSL_rdtsc();
1915         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1916                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1917                         (block128_f)AES_encrypt);
1918         ctr_t = OPENSSL_rdtsc() - start;
1919
1920         printf("%.2f-%.2f=%.2f\n",
1921                         gcm_t/(double)sizeof(buf),
1922                         ctr_t/(double)sizeof(buf),
1923                         (gcm_t-ctr_t)/(double)sizeof(buf));
1924 #ifdef GHASH
1925         {
1926         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1927                                 const u8 *inp,size_t len)       = ctx.ghash;
1928
1929         GHASH((&ctx),buf.c,sizeof(buf));
1930         start = OPENSSL_rdtsc();
1931         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1932         gcm_t = OPENSSL_rdtsc() - start;
1933         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1934         }
1935 #endif
1936         }
1937 #endif
1938
1939         return ret;
1940 }
1941 #endif