ae5fab1b46c7d36aff57be9dc2164e8fa368e88c
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef  GETU32
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

/*
 * PACK positions a 16-bit reduction constant in the most significant
 * 16 bits of a size_t, so the rem_4bit/rem_8bit tables below can be
 * XORed into the top of Z.hi on both 32- and 64-bit size_t (the
 * 32-bit case adds a further <<32 at the use site).
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT shifts the 128-bit value V right by one bit (the bit
 * falling out of V.hi is carried into the top of V.lo) and, when the
 * bit shifted out of V.lo was set, folds the GCM reduction constant
 * (0xE1 in the top byte) into V.hi.  The sizeof(size_t) test merely
 * selects constants suited to the native word size; both branches
 * compute the same result.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough
 *   free results in VM working set trimming, meaning that a
 *   subsequent malloc would immediately incur working set
 *   expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * One GHASH multiplication, 8 bits at a time: Xi = Xi * H, using the
 * 256-entry Htable produced by gcm_init_8bit().  Xi is held in
 * big-endian (GCM wire) byte order; the result is written back in
 * the same order.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from last byte to first */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        __fips_constseg
        /*
         * rem_8bit[b] is the reduction constant for the 8 bits that
         * fall off the low end of Z in one step; entries are
         * pre-positioned in the top 16 bits of a size_t by PACK().
         */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                /* accumulate the table entry for the current byte of Xi */
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* all 16 bytes consumed */

                n = *(--xi);

                /* shift Z right by one byte and fold the dropped byte back
                 * in via the precomputed reduction table */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* store Z back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * Populate the 16-entry multiplication table for the "4-bit"
 * (Shoup-style) GHASH variant: Htable[n] = n*H in GF(2^128) for each
 * nibble value n.  The small-footprint build uses loops; the default
 * build unrolls the same computation.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* powers of two: Htable[8]=H, each halving multiplies by x */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        /* remaining entries: Htable[i+j] = Htable[i] ^ Htable[j], j<i */
        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* unrolled version of the same: powers of two first... */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        /* ...then XOR combinations of those powers */
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                /* little-endian: swap the hi/lo dwords of each entry */
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                /* big-endian: rotate each dword by 32 bits */
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
328
#ifndef GHASH_ASM
/*
 * rem_4bit[r] is the reduction constant for the 4 bits shifted out of
 * the low end of Z during one step of the "4-bit" multiplication;
 * values are pre-positioned in the top 16 bits of a size_t by PACK().
 */
__fips_constseg
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
338 {
339         u128 Z;
340         int cnt = 15;
341         size_t rem, nlo, nhi;
342         const union { long one; char little; } is_endian = {1};
343
344         nlo  = ((const u8 *)Xi)[15];
345         nhi  = nlo>>4;
346         nlo &= 0xf;
347
348         Z.hi = Htable[nlo].hi;
349         Z.lo = Htable[nlo].lo;
350
351         while (1) {
352                 rem  = (size_t)Z.lo&0xf;
353                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
354                 Z.hi = (Z.hi>>4);
355                 if (sizeof(size_t)==8)
356                         Z.hi ^= rem_4bit[rem];
357                 else
358                         Z.hi ^= (u64)rem_4bit[rem]<<32;
359
360                 Z.hi ^= Htable[nhi].hi;
361                 Z.lo ^= Htable[nhi].lo;
362
363                 if (--cnt<0)            break;
364
365                 nlo  = ((const u8 *)Xi)[cnt];
366                 nhi  = nlo>>4;
367                 nlo &= 0xf;
368
369                 rem  = (size_t)Z.lo&0xf;
370                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
371                 Z.hi = (Z.hi>>4);
372                 if (sizeof(size_t)==8)
373                         Z.hi ^= rem_4bit[rem];
374                 else
375                         Z.hi ^= (u64)rem_4bit[rem]<<32;
376
377                 Z.hi ^= Htable[nlo].hi;
378                 Z.lo ^= Htable[nlo].lo;
379         }
380
381         if (is_endian.little) {
382 #ifdef BSWAP8
383                 Xi[0] = BSWAP8(Z.hi);
384                 Xi[1] = BSWAP8(Z.lo);
385 #else
386                 u8 *p = (u8 *)Xi;
387                 u32 v;
388                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
389                 v = (u32)(Z.hi);        PUTU32(p+4,v);
390                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
391                 v = (u32)(Z.lo);        PUTU32(p+12,v);
392 #endif
393         }
394         else {
395                 Xi[0] = Z.hi;
396                 Xi[1] = Z.lo;
397         }
398 }
399
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 *
 * Hashes len bytes of inp into Xi: for each 16-byte block it XORs the
 * block into Xi and performs one 4-bit-per-step multiplication by H.
 * NOTE: the "#else" branch below is an alternative variant that takes
 * 8 bits per step using extra per-key tables; it is compiled out by
 * the "#if 1" and kept for reference only.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        cnt  = 15;
        /* fold the current input byte into Xi before multiplying */
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right a nibble, reduce, add high-nibble entry */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                /* shift, reduce, add low-nibble entry */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    __fips_constseg
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* write the updated hash back to Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
#endif
#else
/* GHASH_ASM: the 4-bit routines are supplied by assembler modules. */
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride" parameter intended to mitigate cache-
 * trashing effects. In other words, the idea is to hash data while
 * it is still in the L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif

/*
 * Prototypes and capability flags for the platform-specific assembler
 * GHASH implementations; which set is declared depends on the target
 * architecture detected below.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#if defined(__i386) || defined(__i386__)
/* no separate AVX code path on 32-bit x86; alias to the CLMUL one */
# define gcm_init_avx   gcm_init_clmul
# define gcm_gmult_avx  gcm_gmult_clmul
# define gcm_ghash_avx  gcm_ghash_clmul
#else
void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
#endif

#ifdef GCM_FUNCREF_4BIT
/* When function pointers are used, GCM_MUL/GHASH dispatch through
 * local gcm_gmult_p/gcm_ghash_p variables set from ctx by callers. */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
706
/*
 * Initialize a GCM128 context for the given block cipher: zero the
 * context, record the key and block function, derive the hash subkey
 * H by encrypting the all-zero block (the context was just memset to
 * zero), convert H to host byte order, and select/initialize the
 * best available GHASH implementation for this build and CPU.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E_K(0^128): ctx->H.c is all zeros after the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
                        gcm_init_avx(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_avx;
                        ctx->ghash = gcm_ghash_avx;
                } else {
                        gcm_init_clmul(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_clmul;
                        ctx->ghash = gcm_ghash_clmul;
                }
                return;
        }
#  endif
        /* no carry-less multiply support: fall back to the 4-bit table */
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_SPARC)
        if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
                gcm_init_vis3(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_vis3;
                ctx->ghash = gcm_ghash_vis3;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
792
/*
 * Set the IV and reset per-message state (lengths, GHASH accumulator,
 * residues).  A 96-bit IV is used directly: Y0 = IV || 0^31 || 1.
 * Any other length is hashed: Y0 = GHASH(IV padded to 16 bytes, then
 * a block carrying the IV bit length).  Also precomputes
 * EK0 = E_K(Y0) (XORed into the tag by CRYPTO_gcm128_finish) and
 * leaves the counter block advanced to Y1.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
#endif

        ctx->Yi.u[0]  = 0;
        ctx->Yi.u[1]  = 0;
        ctx->Xi.u[0]  = 0;
        ctx->Xi.u[1]  = 0;
        ctx->len.u[0] = 0;      /* AAD length */
        ctx->len.u[1] = 0;      /* message length */
        ctx->ares = 0;
        ctx->mres = 0;

        if (len==12) {
                /* 96-bit IV: counter block is IV || 0^31 || 1 */
                memcpy(ctx->Yi.c,iv,12);
                ctx->Yi.c[15]=1;
                ctr=1;
        }
        else {
                size_t i;
                u64 len0 = len;

                /* GHASH the IV in 16-byte blocks */
                while (len>=16) {
                        for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
                        GCM_MUL(ctx,Yi);
                        iv += 16;
                        len -= 16;
                }
                if (len) {
                        for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
                        GCM_MUL(ctx,Yi);
                }
                /* fold in the IV length in bits as the final GHASH block */
                len0 <<= 3;
                if (is_endian.little) {
#ifdef BSWAP8
                        ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
                        ctx->Yi.c[8]  ^= (u8)(len0>>56);
                        ctx->Yi.c[9]  ^= (u8)(len0>>48);
                        ctx->Yi.c[10] ^= (u8)(len0>>40);
                        ctx->Yi.c[11] ^= (u8)(len0>>32);
                        ctx->Yi.c[12] ^= (u8)(len0>>24);
                        ctx->Yi.c[13] ^= (u8)(len0>>16);
                        ctx->Yi.c[14] ^= (u8)(len0>>8);
                        ctx->Yi.c[15] ^= (u8)(len0);
#endif
                }
                else
                        ctx->Yi.u[1]  ^= len0;

                GCM_MUL(ctx,Yi);

                if (is_endian.little)
                        ctr = GETU32(ctx->Yi.c+12);
                else
                        ctr = ctx->Yi.d[3];
        }

        /* EK0 = E_K(Y0); used to mask the tag in CRYPTO_gcm128_finish */
        (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
        ++ctr;
        if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
        else
                ctx->Yi.d[3] = ctr;
}
862
/*
 * Absorb |len| bytes of additional authenticated data into the GHASH
 * accumulator (Xi).  All AAD must be supplied before the first
 * encrypt/decrypt call.  Returns 0 on success, -2 if message data has
 * already been processed, -1 if the accumulated AAD exceeds 2^61
 * bytes (2^64 bits).  ctx->ares carries the partial-block residue
 * between calls.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
        size_t i;
        unsigned int n;
        u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        /* message bytes already processed: too late for AAD */
        if (ctx->len.u[1]) return -2;

        alen += len;
        /* second test catches 64-bit wrap-around of alen */
        if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
                return -1;
        ctx->len.u[0] = alen;

        n = ctx->ares;
        if (n) {
                /* top up a partially filled block from a previous call */
                while (n && len) {
                        ctx->Xi.c[n] ^= *(aad++);
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL(ctx,Xi);
                else {
                        ctx->ares = n;
                        return 0;
                }
        }

#ifdef GHASH
        if ((i = (len&(size_t)-16))) {
                GHASH(ctx,aad,i);
                aad += i;
                len -= i;
        }
#else
        while (len>=16) {
                for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
                GCM_MUL(ctx,Xi);
                aad += 16;
                len -= 16;
        }
#endif
        if (len) {
                /* stash trailing partial block in Xi; hashed later */
                n = (unsigned int)len;
                for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
        }

        ctx->ares = n;
        return 0;
}
919
/*
 * CTR-mode encrypt |len| bytes from |in| to |out|, folding the
 * resulting ciphertext into the GHASH accumulator (Xi).  May be
 * called repeatedly on consecutive chunks; ctx->mres carries the
 * key-stream residue of a partial block between calls.  Returns 0 on
 * success, -1 if the total message length exceeds the GCM limit of
 * 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64        mlen  = ctx->len.u[1];
        block128_f block = ctx->block;
        void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

#if 0
        n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
        mlen += len;
        /* second test catches 64-bit wrap-around of mlen */
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to encrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        if (is_endian.little)
                ctr = GETU32(ctx->Yi.c+12);
        else
                ctr = ctx->Yi.d[3];

        n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
        if (16%sizeof(size_t) == 0) do {        /* always true actually */
                if (n) {
                        /* consume leftover key stream from previous call */
                        while (n && len) {
                                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                                --len;
                                n = (n+1)%16;
                        }
                        if (n==0) GCM_MUL(ctx,Xi);
                        else {
                                ctx->mres = n;
                                return 0;
                        }
                }
#if defined(STRICT_ALIGNMENT)
                /* unaligned buffers: fall back to the byte-wise loop below */
                if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
                        break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
                /* encrypt GHASH_CHUNK bytes, then hash them in one sweep */
                while (len>=GHASH_CHUNK) {
                    size_t j=GHASH_CHUNK;

                    while (j) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        j   -= 16;
                    }
                    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
                    len -= GHASH_CHUNK;
                }
                if ((i = (len&(size_t)-16))) {
                    size_t j=i;

                    while (len>=16) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        len -= 16;
                    }
                    GHASH(ctx,out-j,j);
                }
#else
                /* no bulk GHASH: hash each block right after encryption */
                while (len>=16) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                ctx->Xi.t[i] ^=
                                out_t[i] = in_t[i]^ctx->EKi.t[i];
                        GCM_MUL(ctx,Xi);
                        out += 16;
                        in  += 16;
                        len -= 16;
                }
#endif
                if (len) {
                        /* trailing partial block; residue recorded in mres */
                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        while (len--) {
                                ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                                ++n;
                        }
                }

                ctx->mres = n;
                return 0;
        } while(0);
#endif
        /* byte-wise fallback (small footprint or unaligned buffers) */
        for (i=0;i<len;++i) {
                if (n==0) {
                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                }
                ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
                n = (n+1)%16;
                if (n==0)
                        GCM_MUL(ctx,Xi);
        }

        ctx->mres = n;
        return 0;
}
1075
1076 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1077                 const unsigned char *in, unsigned char *out,
1078                 size_t len)
1079 {
1080         const union { long one; char little; } is_endian = {1};
1081         unsigned int n, ctr;
1082         size_t i;
1083         u64        mlen  = ctx->len.u[1];
1084         block128_f block = ctx->block;
1085         void      *key   = ctx->key;
1086 #ifdef GCM_FUNCREF_4BIT
1087         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1088 # ifdef GHASH
1089         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1090                                 const u8 *inp,size_t len)       = ctx->ghash;
1091 # endif
1092 #endif
1093
1094         mlen += len;
1095         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1096                 return -1;
1097         ctx->len.u[1] = mlen;
1098
1099         if (ctx->ares) {
1100                 /* First call to decrypt finalizes GHASH(AAD) */
1101                 GCM_MUL(ctx,Xi);
1102                 ctx->ares = 0;
1103         }
1104
1105         if (is_endian.little)
1106                 ctr = GETU32(ctx->Yi.c+12);
1107         else
1108                 ctr = ctx->Yi.d[3];
1109
1110         n = ctx->mres;
1111 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1112         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1113                 if (n) {
1114                         while (n && len) {
1115                                 u8 c = *(in++);
1116                                 *(out++) = c^ctx->EKi.c[n];
1117                                 ctx->Xi.c[n] ^= c;
1118                                 --len;
1119                                 n = (n+1)%16;
1120                         }
1121                         if (n==0) GCM_MUL (ctx,Xi);
1122                         else {
1123                                 ctx->mres = n;
1124                                 return 0;
1125                         }
1126                 }
1127 #if defined(STRICT_ALIGNMENT)
1128                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1129                         break;
1130 #endif
1131 #if defined(GHASH) && defined(GHASH_CHUNK)
1132                 while (len>=GHASH_CHUNK) {
1133                     size_t j=GHASH_CHUNK;
1134
1135                     GHASH(ctx,in,GHASH_CHUNK);
1136                     while (j) {
1137                         size_t *out_t=(size_t *)out;
1138                         const size_t *in_t=(const size_t *)in;
1139
1140                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1141                         ++ctr;
1142                         if (is_endian.little)
1143                                 PUTU32(ctx->Yi.c+12,ctr);
1144                         else
1145                                 ctx->Yi.d[3] = ctr;
1146                         for (i=0; i<16/sizeof(size_t); ++i)
1147                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1148                         out += 16;
1149                         in  += 16;
1150                         j   -= 16;
1151                     }
1152                     len -= GHASH_CHUNK;
1153                 }
1154                 if ((i = (len&(size_t)-16))) {
1155                     GHASH(ctx,in,i);
1156                     while (len>=16) {
1157                         size_t *out_t=(size_t *)out;
1158                         const size_t *in_t=(const size_t *)in;
1159
1160                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1161                         ++ctr;
1162                         if (is_endian.little)
1163                                 PUTU32(ctx->Yi.c+12,ctr);
1164                         else
1165                                 ctx->Yi.d[3] = ctr;
1166                         for (i=0; i<16/sizeof(size_t); ++i)
1167                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1168                         out += 16;
1169                         in  += 16;
1170                         len -= 16;
1171                     }
1172                 }
1173 #else
1174                 while (len>=16) {
1175                         size_t *out_t=(size_t *)out;
1176                         const size_t *in_t=(const size_t *)in;
1177
1178                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1179                         ++ctr;
1180                         if (is_endian.little)
1181                                 PUTU32(ctx->Yi.c+12,ctr);
1182                         else
1183                                 ctx->Yi.d[3] = ctr;
1184                         for (i=0; i<16/sizeof(size_t); ++i) {
1185                                 size_t c = in[i];
1186                                 out[i] = c^ctx->EKi.t[i];
1187                                 ctx->Xi.t[i] ^= c;
1188                         }
1189                         GCM_MUL(ctx,Xi);
1190                         out += 16;
1191                         in  += 16;
1192                         len -= 16;
1193                 }
1194 #endif
1195                 if (len) {
1196                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1197                         ++ctr;
1198                         if (is_endian.little)
1199                                 PUTU32(ctx->Yi.c+12,ctr);
1200                         else
1201                                 ctx->Yi.d[3] = ctr;
1202                         while (len--) {
1203                                 u8 c = in[n];
1204                                 ctx->Xi.c[n] ^= c;
1205                                 out[n] = c^ctx->EKi.c[n];
1206                                 ++n;
1207                         }
1208                 }
1209
1210                 ctx->mres = n;
1211                 return 0;
1212         } while(0);
1213 #endif
1214         for (i=0;i<len;++i) {
1215                 u8 c;
1216                 if (n==0) {
1217                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1218                         ++ctr;
1219                         if (is_endian.little)
1220                                 PUTU32(ctx->Yi.c+12,ctr);
1221                         else
1222                                 ctx->Yi.d[3] = ctr;
1223                 }
1224                 c = in[i];
1225                 out[i] = c^ctx->EKi.c[n];
1226                 ctx->Xi.c[n] ^= c;
1227                 n = (n+1)%16;
1228                 if (n==0)
1229                         GCM_MUL(ctx,Xi);
1230         }
1231
1232         ctx->mres = n;
1233         return 0;
1234 }
1235
/*
 * Like CRYPTO_gcm128_encrypt, but whole blocks are encrypted with the
 * caller-supplied counter-mode routine |stream|; ctx->block is only
 * used for a final partial block.  Returns 0 on success, -1 if the
 * total message length exceeds the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len, ctr128_f stream)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64   mlen = ctx->len.u[1];
        void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        mlen += len;
        /* second test catches 64-bit wrap-around of mlen */
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to encrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        if (is_endian.little)
                ctr = GETU32(ctx->Yi.c+12);
        else
                ctr = ctx->Yi.d[3];

        n = ctx->mres;
        if (n) {
                /* consume leftover key stream from previous call */
                while (n && len) {
                        ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL(ctx,Xi);
                else {
                        ctx->mres = n;
                        return 0;
                }
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        /* bulk: encrypt a chunk with |stream|, then hash the ciphertext */
        while (len>=GHASH_CHUNK) {
                (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                GHASH(ctx,out,GHASH_CHUNK);
                out += GHASH_CHUNK;
                in  += GHASH_CHUNK;
                len -= GHASH_CHUNK;
        }
#endif
        if ((i = (len&(size_t)-16))) {
                size_t j=i/16;

                (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                in  += i;
                len -= i;
#if defined(GHASH)
                GHASH(ctx,out,i);
                out += i;
#else
                while (j--) {
                        for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
                        GCM_MUL(ctx,Xi);
                        out += 16;
                }
#endif
        }
        if (len) {
                /* final partial block via the plain block cipher */
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
                        ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                        ++n;
                }
        }

        ctx->mres = n;
        return 0;
}
1334
/*
 * Like CRYPTO_gcm128_decrypt, but whole blocks are decrypted with the
 * caller-supplied counter-mode routine |stream|; ctx->block is only
 * used for a final partial block.  Ciphertext is hashed before it is
 * decrypted.  Returns 0 on success, -1 if the total message length
 * exceeds the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len,ctr128_f stream)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64   mlen = ctx->len.u[1];
        void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        mlen += len;
        /* second test catches 64-bit wrap-around of mlen */
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to decrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        if (is_endian.little)
                ctr = GETU32(ctx->Yi.c+12);
        else
                ctr = ctx->Yi.d[3];

        n = ctx->mres;
        if (n) {
                /* consume leftover key stream from previous call */
                while (n && len) {
                        u8 c = *(in++);
                        *(out++) = c^ctx->EKi.c[n];
                        ctx->Xi.c[n] ^= c;
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL (ctx,Xi);
                else {
                        ctx->mres = n;
                        return 0;
                }
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        /* bulk: hash a chunk of ciphertext, then decrypt it with |stream| */
        while (len>=GHASH_CHUNK) {
                GHASH(ctx,in,GHASH_CHUNK);
                (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                out += GHASH_CHUNK;
                in  += GHASH_CHUNK;
                len -= GHASH_CHUNK;
        }
#endif
        if ((i = (len&(size_t)-16))) {
                size_t j=i/16;

#if defined(GHASH)
                GHASH(ctx,in,i);
#else
                while (j--) {
                        size_t k;
                        for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
                        GCM_MUL(ctx,Xi);
                        in += 16;
                }
                /* rewind: |stream| below still needs the ciphertext */
                j   = i/16;
                in -= i;
#endif
                (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                out += i;
                in  += i;
                len -= i;
        }
        if (len) {
                /* final partial block via the plain block cipher */
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
                        PUTU32(ctx->Yi.c+12,ctr);
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
                        u8 c = in[n];
                        ctx->Xi.c[n] ^= c;
                        out[n] = c^ctx->EKi.c[n];
                        ++n;
                }
        }

        ctx->mres = n;
        return 0;
}
1440
1441 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1442                         size_t len)
1443 {
1444         const union { long one; char little; } is_endian = {1};
1445         u64 alen = ctx->len.u[0]<<3;
1446         u64 clen = ctx->len.u[1]<<3;
1447 #ifdef GCM_FUNCREF_4BIT
1448         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1449 #endif
1450
1451         if (ctx->mres || ctx->ares)
1452                 GCM_MUL(ctx,Xi);
1453
1454         if (is_endian.little) {
1455 #ifdef BSWAP8
1456                 alen = BSWAP8(alen);
1457                 clen = BSWAP8(clen);
1458 #else
1459                 u8 *p = ctx->len.c;
1460
1461                 ctx->len.u[0] = alen;
1462                 ctx->len.u[1] = clen;
1463
1464                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1465                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1466 #endif
1467         }
1468
1469         ctx->Xi.u[0] ^= alen;
1470         ctx->Xi.u[1] ^= clen;
1471         GCM_MUL(ctx,Xi);
1472
1473         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1474         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1475
1476         if (tag && len<=sizeof(ctx->Xi))
1477                 return memcmp(ctx->Xi.c,tag,len);
1478         else
1479                 return -1;
1480 }
1481
1482 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1483 {
1484         CRYPTO_gcm128_finish(ctx, NULL, 0);
1485         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1486 }
1487
1488 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1489 {
1490         GCM128_CONTEXT *ret;
1491
1492         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1493                 CRYPTO_gcm128_init(ret,key,block);
1494
1495         return ret;
1496 }
1497
1498 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1499 {
1500         if (ctx) {
1501                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1502                 OPENSSL_free(ctx);
1503         }
1504 }
1505
1506 #if defined(SELFTEST)
1507 #include <stdio.h>
1508 #include <openssl/aes.h>
1509
1510 /* Test Case 1 */
1511 static const u8 K1[16],
1512                 *P1=NULL,
1513                 *A1=NULL,
1514                 IV1[12],
1515                 *C1=NULL,
1516                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1517
1518 /* Test Case 2 */
1519 #define K2 K1
1520 #define A2 A1
1521 #define IV2 IV1
1522 static const u8 P2[16],
1523                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1524                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1525
1526 /* Test Case 3 */
1527 #define A3 A2
1528 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1529                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1530                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1531                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1532                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1533                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1534                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1535                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1536                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1537                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1538                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1539
1540 /* Test Case 4 */
1541 #define K4 K3
1542 #define IV4 IV3
1543 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1544                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1545                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1546                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1547                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1548                         0xab,0xad,0xda,0xd2},
1549                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1550                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1551                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1552                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1553                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1554
1555 /* Test Case 5 */
1556 #define K5 K4
1557 #define P5 P4
1558 #define A5 A4
1559 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1560                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1561                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1562                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1563                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1564                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1565
/* Test Case 6: same key/plaintext/AAD as #5, with a long 60-byte IV */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
                        0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
                        0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
                        0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
                T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1579
/*
 * Test Case 7: 192-bit all-zero key (static storage is zero-initialized),
 * zero 12-byte IV, empty plaintext and AAD — checks the tag alone.
 * NULL pointers make TEST_CASE() skip the corresponding step.
 */
static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
                *C7=NULL,
                T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8: one all-zero 16-byte block under the same all-zero 192-bit key */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
                C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
                T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1595
/* Test Case 9: 192-bit key, 96-bit IV, 64-byte plaintext, no AAD (A9 is NULL via A8/A7) */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1610
/* Test Case 10: 192-bit key (as #9), 60-byte plaintext, 20-byte AAD */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1625
/* Test Case 11: same key/plaintext/AAD as #10, short 8-byte IV */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1636
/* Test Case 12: same key/plaintext/AAD as #11, long 60-byte IV */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1650
/*
 * Test Case 13: 256-bit all-zero key (static storage is zero-initialized),
 * zero 12-byte IV, empty plaintext and AAD — checks the tag alone.
 */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14: one all-zero 16-byte block under the same all-zero 256-bit key */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1666
/* Test Case 15: 256-bit key, 96-bit IV, 64-byte plaintext, no AAD (A15 is NULL via A14/A13) */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1681
/* Test Case 16: 256-bit key (as #15), 60-byte plaintext, 20-byte AAD */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1696
/* Test Case 17: same key/plaintext/AAD as #16, short 8-byte IV */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1707
/* Test Case 18: same key/plaintext/AAD as #17, long 60-byte IV */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1721
/* Test Case 19: same key/plaintext/IV/ciphertext as #1, with 128 bytes of AAD */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
                        0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1736
/* Test Case 20: 64-byte IV (non-96-bit IV path) and 288-byte all-zero plaintext */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
                P20[288],
                C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
                        0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
                        0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
                        0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
                        0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
                        0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
                        0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
                        0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
                        0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
                        0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
                        0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
                        0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
                        0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
                        0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
                        0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
                        0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
                        0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
                        0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
                T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1761
/*
 * Run GCM test vector #n through a full encrypt pass and a full decrypt
 * pass.  Expects `ctx` (GCM128_CONTEXT), `key` (AES_KEY) and `ret`
 * (failure counter) to be in scope in the caller.  K##n, IV##n and T##n
 * are always arrays; P##n, A##n and C##n may be NULL pointers for
 * vectors with empty plaintext/AAD, in which case the corresponding
 * step is skipped (for NULL P##n, `out` is sized by the pointer and
 * never compared).  On a tag mismatch or data mismatch, `ret` is
 * incremented and a diagnostic line is printed.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1781
1782 int main()
1783 {
1784         GCM128_CONTEXT ctx;
1785         AES_KEY key;
1786         int ret=0;
1787
1788         TEST_CASE(1);
1789         TEST_CASE(2);
1790         TEST_CASE(3);
1791         TEST_CASE(4);
1792         TEST_CASE(5);
1793         TEST_CASE(6);
1794         TEST_CASE(7);
1795         TEST_CASE(8);
1796         TEST_CASE(9);
1797         TEST_CASE(10);
1798         TEST_CASE(11);
1799         TEST_CASE(12);
1800         TEST_CASE(13);
1801         TEST_CASE(14);
1802         TEST_CASE(15);
1803         TEST_CASE(16);
1804         TEST_CASE(17);
1805         TEST_CASE(18);
1806         TEST_CASE(19);
1807         TEST_CASE(20);
1808
1809 #ifdef OPENSSL_CPUID_OBJ
1810         {
1811         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1812         union { u64 u; u8 c[1024]; } buf;
1813         int i;
1814
1815         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1816         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1817         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1818
1819         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1820         start = OPENSSL_rdtsc();
1821         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1822         gcm_t = OPENSSL_rdtsc() - start;
1823
1824         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1825                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1826                         (block128_f)AES_encrypt);
1827         start = OPENSSL_rdtsc();
1828         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1829                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1830                         (block128_f)AES_encrypt);
1831         ctr_t = OPENSSL_rdtsc() - start;
1832
1833         printf("%.2f-%.2f=%.2f\n",
1834                         gcm_t/(double)sizeof(buf),
1835                         ctr_t/(double)sizeof(buf),
1836                         (gcm_t-ctr_t)/(double)sizeof(buf));
1837 #ifdef GHASH
1838         {
1839         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1840                                 const u8 *inp,size_t len)       = ctx.ghash;
1841
1842         GHASH((&ctx),buf.c,sizeof(buf));
1843         start = OPENSSL_rdtsc();
1844         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1845         gcm_t = OPENSSL_rdtsc() - start;
1846         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1847         }
1848 #endif
1849         }
1850 #endif
1851
1852         return ret;
1853 }
1854 #endif