Add GHASH for ARMv8 Crypto Extension.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/* PACK(s): place a 16-bit constant in the top 16 bits of a size_t. */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): divide the 128-bit value V by x in GF(2^128), i.e.
 * shift V right one bit and, if the dropped bit was set, xor the GCM
 * reduction constant (0xE1 followed by 120 zero bits) into the high
 * half.  The (0-(V.lo&1)) expression builds an all-ones/all-zeroes
 * mask without branching.  The 32-bit arm of the sizeof() test avoids
 * a 64-bit constant on platforms where size_t is 32 bits.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows freeing a large
 *   enough block triggers VM working-set trimming, meaning that a
 *   subsequent malloc would immediately incur working-set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * Multiply Xi by H in GF(2^128) using the 8-bit table method:
 * Htable[256] holds b*H for every byte value b (see gcm_init_8bit),
 * and rem_8bit folds the byte shifted out of the low end of the
 * accumulator back in modulo the GCM polynomial.  Xi is consumed one
 * byte at a time starting from its least significant byte, and the
 * product is written back to Xi in big-endian byte order.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from last byte down to first */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        __fips_constseg
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                /* accumulate this byte's contribution: Z ^= n*H */
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* all 16 bytes processed */

                n = *(--xi);

                /* shift Z right 8 bits and reduce the dropped byte */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        /* PACK() put the constant in the top of a 32-bit
                         * size_t; shift it into the top of the 64-bit word */
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* store the result back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * Populate the 16-entry "4-bit Shoup" table: Htable[i] = i*H in
 * GF(2^128), where each bit of the nibble index selects one of
 * H, H/x, H/x^2, H/x^3 to xor in.  Htable[8] is seeded with H and the
 * remaining power-of-two entries are derived with REDUCE1BIT; the
 * non-power-of-two entries are xor combinations of those.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* loop-driven variant: smaller code, identical table contents */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* fully unrolled variant */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable, so the
         * table is post-processed here: on little-endian the two 64-bit
         * halves are swapped; on big-endian each half additionally has
         * its 32-bit words exchanged.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
328
329 #ifndef GHASH_ASM
/*
 * Reduction table for the 4-bit algorithm: rem_4bit[n] is the
 * correction term for the nibble n shifted out of the low end of the
 * accumulator, PACK()-ed into the top 16 bits of a size_t so it can be
 * xor-ed directly into the high word.
 */
__fips_constseg
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
/*
 * Multiply Xi by H using the 4-bit table.  Xi is processed one byte at
 * a time from the least significant end; each byte contributes its low
 * and high nibble as separate Htable lookups, with a 4-bit
 * shift-and-reduce (via rem_4bit) between lookups.  The result is
 * written back to Xi in big-endian byte order.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        /* split the last byte of Xi into its two nibbles */
        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits, folding the dropped nibble back in */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        /* PACK() targets a 32-bit size_t here */
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* all 16 bytes consumed */

                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* store the result back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
/*
 * Hash len bytes of input into Xi: for each 16-byte block compute
 * Xi = (Xi ^ inp)*H using the same 4-bit table algorithm as
 * gcm_gmult_4bit, xor-ing the input bytes in on the fly.  len is
 * assumed by the do/while structure to be a non-zero multiple of 16 —
 * see GHASH() callers.  The #else branch is an alternative
 * implementation using extra per-key tables; it is compiled out.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];                 /* fold input byte into Xi byte */
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits and reduce the dropped nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* block done */

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    __fips_constseg
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* write the updated hash back to Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * trashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
/*
 * Bit-serial ("1-bit") GF(2^128) multiplication: scan Xi one bit at a
 * time, most significant bit of each host long first, conditionally
 * accumulating the running V (initially H, then H/x, H/x^2, ...) into
 * Z.  Table-free and slow; used only when TABLE_BITS is neither 8 nor
 * 4.  The result is stored back into Xi in big-endian byte order.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
        u128 V,Z = { 0,0 };
        long X;
        int  i,j;
        const long *xi = (const long *)Xi;
        const union { long one; char little; } is_endian = {1};

        V.hi = H[0];    /* H is in host byte order, no byte swapping */
        V.lo = H[1];

        for (j=0; j<16/sizeof(long); ++j) {
                /* load the next long of Xi, converted to host order */
                if (is_endian.little) {
                        if (sizeof(long)==8) {
#ifdef BSWAP8
                                X = (long)(BSWAP8(xi[j]));
#else
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
                        }
                        else {
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)GETU32(p);
                        }
                }
                else
                        X = xi[j];

                for (i=0; i<8*sizeof(long); ++i, X<<=1) {
                        /* arithmetic right shift replicates the top bit:
                         * M is all-ones when the current bit of X is set,
                         * all-zeroes otherwise — a branch-free select */
                        u64 M = (u64)(X>>(8*sizeof(long)-1));
                        Z.hi ^= V.hi&M;
                        Z.lo ^= V.lo&M;

                        REDUCE1BIT(V);  /* advance V to the next power: V /= x */
                }
        }

        /* store the result back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && defined(GHASH_ASM)
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 # define gcm_init_avx   gcm_init_clmul
663 # define gcm_gmult_avx  gcm_gmult_clmul
664 # define gcm_ghash_avx  gcm_ghash_clmul
665 #else
666 void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
667 void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 #endif
670
671 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
672 #   define GHASH_ASM_X86
673 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675
676 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
677 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
678 #  endif
679 # elif defined(__arm__) || defined(__arm)
680 #  include "arm_arch.h"
681 #  if __ARM_ARCH__>=7
682 #   define GHASH_ASM_ARM
683 #   define GCM_FUNCREF_4BIT
684 void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
685 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
686 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
687 #  endif
688 # elif defined(__sparc__) || defined(__sparc)
689 #  include "sparc_arch.h"
690 #  define GHASH_ASM_SPARC
691 #  define GCM_FUNCREF_4BIT
692 extern unsigned int OPENSSL_sparcv9cap_P[];
693 void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
694 void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
695 void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
696 # endif
697 #endif
698
699 #ifdef GCM_FUNCREF_4BIT
700 # undef  GCM_MUL
701 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
702 # ifdef GHASH
703 #  undef  GHASH
704 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
705 # endif
706 #endif
707
/*
 * Initialise ctx for a fresh key: derive the hash subkey H by
 * encrypting the all-zero block with the supplied cipher, convert it
 * to host byte order, then precompute Htable and — where function
 * pointers are compiled in — select the fastest gmult/ghash
 * implementation supported by the current CPU.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E(K, 0^128); ctx->H.c was zeroed by the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        /* prefer carry-less multiply (PCLMULQDQ), with an AVX+MOVBE
         * variant when both of those features are present */
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
                        gcm_init_avx(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_avx;
                        ctx->ghash = gcm_ghash_avx;
                } else {
                        gcm_init_clmul(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_clmul;
                        ctx->ghash = gcm_ghash_clmul;
                }
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        /* NEON assembly when available, portable 4-bit C otherwise */
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                gcm_init_neon(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_SPARC)
        /* VIS3 assembly when available, portable 4-bit C otherwise */
        if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
                gcm_init_vis3(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_vis3;
                ctx->ghash = gcm_ghash_vis3;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
794
/*
 * Set the IV and reset all per-message state.  Per the GCM spec: a
 * 96-bit IV is used verbatim as Y0 = IV || 0^31 || 1; any other length
 * is hashed, Y0 = GHASH(IV || zero padding || [bitlen(IV)]_64).  The
 * encryption of Y0 is saved in EK0 for the final tag, and the counter
 * is advanced so the first data block uses Y1.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* fast path: 96-bit IV needs no GHASH */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* GHASH the IV, 16 bytes at a time (tail zero-padded) */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		/* fold in the IV length in bits as a big-endian 64-bit value */
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* extract the 32-bit big-endian counter from Y0 */
		if (is_endian.little)
#ifdef BSWAP4
			ctr = BSWAP4(ctx->Yi.d[3]);
#else
			ctr = GETU32(ctx->Yi.c+12);
#endif
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, Y0), needed for the authentication tag */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* store the incremented counter back so data starts at Y1 */
	if (is_endian.little)
#ifdef BSWAP4
		ctx->Yi.d[3] = BSWAP4(ctr);
#else
		PUTU32(ctx->Yi.c+12,ctr);
#endif
	else
		ctx->Yi.d[3] = ctr;
}
872
/*
 * Feed additional authenticated data into the GHASH computation.  All
 * AAD must precede encrypt/decrypt calls: returns -2 if message data
 * has already been processed, -1 if the accumulated AAD would exceed
 * 2^61 bytes (the spec's 2^64-bit limit), 0 on success.  A trailing
 * sub-block is buffered in Xi and its byte count kept in ctx->ares so
 * it can be completed by a later call (or finalized by the first
 * encrypt/decrypt call).
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		/* top up the partial block left by a previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* hash all complete 16-byte blocks in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		/* stash the remaining <16 bytes in Xi until more data arrives */
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
929
/*
 * CTR-mode encrypt |len| bytes from |in| to |out|, folding the produced
 * ciphertext into the GHASH state Xi.  May be called repeatedly; a
 * partial trailing block is carried across calls in ctx->mres with its
 * keystream cached in ctx->EKi.  Returns 0 on success, -1 once the
 * total message length would exceed the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* load the 32-bit big-endian block counter from Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* finish the partial block from a previous call */
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-at-a-time code below needs size_t-aligned pointers */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* bulk path: encrypt a whole chunk, then hash it in one call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining complete blocks, hashed together afterwards */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				out_t[i] = in_t[i] ^ ctx->EKi.t[i];
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no one-shot GHASH available: hash block by block */
		while (len>=16) {
			size_t *out_t=(size_t *)out;
			const size_t *in_t=(const size_t *)in;

			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16/sizeof(size_t); ++i)
				ctx->Xi.t[i] ^=
				out_t[i] = in_t[i]^ctx->EKi.t[i];
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			/* trailing partial block: keystream cached in EKi for next call */
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* byte-at-a-time fallback (small footprint or unaligned buffers) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
#ifdef BSWAP4
				ctx->Yi.d[3] = BSWAP4(ctr);
#else
				PUTU32(ctx->Yi.c+12,ctr);
#endif
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1109
1110 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1111                 const unsigned char *in, unsigned char *out,
1112                 size_t len)
1113 {
1114         const union { long one; char little; } is_endian = {1};
1115         unsigned int n, ctr;
1116         size_t i;
1117         u64        mlen  = ctx->len.u[1];
1118         block128_f block = ctx->block;
1119         void      *key   = ctx->key;
1120 #ifdef GCM_FUNCREF_4BIT
1121         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1122 # ifdef GHASH
1123         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1124                                 const u8 *inp,size_t len)       = ctx->ghash;
1125 # endif
1126 #endif
1127
1128         mlen += len;
1129         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1130                 return -1;
1131         ctx->len.u[1] = mlen;
1132
1133         if (ctx->ares) {
1134                 /* First call to decrypt finalizes GHASH(AAD) */
1135                 GCM_MUL(ctx,Xi);
1136                 ctx->ares = 0;
1137         }
1138
1139         if (is_endian.little)
1140 #ifdef BSWAP4
1141                 ctr = BSWAP4(ctx->Yi.d[3]);
1142 #else
1143                 ctr = GETU32(ctx->Yi.c+12);
1144 #endif
1145         else
1146                 ctr = ctx->Yi.d[3];
1147
1148         n = ctx->mres;
1149 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1150         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1151                 if (n) {
1152                         while (n && len) {
1153                                 u8 c = *(in++);
1154                                 *(out++) = c^ctx->EKi.c[n];
1155                                 ctx->Xi.c[n] ^= c;
1156                                 --len;
1157                                 n = (n+1)%16;
1158                         }
1159                         if (n==0) GCM_MUL (ctx,Xi);
1160                         else {
1161                                 ctx->mres = n;
1162                                 return 0;
1163                         }
1164                 }
1165 #if defined(STRICT_ALIGNMENT)
1166                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1167                         break;
1168 #endif
1169 #if defined(GHASH) && defined(GHASH_CHUNK)
1170                 while (len>=GHASH_CHUNK) {
1171                     size_t j=GHASH_CHUNK;
1172
1173                     GHASH(ctx,in,GHASH_CHUNK);
1174                     while (j) {
1175                         size_t *out_t=(size_t *)out;
1176                         const size_t *in_t=(const size_t *)in;
1177
1178                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1179                         ++ctr;
1180                         if (is_endian.little)
1181 #ifdef BSWAP4
1182                                 ctx->Yi.d[3] = BSWAP4(ctr);
1183 #else
1184                                 PUTU32(ctx->Yi.c+12,ctr);
1185 #endif
1186                         else
1187                                 ctx->Yi.d[3] = ctr;
1188                         for (i=0; i<16/sizeof(size_t); ++i)
1189                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1190                         out += 16;
1191                         in  += 16;
1192                         j   -= 16;
1193                     }
1194                     len -= GHASH_CHUNK;
1195                 }
1196                 if ((i = (len&(size_t)-16))) {
1197                     GHASH(ctx,in,i);
1198                     while (len>=16) {
1199                         size_t *out_t=(size_t *)out;
1200                         const size_t *in_t=(const size_t *)in;
1201
1202                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1203                         ++ctr;
1204                         if (is_endian.little)
1205 #ifdef BSWAP4
1206                                 ctx->Yi.d[3] = BSWAP4(ctr);
1207 #else
1208                                 PUTU32(ctx->Yi.c+12,ctr);
1209 #endif
1210                         else
1211                                 ctx->Yi.d[3] = ctr;
1212                         for (i=0; i<16/sizeof(size_t); ++i)
1213                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1214                         out += 16;
1215                         in  += 16;
1216                         len -= 16;
1217                     }
1218                 }
1219 #else
1220                 while (len>=16) {
1221                         size_t *out_t=(size_t *)out;
1222                         const size_t *in_t=(const size_t *)in;
1223
1224                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1225                         ++ctr;
1226                         if (is_endian.little)
1227 #ifdef BSWAP4
1228                                 ctx->Yi.d[3] = BSWAP4(ctr);
1229 #else
1230                                 PUTU32(ctx->Yi.c+12,ctr);
1231 #endif
1232                         else
1233                                 ctx->Yi.d[3] = ctr;
1234                         for (i=0; i<16/sizeof(size_t); ++i) {
1235                                 size_t c = in[i];
1236                                 out[i] = c^ctx->EKi.t[i];
1237                                 ctx->Xi.t[i] ^= c;
1238                         }
1239                         GCM_MUL(ctx,Xi);
1240                         out += 16;
1241                         in  += 16;
1242                         len -= 16;
1243                 }
1244 #endif
1245                 if (len) {
1246                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1247                         ++ctr;
1248                         if (is_endian.little)
1249 #ifdef BSWAP4
1250                                 ctx->Yi.d[3] = BSWAP4(ctr);
1251 #else
1252                                 PUTU32(ctx->Yi.c+12,ctr);
1253 #endif
1254                         else
1255                                 ctx->Yi.d[3] = ctr;
1256                         while (len--) {
1257                                 u8 c = in[n];
1258                                 ctx->Xi.c[n] ^= c;
1259                                 out[n] = c^ctx->EKi.c[n];
1260                                 ++n;
1261                         }
1262                 }
1263
1264                 ctx->mres = n;
1265                 return 0;
1266         } while(0);
1267 #endif
1268         for (i=0;i<len;++i) {
1269                 u8 c;
1270                 if (n==0) {
1271                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1272                         ++ctr;
1273                         if (is_endian.little)
1274 #ifdef BSWAP4
1275                                 ctx->Yi.d[3] = BSWAP4(ctr);
1276 #else
1277                                 PUTU32(ctx->Yi.c+12,ctr);
1278 #endif
1279                         else
1280                                 ctx->Yi.d[3] = ctr;
1281                 }
1282                 c = in[i];
1283                 out[i] = c^ctx->EKi.c[n];
1284                 ctx->Xi.c[n] ^= c;
1285                 n = (n+1)%16;
1286                 if (n==0)
1287                         GCM_MUL(ctx,Xi);
1288         }
1289
1290         ctx->mres = n;
1291         return 0;
1292 }
1293
/*
 * Variant of CRYPTO_gcm128_encrypt whose bulk keystream comes from a
 * caller-supplied 32-bit counter-mode routine |stream| (typically an
 * assembly AES-CTR that encrypts many blocks per call).  |stream|
 * handles whole 16-byte blocks only; the trailing partial block still
 * goes through ctx->block.  Return values as for CRYPTO_gcm128_encrypt.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* load the 32-bit big-endian block counter from Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* finish the partial block from a previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: stream-encrypt a chunk, then hash the ciphertext */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining complete blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		/* trailing partial block via the plain block cipher */
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1408
/*
 * Variant of CRYPTO_gcm128_decrypt whose bulk keystream comes from a
 * caller-supplied 32-bit counter-mode routine |stream|.  The incoming
 * ciphertext is hashed into Xi before decryption; |stream| handles
 * whole 16-byte blocks, the trailing partial block uses ctx->block.
 * Return values as for CRYPTO_gcm128_decrypt.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* load the 32-bit big-endian block counter from Yi */
	if (is_endian.little)
#ifdef BSWAP4
		ctr = BSWAP4(ctx->Yi.d[3]);
#else
		ctr = GETU32(ctx->Yi.c+12);
#endif
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* finish the partial block from a previous call */
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: hash the ciphertext chunk first, then stream-decrypt */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining complete blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* rewind: the stream call below consumes the same input */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		/* trailing partial block via the plain block cipher */
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
#ifdef BSWAP4
			ctx->Yi.d[3] = BSWAP4(ctr);
#else
			PUTU32(ctx->Yi.c+12,ctr);
#endif
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1530
1531 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1532                         size_t len)
1533 {
1534         const union { long one; char little; } is_endian = {1};
1535         u64 alen = ctx->len.u[0]<<3;
1536         u64 clen = ctx->len.u[1]<<3;
1537 #ifdef GCM_FUNCREF_4BIT
1538         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1539 #endif
1540
1541         if (ctx->mres || ctx->ares)
1542                 GCM_MUL(ctx,Xi);
1543
1544         if (is_endian.little) {
1545 #ifdef BSWAP8
1546                 alen = BSWAP8(alen);
1547                 clen = BSWAP8(clen);
1548 #else
1549                 u8 *p = ctx->len.c;
1550
1551                 ctx->len.u[0] = alen;
1552                 ctx->len.u[1] = clen;
1553
1554                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1555                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1556 #endif
1557         }
1558
1559         ctx->Xi.u[0] ^= alen;
1560         ctx->Xi.u[1] ^= clen;
1561         GCM_MUL(ctx,Xi);
1562
1563         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1564         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1565
1566         if (tag && len<=sizeof(ctx->Xi))
1567                 return memcmp(ctx->Xi.c,tag,len);
1568         else
1569                 return -1;
1570 }
1571
1572 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1573 {
1574         CRYPTO_gcm128_finish(ctx, NULL, 0);
1575         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1576 }
1577
1578 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1579 {
1580         GCM128_CONTEXT *ret;
1581
1582         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1583                 CRYPTO_gcm128_init(ret,key,block);
1584
1585         return ret;
1586 }
1587
1588 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1589 {
1590         if (ctx) {
1591                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1592                 OPENSSL_free(ctx);
1593         }
1594 }
1595
1596 #if defined(SELFTEST)
1597 #include <stdio.h>
1598 #include <openssl/aes.h>
1599
/*
 * Known-answer test vectors for the SELFTEST driver below.  For test
 * case N the naming convention is:
 *
 *   KN  - AES key              IVN - initialization vector
 *   AN  - additional authenticated data (AAD)
 *   PN  - plaintext            CN  - expected ciphertext
 *   TN  - expected authentication tag
 *
 * A NULL pointer denotes an absent input, and the #define aliases
 * reuse data shared between cases.  These appear to be the vectors
 * published with the GCM specification (McGrew & Viega) -- TODO
 * confirm against the spec.  Cases 1-6 use a 128-bit key, 7-12 a
 * 192-bit key, 13-18 a 256-bit key; 19 and 20 exercise corner cases.
 */

/* Test Case 1: empty plaintext, empty AAD, all-zero 128-bit key */
static const u8 K1[16],
                *P1=NULL,
                *A1=NULL,
                IV1[12],
                *C1=NULL,
                T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8 P2[16],
                C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
                T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                        0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                        0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                        0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
                T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                        0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                        0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                        0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
                T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5: 8-byte (non-default-length) IV */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
                        0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
                        0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
                        0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
                T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6: 60-byte IV (exercises the GHASH-based IV path) */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
                        0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
                        0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
                        0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
                T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7: 192-bit keys from here through case 12 */
static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
                *C7=NULL,
                T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
                C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
                T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13: 256-bit keys from here through case 18 */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};

/* Test Case 19: AAD-only (no plaintext/ciphertext), 128-byte AAD */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
                        0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20: 64-byte IV and 288-byte all-zero plaintext */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
                P20[288],
                C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
                        0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
                        0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
                        0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
                        0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
                        0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
                        0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
                        0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
                        0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
                        0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
                        0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
                        0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
                        0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
                        0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
                        0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
                        0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
                        0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
                        0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
                T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1851
/*
 * Run one encrypt+decrypt known-answer test.  `n' is pasted onto the
 * vector names (K##n, IV##n, A##n, P##n, C##n, T##n) declared above.
 * First an encryption pass is run and the output is checked against
 * C##n and the tag against T##n via CRYPTO_gcm128_finish() (non-zero
 * return means tag mismatch); then the same vectors are pushed
 * through the decryption path and checked against P##n.  The macro
 * uses `ctx', `key' and `ret' from the enclosing function scope; each
 * failing pass increments `ret' and prints a diagnostic.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1871
1872 int main()
1873 {
1874         GCM128_CONTEXT ctx;
1875         AES_KEY key;
1876         int ret=0;
1877
1878         TEST_CASE(1);
1879         TEST_CASE(2);
1880         TEST_CASE(3);
1881         TEST_CASE(4);
1882         TEST_CASE(5);
1883         TEST_CASE(6);
1884         TEST_CASE(7);
1885         TEST_CASE(8);
1886         TEST_CASE(9);
1887         TEST_CASE(10);
1888         TEST_CASE(11);
1889         TEST_CASE(12);
1890         TEST_CASE(13);
1891         TEST_CASE(14);
1892         TEST_CASE(15);
1893         TEST_CASE(16);
1894         TEST_CASE(17);
1895         TEST_CASE(18);
1896         TEST_CASE(19);
1897         TEST_CASE(20);
1898
1899 #ifdef OPENSSL_CPUID_OBJ
1900         {
1901         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1902         union { u64 u; u8 c[1024]; } buf;
1903         int i;
1904
1905         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1906         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1907         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1908
1909         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1910         start = OPENSSL_rdtsc();
1911         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1912         gcm_t = OPENSSL_rdtsc() - start;
1913
1914         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1915                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1916                         (block128_f)AES_encrypt);
1917         start = OPENSSL_rdtsc();
1918         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1919                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1920                         (block128_f)AES_encrypt);
1921         ctr_t = OPENSSL_rdtsc() - start;
1922
1923         printf("%.2f-%.2f=%.2f\n",
1924                         gcm_t/(double)sizeof(buf),
1925                         ctr_t/(double)sizeof(buf),
1926                         (gcm_t-ctr_t)/(double)sizeof(buf));
1927 #ifdef GHASH
1928         {
1929         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1930                                 const u8 *inp,size_t len)       = ctx.ghash;
1931
1932         GHASH((&ctx),buf.c,sizeof(buf));
1933         start = OPENSSL_rdtsc();
1934         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1935         gcm_t = OPENSSL_rdtsc() - start;
1936         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1937         }
1938 #endif
1939         }
1940 #endif
1941
1942         return ret;
1943 }
1944 #endif