c8c906c8211632ea52729e18129df08e429fe0bc
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK(s): place a 16-bit reduction constant in the top 16 bits of a
 * size_t, so the same table initializers serve 32- and 64-bit builds.
 */
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): multiply the 128-bit value V by x in GF(2^128) using
 * GCM's bit-reflected representation: shift V right one bit and, if a
 * bit fell off the low end, fold in the reduction polynomial constant
 * 0xE1 << 120.  The sizeof(size_t) test picks a 64- or 32-bit flavour
 * at compile time (dead branch is removed by the compiler); both
 * flavours compute the same result.
 */
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
/*
 * gcm_init_8bit: populate the 256-entry table for the "8-bit Shoup"
 * GHASH variant.  On return Htable[i] = i*H in GF(2^128) (bit-reflected
 * representation), so one table lookup multiplies a whole byte of the
 * hash state by H.  H is the hash subkey in host byte order.
 */
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
        /* Power-of-two entries: Htable[128] = H, then each halving of
         * the index multiplies V by x via REDUCE1BIT. */
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
        /* Remaining entries by linearity: (i|j)*H = i*H ^ j*H for j<i. */
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * gcm_gmult_8bit: Xi = Xi * H in GF(2^128) using the 256-entry table
 * built by gcm_init_8bit.  Xi is stored big-endian and is consumed one
 * byte at a time starting from its last (least significant) byte; the
 * rem_8bit table folds the 8 bits shifted out of the accumulator Z
 * back in modulo the GCM polynomial.  The result is written back to
 * Xi in big-endian byte order.
 */
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
        /* Runtime endianness probe: .little is non-zero on little-endian. */
150         const union { long one; char little; } is_endian = {1};
151         __fips_constseg
        /* rem_8bit[r]: reduction of the byte r shifted out of Z, packed
         * into the top 16 bits of a size_t by PACK(). */
152         static const size_t rem_8bit[256] = {
153                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
154                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
155                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
156                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
157                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
158                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
159                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
160                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
161                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
162                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
163                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
164                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
165                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
166                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
167                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
168                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
169                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
170                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
171                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
172                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
173                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
174                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
175                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
176                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
177                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
178                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
179                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
180                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
181                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
182                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
183                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
184                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
185                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
186                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
187                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
188                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
189                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
190                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
191                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
192                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
193                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
194                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
195                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
196                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
197                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
198                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
199                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
200                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
201                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
202                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
203                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
204                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
205                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
206                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
207                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
208                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
209                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
210                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
211                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
212                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
213                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
214                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
215                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
216                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
217
        /* Walk Xi from byte 15 down to byte 0, accumulating into Z. */
218         while (1) {
219                 Z.hi ^= Htable[n].hi;
220                 Z.lo ^= Htable[n].lo;
221
222                 if ((u8 *)Xi==xi)       break;
223
224                 n = *(--xi);
225
                /* Shift Z right 8 bits and reduce the spilled byte. */
226                 rem  = (size_t)Z.lo&0xff;
227                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
228                 Z.hi = (Z.hi>>8);
229                 if (sizeof(size_t)==8)
230                         Z.hi ^= rem_8bit[rem];
231                 else
232                         Z.hi ^= (u64)rem_8bit[rem]<<32;
233         }
234
        /* Store Z (host order) back into Xi as big-endian bytes. */
235         if (is_endian.little) {
236 #ifdef BSWAP8
237                 Xi[0] = BSWAP8(Z.hi);
238                 Xi[1] = BSWAP8(Z.lo);
239 #else
240                 u8 *p = (u8 *)Xi;
241                 u32 v;
242                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
243                 v = (u32)(Z.hi);        PUTU32(p+4,v);
244                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
245                 v = (u32)(Z.lo);        PUTU32(p+12,v);
246 #endif
247         }
248         else {
249                 Xi[0] = Z.hi;
250                 Xi[1] = Z.lo;
251         }
252 }
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * gcm_init_4bit: populate the 16-entry table Htable, where Htable[i]
 * = i*H in GF(2^128) (bit-reflected GCM representation).  The small-
 * footprint build computes it with loops; the default build unrolls
 * the identical computation.  With GHASH_ASM on ARM the two dwords of
 * each entry are swapped afterwards into the order the assembler
 * routines expect.
 */
257 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
258 {
259         u128 V;
260 #if defined(OPENSSL_SMALL_FOOTPRINT)
261         int  i;
262 #endif
263
264         Htable[0].hi = 0;
265         Htable[0].lo = 0;
266         V.hi = H[0];
267         V.lo = H[1];
268
269 #if defined(OPENSSL_SMALL_FOOTPRINT)
        /* Htable[8]=H; halving the index multiplies by x (REDUCE1BIT). */
270         for (Htable[8]=V, i=4; i>0; i>>=1) {
271                 REDUCE1BIT(V);
272                 Htable[i] = V;
273         }
274
        /* Remaining entries by linearity: (i|j)*H = i*H ^ j*H. */
275         for (i=2; i<16; i<<=1) {
276                 u128 *Hi = Htable+i;
277                 int   j;
278                 for (V=*Hi, j=1; j<i; ++j) {
279                         Hi[j].hi = V.hi^Htable[j].hi;
280                         Hi[j].lo = V.lo^Htable[j].lo;
281                 }
282         }
283 #else
        /* Unrolled version of the loops above. */
284         Htable[8] = V;
285         REDUCE1BIT(V);
286         Htable[4] = V;
287         REDUCE1BIT(V);
288         Htable[2] = V;
289         REDUCE1BIT(V);
290         Htable[1] = V;
291         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
292         V=Htable[4];
293         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
294         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
295         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
296         V=Htable[8];
297         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
298         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
299         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
300         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
301         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
302         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
303         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
304 #endif
305 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
306         /*
307          * ARM assembler expects specific dword order in Htable.
308          */
309         {
310         int j;
311         const union { long one; char little; } is_endian = {1};
312
313         if (is_endian.little)
314                 for (j=0;j<16;++j) {
315                         V = Htable[j];
316                         Htable[j].hi = V.lo;
317                         Htable[j].lo = V.hi;
318                 }
319         else
320                 for (j=0;j<16;++j) {
321                         V = Htable[j];
322                         Htable[j].hi = V.lo<<32|V.lo>>32;
323                         Htable[j].lo = V.hi<<32|V.hi>>32;
324                 }
325         }
326 #endif
327 }
328
329 #ifndef GHASH_ASM
/*
 * rem_4bit[r]: precomputed reduction of the nibble r shifted out of
 * the GHASH accumulator, packed into the top 16 bits of a size_t by
 * PACK().  Shared by gcm_gmult_4bit and gcm_ghash_4bit below.
 */
330 __fips_constseg
331 static const size_t rem_4bit[16] = {
332         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
333         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
334         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
335         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
/*
 * gcm_gmult_4bit: Xi = Xi * H in GF(2^128) using the 16-entry table
 * built by gcm_init_4bit.  Xi is stored big-endian and is consumed a
 * byte at a time from the last (least significant) byte, each byte
 * split into its low and high nibbles; rem_4bit reduces the 4 bits
 * shifted out of the accumulator Z modulo the GCM polynomial.  The
 * result is written back to Xi in big-endian order.
 */
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
338 {
339         u128 Z;
340         int cnt = 15;
341         size_t rem, nlo, nhi;
        /* Runtime endianness probe: .little is non-zero on little-endian. */
342         const union { long one; char little; } is_endian = {1};
343
344         nlo  = ((const u8 *)Xi)[15];
345         nhi  = nlo>>4;
346         nlo &= 0xf;
347
348         Z.hi = Htable[nlo].hi;
349         Z.lo = Htable[nlo].lo;
350
351         while (1) {
                /* Shift Z right 4 bits and reduce the spilled nibble,
                 * then fold in the high nibble of the current byte. */
352                 rem  = (size_t)Z.lo&0xf;
353                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
354                 Z.hi = (Z.hi>>4);
355                 if (sizeof(size_t)==8)
356                         Z.hi ^= rem_4bit[rem];
357                 else
358                         Z.hi ^= (u64)rem_4bit[rem]<<32;
359
360                 Z.hi ^= Htable[nhi].hi;
361                 Z.lo ^= Htable[nhi].lo;
362
363                 if (--cnt<0)            break;
364
365                 nlo  = ((const u8 *)Xi)[cnt];
366                 nhi  = nlo>>4;
367                 nlo &= 0xf;
368
                /* Same shift-and-reduce for the low nibble. */
369                 rem  = (size_t)Z.lo&0xf;
370                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
371                 Z.hi = (Z.hi>>4);
372                 if (sizeof(size_t)==8)
373                         Z.hi ^= rem_4bit[rem];
374                 else
375                         Z.hi ^= (u64)rem_4bit[rem]<<32;
376
377                 Z.hi ^= Htable[nlo].hi;
378                 Z.lo ^= Htable[nlo].lo;
379         }
380
        /* Store Z (host order) back into Xi as big-endian bytes. */
381         if (is_endian.little) {
382 #ifdef BSWAP8
383                 Xi[0] = BSWAP8(Z.hi);
384                 Xi[1] = BSWAP8(Z.lo);
385 #else
386                 u8 *p = (u8 *)Xi;
387                 u32 v;
388                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
389                 v = (u32)(Z.hi);        PUTU32(p+4,v);
390                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
391                 v = (u32)(Z.lo);        PUTU32(p+12,v);
392 #endif
393         }
394         else {
395                 Xi[0] = Z.hi;
396                 Xi[1] = Z.lo;
397         }
398 }
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
/*
 * gcm_ghash_4bit: fold `len` bytes from `inp` into the hash state Xi:
 * for each 16-byte block, Xi = (Xi ^ block) * H, using the 16-entry
 * 4-bit table.  NOTE(review): the do/while consumes 16 bytes per
 * iteration, so len is assumed to be a non-zero multiple of 16 —
 * confirm against callers.  The `#if 1` branch is the straightforward
 * per-block 4-bit loop; the disabled `#else` branch is an alternative
 * that trades 256+16 extra bytes per key plus a shared 512-byte table
 * for fewer reduction steps.
 */
408 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
409                                 const u8 *inp,size_t len)
410 {
411     u128 Z;
412     int cnt;
413     size_t rem, nlo, nhi;
    /* Runtime endianness probe: .little is non-zero on little-endian. */
414     const union { long one; char little; } is_endian = {1};
415
416 #if 1
417     do {
        /* XOR the block into Xi implicitly, nibble by nibble, while
         * multiplying by H — same inner loop as gcm_gmult_4bit. */
418         cnt  = 15;
419         nlo  = ((const u8 *)Xi)[15];
420         nlo ^= inp[15];
421         nhi  = nlo>>4;
422         nlo &= 0xf;
423
424         Z.hi = Htable[nlo].hi;
425         Z.lo = Htable[nlo].lo;
426
427         while (1) {
428                 rem  = (size_t)Z.lo&0xf;
429                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
430                 Z.hi = (Z.hi>>4);
431                 if (sizeof(size_t)==8)
432                         Z.hi ^= rem_4bit[rem];
433                 else
434                         Z.hi ^= (u64)rem_4bit[rem]<<32;
435
436                 Z.hi ^= Htable[nhi].hi;
437                 Z.lo ^= Htable[nhi].lo;
438
439                 if (--cnt<0)            break;
440
441                 nlo  = ((const u8 *)Xi)[cnt];
442                 nlo ^= inp[cnt];
443                 nhi  = nlo>>4;
444                 nlo &= 0xf;
445
446                 rem  = (size_t)Z.lo&0xf;
447                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
448                 Z.hi = (Z.hi>>4);
449                 if (sizeof(size_t)==8)
450                         Z.hi ^= rem_4bit[rem];
451                 else
452                         Z.hi ^= (u64)rem_4bit[rem]<<32;
453
454                 Z.hi ^= Htable[nlo].hi;
455                 Z.lo ^= Htable[nlo].lo;
456         }
457 #else
458     /*
459      * Extra 256+16 bytes per-key plus 512 bytes shared tables
460      * [should] give ~50% improvement... One could have PACK()-ed
461      * the rem_8bit even here, but the priority is to minimize
462      * cache footprint...
463      */ 
464     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
465     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
466     __fips_constseg
467     static const unsigned short rem_8bit[256] = {
468         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
500     /*
501      * This pre-processing phase slows down procedure by approximately
502      * same time as it makes each loop spin faster. In other words
503      * single block performance is approximately same as straightforward
504      * "4-bit" implementation, and then it goes only faster...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
515         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516                 nlo  = ((const u8 *)Xi)[cnt];
517                 nlo ^= inp[cnt];
518                 nhi  = nlo>>4;
519                 nlo &= 0xf;
520
521                 Z.hi ^= Htable[nlo].hi;
522                 Z.lo ^= Htable[nlo].lo;
523
524                 rem = (size_t)Z.lo&0xff;
525
526                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
527                 Z.hi = (Z.hi>>8);
528
529                 Z.hi ^= Hshr4[nhi].hi;
530                 Z.lo ^= Hshr4[nhi].lo;
531                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532         }
533
534         nlo  = ((const u8 *)Xi)[0];
535         nlo ^= inp[0];
536         nhi  = nlo>>4;
537         nlo &= 0xf;
538
539         Z.hi ^= Htable[nlo].hi;
540         Z.lo ^= Htable[nlo].lo;
541
542         rem = (size_t)Z.lo&0xf;
543
544         Z.lo = (Z.hi<<60)|(Z.lo>>4);
545         Z.hi = (Z.hi>>4);
546
547         Z.hi ^= Htable[nhi].hi;
548         Z.lo ^= Htable[nhi].lo;
549         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
550 #endif
551
        /* Store Z (host order) back into Xi as big-endian bytes. */
552         if (is_endian.little) {
553 #ifdef BSWAP8
554                 Xi[0] = BSWAP8(Z.hi);
555                 Xi[1] = BSWAP8(Z.lo);
556 #else
557                 u8 *p = (u8 *)Xi;
558                 u32 v;
559                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
560                 v = (u32)(Z.hi);        PUTU32(p+4,v);
561                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
562                 v = (u32)(Z.lo);        PUTU32(p+12,v);
563 #endif
564         }
565         else {
566                 Xi[0] = Z.hi;
567                 Xi[1] = Z.lo;
568         }
569     } while (inp+=16, len-=16);
570 }
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * trashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
/*
 * gcm_gmult_1bit: Xi = Xi * H in GF(2^128), one bit at a time — no
 * precomputed table, minimal memory footprint (TABLE_BITS==1 build).
 * H is in host byte order (no byte swapping); Xi is big-endian and is
 * loaded word-wise with an endianness conversion.  The per-bit step is
 * branch-free: mask M (all-ones or all-zeros from the top bit of X)
 * selects whether V is XORed into Z.
 */
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
        /* Runtime endianness probe: .little is non-zero on little-endian. */
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
                /* Load the next `long` of Xi as a big-endian value. */
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
                /* Classic shift-and-add: accumulate V when the current
                 * bit of X is set, then V *= x via REDUCE1BIT. */
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
        /* Store Z (host order) back into Xi as big-endian bytes. */
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #if defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 # define gcm_init_avx   gcm_init_clmul
663 # define gcm_gmult_avx  gcm_gmult_clmul
664 # define gcm_ghash_avx  gcm_ghash_clmul
665 #else
666 void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
667 void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
668 void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
669 #endif
670
671 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
672 #   define GHASH_ASM_X86
673 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675
676 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
677 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
678 #  endif
679 # elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
680 #  include "arm_arch.h"
681 #  if __ARM_ARCH__>=7
682 #   define GHASH_ASM_ARM
683 #   define GCM_FUNCREF_4BIT
684 #   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
685 #   if defined(__arm__) || defined(__arm)
686 #    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
687 #   endif
688 void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
689 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
690 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
691 void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
692 void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
693 void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
694 #  endif
695 # elif defined(__sparc__) || defined(__sparc)
696 #  include "sparc_arch.h"
697 #  define GHASH_ASM_SPARC
698 #  define GCM_FUNCREF_4BIT
699 extern unsigned int OPENSSL_sparcv9cap_P[];
700 void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
701 void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
702 void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
703 #elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
704 #  define GHASH_ASM_PPC
705 #  define GCM_FUNCREF_4BIT
706 extern unsigned int OPENSSL_ppccap_P[];
707 void gcm_init_p8(u128 Htable[16],const u64 Xi[2]);
708 void gcm_gmult_p8(u64 Xi[2],const u128 Htable[16]);
709 void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
710 # endif
711 #endif
712
713 #ifdef GCM_FUNCREF_4BIT
714 # undef  GCM_MUL
715 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
716 # ifdef GHASH
717 #  undef  GHASH
718 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
719 # endif
720 #endif
721
/*
 * CRYPTO_gcm128_init: initialize a GCM context for the given cipher key.
 * Computes the hash subkey H = E(K, 0^128), converts it to host byte
 * order, precomputes the GHASH lookup table and, where assembler support
 * is compiled in, selects gmult/ghash implementations by CPU capability.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E(K, 0^128); ctx->H.c is all-zero after the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
                        gcm_init_avx(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_avx;
                        ctx->ghash = gcm_ghash_avx;
                } else {
                        gcm_init_clmul(ctx->Htable,ctx->H.u);
                        ctx->gmult = gcm_gmult_clmul;
                        ctx->ghash = gcm_ghash_clmul;
                }
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
        if (PMULL_CAPABLE) {
                gcm_init_v8(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_v8;
                ctx->ghash = gcm_ghash_v8;
        } else
#  endif
#  ifdef NEON_CAPABLE
        if (NEON_CAPABLE) {
                gcm_init_neon(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else
#  endif
        {
                /* generic C fallback */
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_SPARC)
        if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
                gcm_init_vis3(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_vis3;
                ctx->ghash = gcm_ghash_vis3;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_PPC)
        if (OPENSSL_ppccap_P[0] & (1<<2)) {
                gcm_init_p8(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_p8;
                ctx->ghash = gcm_ghash_p8;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
828
/*
 * CRYPTO_gcm128_setiv: (re)initialize per-message state from the IV.
 * For the recommended 96-bit IV, Y0 = IV || 0^31 || 1; any other length
 * is GHASHed (IV blocks, zero-padded, followed by a 128-bit length
 * block).  EK0 = E(K, Y0) is precomputed for the final tag, and the
 * counter is advanced to Y1, ready for the first data block.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
#endif

        /* reset all per-message state */
        ctx->Yi.u[0]  = 0;
        ctx->Yi.u[1]  = 0;
        ctx->Xi.u[0]  = 0;
        ctx->Xi.u[1]  = 0;
        ctx->len.u[0] = 0;      /* AAD length */
        ctx->len.u[1] = 0;      /* message length */
        ctx->ares = 0;
        ctx->mres = 0;

        if (len==12) {
                /* recommended IV size: Y0 = IV || 0^31 || 1 */
                memcpy(ctx->Yi.c,iv,12);
                ctx->Yi.c[15]=1;
                ctr=1;
        }
        else {
                size_t i;
                u64 len0 = len;

                /* Y0 = GHASH(IV padded to full blocks) */
                while (len>=16) {
                        for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
                        GCM_MUL(ctx,Yi);
                        iv += 16;
                        len -= 16;
                }
                if (len) {
                        for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
                        GCM_MUL(ctx,Yi);
                }
                /* close with the length block: IV length in bits, big-endian,
                 * in the low 64 bits */
                len0 <<= 3;
                if (is_endian.little) {
#ifdef BSWAP8
                        ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
                        ctx->Yi.c[8]  ^= (u8)(len0>>56);
                        ctx->Yi.c[9]  ^= (u8)(len0>>48);
                        ctx->Yi.c[10] ^= (u8)(len0>>40);
                        ctx->Yi.c[11] ^= (u8)(len0>>32);
                        ctx->Yi.c[12] ^= (u8)(len0>>24);
                        ctx->Yi.c[13] ^= (u8)(len0>>16);
                        ctx->Yi.c[14] ^= (u8)(len0>>8);
                        ctx->Yi.c[15] ^= (u8)(len0);
#endif
                }
                else
                        ctx->Yi.u[1]  ^= len0;

                GCM_MUL(ctx,Yi);

                /* extract the 32-bit counter from Y0 in host byte order */
                if (is_endian.little)
#ifdef BSWAP4
                        ctr = BSWAP4(ctx->Yi.d[3]);
#else
                        ctr = GETU32(ctx->Yi.c+12);
#endif
                else
                        ctr = ctx->Yi.d[3];
        }

        /* EK0 = E(K, Y0), used to mask the tag in CRYPTO_gcm128_finish */
        (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
        ++ctr;
        /* store Y1 back into Yi (big-endian counter field) */
        if (is_endian.little)
#ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                PUTU32(ctx->Yi.c+12,ctr);
#endif
        else
                ctx->Yi.d[3] = ctr;
}
906
/*
 * CRYPTO_gcm128_aad: fold additional authenticated data into the running
 * GHASH (Xi).  May be called repeatedly, but only before any
 * encrypt/decrypt call for the message.  Returns 0 on success, -2 if
 * message data has already been processed, -1 if the accumulated AAD
 * length would exceed 2^61 bytes (or overflows).
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
        size_t i;
        unsigned int n;
        u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        /* AAD is only allowed before the message body */
        if (ctx->len.u[1]) return -2;

        alen += len;
        if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
                return -1;
        ctx->len.u[0] = alen;

        /* finish a partial block carried over from a previous call */
        n = ctx->ares;
        if (n) {
                while (n && len) {
                        ctx->Xi.c[n] ^= *(aad++);
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL(ctx,Xi);
                else {
                        ctx->ares = n;
                        return 0;
                }
        }

#ifdef GHASH
        /* bulk-hash all whole 16-byte blocks at once */
        if ((i = (len&(size_t)-16))) {
                GHASH(ctx,aad,i);
                aad += i;
                len -= i;
        }
#else
        while (len>=16) {
                for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
                GCM_MUL(ctx,Xi);
                aad += 16;
                len -= 16;
        }
#endif
        /* buffer the trailing partial block in Xi; hashed on a later call */
        if (len) {
                n = (unsigned int)len;
                for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
        }

        ctx->ares = n;
        return 0;
}
963
/*
 * CRYPTO_gcm128_encrypt: encrypt len bytes with CTR mode while folding
 * the resulting *ciphertext* into the running GHASH (Xi).  May be called
 * repeatedly on a growing message; partial-block keystream state is
 * carried in ctx->mres.  Returns 0 on success, -1 if the total message
 * length would exceed the NIST limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64        mlen  = ctx->len.u[1];
        block128_f block = ctx->block;
        void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

#if 0
        n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to encrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        /* load the 32-bit counter in host byte order */
        if (is_endian.little)
#ifdef BSWAP4
                ctr = BSWAP4(ctx->Yi.d[3]);
#else
                ctr = GETU32(ctx->Yi.c+12);
#endif
        else
                ctr = ctx->Yi.d[3];

        n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
        if (16%sizeof(size_t) == 0) do {        /* always true actually */
                /* consume the keystream block left over from a previous call */
                if (n) {
                        while (n && len) {
                                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                                --len;
                                n = (n+1)%16;
                        }
                        if (n==0) GCM_MUL(ctx,Xi);
                        else {
                                ctx->mres = n;
                                return 0;
                        }
                }
#if defined(STRICT_ALIGNMENT)
                /* word-sized accesses below require aligned pointers */
                if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
                        break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
                /* encrypt, then hash the output, in large chunks */
                while (len>=GHASH_CHUNK) {
                    size_t j=GHASH_CHUNK;

                    while (j) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
#ifdef BSWAP4
                                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                                PUTU32(ctx->Yi.c+12,ctr);
#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        j   -= 16;
                    }
                    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
                    len -= GHASH_CHUNK;
                }
                /* remaining whole blocks, hashed in one GHASH call */
                if ((i = (len&(size_t)-16))) {
                    size_t j=i;

                    while (len>=16) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
#ifdef BSWAP4
                                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                                PUTU32(ctx->Yi.c+12,ctr);
#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        len -= 16;
                    }
                    GHASH(ctx,out-j,j);
                }
#else
                /* no bulk GHASH available: multiply after every block */
                while (len>=16) {
                        size_t *out_t=(size_t *)out;
                        const size_t *in_t=(const size_t *)in;

                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
#ifdef BSWAP4
                                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                                PUTU32(ctx->Yi.c+12,ctr);
#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16/sizeof(size_t); ++i)
                                ctx->Xi.t[i] ^=
                                out_t[i] = in_t[i]^ctx->EKi.t[i];
                        GCM_MUL(ctx,Xi);
                        out += 16;
                        in  += 16;
                        len -= 16;
                }
#endif
                /* trailing partial block: byte-at-a-time, state in mres */
                if (len) {
                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
#ifdef BSWAP4
                                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                                PUTU32(ctx->Yi.c+12,ctr);
#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        while (len--) {
                                ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                                ++n;
                        }
                }

                ctx->mres = n;
                return 0;
        } while(0);
#endif
        /* small-footprint / unaligned fallback: strictly byte-at-a-time */
        for (i=0;i<len;++i) {
                if (n==0) {
                        (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
#ifdef BSWAP4
                                ctx->Yi.d[3] = BSWAP4(ctr);
#else
                                PUTU32(ctx->Yi.c+12,ctr);
#endif
                        else
                                ctx->Yi.d[3] = ctr;
                }
                ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
                n = (n+1)%16;
                if (n==0)
                        GCM_MUL(ctx,Xi);
        }

        ctx->mres = n;
        return 0;
}
1143
1144 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1145                 const unsigned char *in, unsigned char *out,
1146                 size_t len)
1147 {
1148         const union { long one; char little; } is_endian = {1};
1149         unsigned int n, ctr;
1150         size_t i;
1151         u64        mlen  = ctx->len.u[1];
1152         block128_f block = ctx->block;
1153         void      *key   = ctx->key;
1154 #ifdef GCM_FUNCREF_4BIT
1155         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1156 # ifdef GHASH
1157         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1158                                 const u8 *inp,size_t len)       = ctx->ghash;
1159 # endif
1160 #endif
1161
1162         mlen += len;
1163         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1164                 return -1;
1165         ctx->len.u[1] = mlen;
1166
1167         if (ctx->ares) {
1168                 /* First call to decrypt finalizes GHASH(AAD) */
1169                 GCM_MUL(ctx,Xi);
1170                 ctx->ares = 0;
1171         }
1172
1173         if (is_endian.little)
1174 #ifdef BSWAP4
1175                 ctr = BSWAP4(ctx->Yi.d[3]);
1176 #else
1177                 ctr = GETU32(ctx->Yi.c+12);
1178 #endif
1179         else
1180                 ctr = ctx->Yi.d[3];
1181
1182         n = ctx->mres;
1183 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1184         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1185                 if (n) {
1186                         while (n && len) {
1187                                 u8 c = *(in++);
1188                                 *(out++) = c^ctx->EKi.c[n];
1189                                 ctx->Xi.c[n] ^= c;
1190                                 --len;
1191                                 n = (n+1)%16;
1192                         }
1193                         if (n==0) GCM_MUL (ctx,Xi);
1194                         else {
1195                                 ctx->mres = n;
1196                                 return 0;
1197                         }
1198                 }
1199 #if defined(STRICT_ALIGNMENT)
1200                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1201                         break;
1202 #endif
1203 #if defined(GHASH) && defined(GHASH_CHUNK)
1204                 while (len>=GHASH_CHUNK) {
1205                     size_t j=GHASH_CHUNK;
1206
1207                     GHASH(ctx,in,GHASH_CHUNK);
1208                     while (j) {
1209                         size_t *out_t=(size_t *)out;
1210                         const size_t *in_t=(const size_t *)in;
1211
1212                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1213                         ++ctr;
1214                         if (is_endian.little)
1215 #ifdef BSWAP4
1216                                 ctx->Yi.d[3] = BSWAP4(ctr);
1217 #else
1218                                 PUTU32(ctx->Yi.c+12,ctr);
1219 #endif
1220                         else
1221                                 ctx->Yi.d[3] = ctr;
1222                         for (i=0; i<16/sizeof(size_t); ++i)
1223                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1224                         out += 16;
1225                         in  += 16;
1226                         j   -= 16;
1227                     }
1228                     len -= GHASH_CHUNK;
1229                 }
1230                 if ((i = (len&(size_t)-16))) {
1231                     GHASH(ctx,in,i);
1232                     while (len>=16) {
1233                         size_t *out_t=(size_t *)out;
1234                         const size_t *in_t=(const size_t *)in;
1235
1236                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1237                         ++ctr;
1238                         if (is_endian.little)
1239 #ifdef BSWAP4
1240                                 ctx->Yi.d[3] = BSWAP4(ctr);
1241 #else
1242                                 PUTU32(ctx->Yi.c+12,ctr);
1243 #endif
1244                         else
1245                                 ctx->Yi.d[3] = ctr;
1246                         for (i=0; i<16/sizeof(size_t); ++i)
1247                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1248                         out += 16;
1249                         in  += 16;
1250                         len -= 16;
1251                     }
1252                 }
1253 #else
1254                 while (len>=16) {
1255                         size_t *out_t=(size_t *)out;
1256                         const size_t *in_t=(const size_t *)in;
1257
1258                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1259                         ++ctr;
1260                         if (is_endian.little)
1261 #ifdef BSWAP4
1262                                 ctx->Yi.d[3] = BSWAP4(ctr);
1263 #else
1264                                 PUTU32(ctx->Yi.c+12,ctr);
1265 #endif
1266                         else
1267                                 ctx->Yi.d[3] = ctr;
1268                         for (i=0; i<16/sizeof(size_t); ++i) {
1269                                 size_t c = in[i];
1270                                 out[i] = c^ctx->EKi.t[i];
1271                                 ctx->Xi.t[i] ^= c;
1272                         }
1273                         GCM_MUL(ctx,Xi);
1274                         out += 16;
1275                         in  += 16;
1276                         len -= 16;
1277                 }
1278 #endif
1279                 if (len) {
1280                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1281                         ++ctr;
1282                         if (is_endian.little)
1283 #ifdef BSWAP4
1284                                 ctx->Yi.d[3] = BSWAP4(ctr);
1285 #else
1286                                 PUTU32(ctx->Yi.c+12,ctr);
1287 #endif
1288                         else
1289                                 ctx->Yi.d[3] = ctr;
1290                         while (len--) {
1291                                 u8 c = in[n];
1292                                 ctx->Xi.c[n] ^= c;
1293                                 out[n] = c^ctx->EKi.c[n];
1294                                 ++n;
1295                         }
1296                 }
1297
1298                 ctx->mres = n;
1299                 return 0;
1300         } while(0);
1301 #endif
1302         for (i=0;i<len;++i) {
1303                 u8 c;
1304                 if (n==0) {
1305                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1306                         ++ctr;
1307                         if (is_endian.little)
1308 #ifdef BSWAP4
1309                                 ctx->Yi.d[3] = BSWAP4(ctr);
1310 #else
1311                                 PUTU32(ctx->Yi.c+12,ctr);
1312 #endif
1313                         else
1314                                 ctx->Yi.d[3] = ctr;
1315                 }
1316                 c = in[i];
1317                 out[i] = c^ctx->EKi.c[n];
1318                 ctx->Xi.c[n] ^= c;
1319                 n = (n+1)%16;
1320                 if (n==0)
1321                         GCM_MUL(ctx,Xi);
1322         }
1323
1324         ctx->mres = n;
1325         return 0;
1326 }
1327
/*
 * CRYPTO_gcm128_encrypt_ctr32: as CRYPTO_gcm128_encrypt, but bulk
 * keystream generation and XOR are delegated to a caller-supplied
 * ctr128_f routine that natively handles the 32-bit big-endian counter
 * (typically an assembler AES-CTR module); trailing bytes fall back to
 * ctx->block.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len, ctr128_f stream)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64   mlen = ctx->len.u[1];
        void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to encrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        /* load the 32-bit counter in host byte order */
        if (is_endian.little)
#ifdef BSWAP4
                ctr = BSWAP4(ctx->Yi.d[3]);
#else
                ctr = GETU32(ctx->Yi.c+12);
#endif
        else
                ctr = ctx->Yi.d[3];

        /* consume the keystream block left over from a previous call */
        n = ctx->mres;
        if (n) {
                while (n && len) {
                        ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL(ctx,Xi);
                else {
                        ctx->mres = n;
                        return 0;
                }
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        /* stream-encrypt then hash the output, in large chunks */
        while (len>=GHASH_CHUNK) {
                (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                GHASH(ctx,out,GHASH_CHUNK);
                out += GHASH_CHUNK;
                in  += GHASH_CHUNK;
                len -= GHASH_CHUNK;
        }
#endif
        /* remaining whole blocks */
        if ((i = (len&(size_t)-16))) {
                size_t j=i/16;

                (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                in  += i;
                len -= i;
#if defined(GHASH)
                GHASH(ctx,out,i);
                out += i;
#else
                while (j--) {
                        for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
                        GCM_MUL(ctx,Xi);
                        out += 16;
                }
#endif
        }
        /* trailing partial block: generate one keystream block ourselves */
        if (len) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
                        ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                        ++n;
                }
        }

        ctx->mres = n;
        return 0;
}
1442
/*
 * CRYPTO_gcm128_decrypt_ctr32: as CRYPTO_gcm128_decrypt, but bulk
 * keystream generation and XOR are delegated to a caller-supplied
 * ctr128_f routine that natively handles the 32-bit big-endian counter;
 * the ciphertext is hashed before it is decrypted.  Trailing bytes fall
 * back to ctx->block.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len,ctr128_f stream)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
        u64   mlen = ctx->len.u[1];
        void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
# ifdef GHASH
        void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)       = ctx->ghash;
# endif
#endif

        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
                return -1;
        ctx->len.u[1] = mlen;

        if (ctx->ares) {
                /* First call to decrypt finalizes GHASH(AAD) */
                GCM_MUL(ctx,Xi);
                ctx->ares = 0;
        }

        /* load the 32-bit counter in host byte order */
        if (is_endian.little)
#ifdef BSWAP4
                ctr = BSWAP4(ctx->Yi.d[3]);
#else
                ctr = GETU32(ctx->Yi.c+12);
#endif
        else
                ctr = ctx->Yi.d[3];

        /* consume the keystream block left over from a previous call */
        n = ctx->mres;
        if (n) {
                while (n && len) {
                        u8 c = *(in++);
                        *(out++) = c^ctx->EKi.c[n];
                        ctx->Xi.c[n] ^= c;
                        --len;
                        n = (n+1)%16;
                }
                if (n==0) GCM_MUL (ctx,Xi);
                else {
                        ctx->mres = n;
                        return 0;
                }
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        /* hash the ciphertext, then stream-decrypt, in large chunks */
        while (len>=GHASH_CHUNK) {
                GHASH(ctx,in,GHASH_CHUNK);
                (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                out += GHASH_CHUNK;
                in  += GHASH_CHUNK;
                len -= GHASH_CHUNK;
        }
#endif
        /* remaining whole blocks */
        if ((i = (len&(size_t)-16))) {
                size_t j=i/16;

#if defined(GHASH)
                GHASH(ctx,in,i);
#else
                while (j--) {
                        size_t k;
                        for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
                        GCM_MUL(ctx,Xi);
                        in += 16;
                }
                /* rewind: the stream call below needs the original in/j */
                j   = i/16;
                in -= i;
#endif
                (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                out += i;
                in  += i;
                len -= i;
        }
        /* trailing partial block: generate one keystream block ourselves */
        if (len) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
#ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#else
                        PUTU32(ctx->Yi.c+12,ctr);
#endif
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
                        u8 c = in[n];
                        ctx->Xi.c[n] ^= c;
                        out[n] = c^ctx->EKi.c[n];
                        ++n;
                }
        }

        ctx->mres = n;
        return 0;
}
1564
/*
 * CRYPTO_gcm128_finish: complete the GHASH with the 128-bit length block
 * and mask with EK0 to form the authentication tag in ctx->Xi.  When an
 * expected tag is supplied, returns 0 on match and non-zero otherwise;
 * returns -1 if tag is NULL or len exceeds the tag size.
 *
 * NOTE(review): memcmp is not constant-time; tag comparison may leak
 * timing.  Consider CRYPTO_memcmp if available in this tree -- confirm.
 */
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
                        size_t len)
{
        const union { long one; char little; } is_endian = {1};
        u64 alen = ctx->len.u[0]<<3;    /* AAD length in bits */
        u64 clen = ctx->len.u[1]<<3;    /* message length in bits */
#ifdef GCM_FUNCREF_4BIT
        void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
#endif

        /* flush any buffered partial AAD or message block */
        if (ctx->mres || ctx->ares)
                GCM_MUL(ctx,Xi);

        /* the length block is hashed big-endian */
        if (is_endian.little) {
#ifdef BSWAP8
                alen = BSWAP8(alen);
                clen = BSWAP8(clen);
#else
                u8 *p = ctx->len.c;

                ctx->len.u[0] = alen;
                ctx->len.u[1] = clen;

                alen = (u64)GETU32(p)  <<32|GETU32(p+4);
                clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
        }

        /* hash in len(AAD) || len(C) */
        ctx->Xi.u[0] ^= alen;
        ctx->Xi.u[1] ^= clen;
        GCM_MUL(ctx,Xi);

        /* T = GHASH xor E(K, Y0) */
        ctx->Xi.u[0] ^= ctx->EK0.u[0];
        ctx->Xi.u[1] ^= ctx->EK0.u[1];

        if (tag && len<=sizeof(ctx->Xi))
                return memcmp(ctx->Xi.c,tag,len);
        else
                return -1;
}
1605
1606 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1607 {
1608         CRYPTO_gcm128_finish(ctx, NULL, 0);
1609         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1610 }
1611
1612 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1613 {
1614         GCM128_CONTEXT *ret;
1615
1616         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1617                 CRYPTO_gcm128_init(ret,key,block);
1618
1619         return ret;
1620 }
1621
1622 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1623 {
1624         if (ctx) {
1625                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1626                 OPENSSL_free(ctx);
1627         }
1628 }
1629
1630 #if defined(SELFTEST)
1631 #include <stdio.h>
1632 #include <openssl/aes.h>
1633
/*
 * Known-answer test vectors for the self-test below.  Cases 1-18
 * appear to follow the standard GCM specification test cases
 * (128/192/256-bit keys; 96-bit, 64-bit and long IVs) -- TODO confirm
 * against the published GCM spec / NIST SP 800-38D vectors.  Cases 19
 * and 20 are additional: 19 is AAD-only (no plaintext), 20 uses an
 * oversized 64-byte IV and a long plaintext.
 *
 * Naming: K=key, P=plaintext, A=additional authenticated data,
 * IV=initialization vector, C=expected ciphertext, T=expected tag.
 * NULL pointers / #define aliases let later cases reuse earlier data.
 */
/* Test Case 1 */
static const u8 K1[16],
                *P1=NULL,
                *A1=NULL,
                IV1[12],
                *C1=NULL,
                T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8 P2[16],
                C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
                T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                        0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                        0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                        0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
                T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                        0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                        0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                        0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
                T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
                        0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
                        0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
                        0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
                T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
                        0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
                        0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
                        0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
                T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 (192-bit key) */
static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
                *C7=NULL,
                T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
                C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
                T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 (256-bit key) */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15 */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};

/* Test Case 19 (AAD-only: no plaintext, AAD is P3||C15) */
#define K19 K1
#define P19 P1
#define IV19 IV1
#define C19 C1
static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
                        0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};

/* Test Case 20 (oversized 64-byte IV, 288-byte all-zero plaintext) */
#define K20 K1
#define A20 A1
static const u8 IV20[64]={0xff,0xff,0xff,0xff}, /* this results in 0xff in counter LSB */
                P20[288],
                C20[]= {0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
                        0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
                        0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
                        0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
                        0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
                        0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
                        0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
                        0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
                        0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
                        0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
                        0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
                        0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
                        0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
                        0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
                        0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
                        0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
                        0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
                        0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c},
                T20[]= {0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f};
1885
/*
 * Run known-answer test |n| in both directions:
 *   - encrypt P##n, then verify the tag (finish) and the ciphertext
 *     against C##n;
 *   - decrypt C##n, then verify the tag and recovered plaintext
 *     against P##n.
 * NULL P/A/C pointers mean the corresponding phase is skipped.
 * Uses |ctx|, |key| and |ret| from the enclosing scope; each failure
 * increments |ret| and prints a diagnostic.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1905
1906 int main()
1907 {
1908         GCM128_CONTEXT ctx;
1909         AES_KEY key;
1910         int ret=0;
1911
1912         TEST_CASE(1);
1913         TEST_CASE(2);
1914         TEST_CASE(3);
1915         TEST_CASE(4);
1916         TEST_CASE(5);
1917         TEST_CASE(6);
1918         TEST_CASE(7);
1919         TEST_CASE(8);
1920         TEST_CASE(9);
1921         TEST_CASE(10);
1922         TEST_CASE(11);
1923         TEST_CASE(12);
1924         TEST_CASE(13);
1925         TEST_CASE(14);
1926         TEST_CASE(15);
1927         TEST_CASE(16);
1928         TEST_CASE(17);
1929         TEST_CASE(18);
1930         TEST_CASE(19);
1931         TEST_CASE(20);
1932
1933 #ifdef OPENSSL_CPUID_OBJ
1934         {
1935         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1936         union { u64 u; u8 c[1024]; } buf;
1937         int i;
1938
1939         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1940         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1941         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1942
1943         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1944         start = OPENSSL_rdtsc();
1945         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1946         gcm_t = OPENSSL_rdtsc() - start;
1947
1948         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1949                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1950                         (block128_f)AES_encrypt);
1951         start = OPENSSL_rdtsc();
1952         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1953                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1954                         (block128_f)AES_encrypt);
1955         ctr_t = OPENSSL_rdtsc() - start;
1956
1957         printf("%.2f-%.2f=%.2f\n",
1958                         gcm_t/(double)sizeof(buf),
1959                         ctr_t/(double)sizeof(buf),
1960                         (gcm_t-ctr_t)/(double)sizeof(buf));
1961 #ifdef GHASH
1962         {
1963         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1964                                 const u8 *inp,size_t len)       = ctx.ghash;
1965
1966         GHASH((&ctx),buf.c,sizeof(buf));
1967         start = OPENSSL_rdtsc();
1968         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1969         gcm_t = OPENSSL_rdtsc() - start;
1970         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1971         }
1972 #endif
1973         }
1974 #endif
1975
1976         return ret;
1977 }
1978 #endif