8f5ce988585f13ccab0c5f3498db69547e9f58e0
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * Multiply the 128-bit value Xi (stored in big-endian byte order) by the
 * hash subkey via the 256-entry table from gcm_init_8bit, writing the
 * product back into Xi.  Xi is consumed one byte at a time starting from
 * its least significant byte (index 15); between bytes the accumulator Z
 * is shifted right by 8 bits and the bits shifted out are folded back in
 * through the precomputed rem_8bit reduction table.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from byte 15 down to byte 0 */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        __fips_constseg
        /* rem_8bit[r]: reduction constant for the 8 bits shifted out of Z.lo,
         * pre-positioned in the top 16 bits of a size_t by PACK() */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                /* accumulate the table entry for the current byte */
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* byte 0 just processed: done */

                n = *(--xi);

                /* shift Z right 8 bits, then reduce the ejected byte via rem_8bit */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        /* 32-bit size_t: PACK() placed the constant in the top 16
                         * bits of 32, so shift it into the top half of the u64 */
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* write Z back to Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * Build the 16-entry ("4-bit Shoup") lookup table for hash subkey H.
 * Htable[0] is zero and Htable[8] is H; each smaller power-of-two slot
 * is the previous one advanced by one REDUCE1BIT step, and the remaining
 * slots are XOR combinations of the power-of-two entries selected by the
 * bits of the index.  The small-footprint build uses loops; the default
 * build unrolls the same computation.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* power-of-two slots: Htable[8]=H, then keep halving */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        /* remaining slots: Hi[j] = Htable[i] ^ Htable[j] for j < i */
        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* unrolled version of the loops above */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable:
         * swap the halves on little-endian, swap 32-bit words within
         * each half otherwise.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
328
329 #ifndef GHASH_ASM
__fips_constseg
/*
 * rem_4bit[r]: reduction constant for the 4 bits shifted out of the low
 * end of the GHASH accumulator during a 4-bit right shift, pre-positioned
 * in the top 16 bits of a size_t by PACK() so it XORs straight into Z.hi.
 */
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
/*
 * Multiply Xi (128-bit value in big-endian byte order) by the hash
 * subkey using the 16-entry table from gcm_init_4bit, writing the
 * product back into Xi.  Each input byte contributes two 4-bit table
 * lookups (low nibble then high nibble); after each lookup the
 * accumulator Z is shifted right 4 bits and reduced via rem_4bit.
 * Bytes are processed from index 15 down to 0.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        /* split the last byte of Xi into its two nibbles */
        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits, fold the ejected nibble back via rem_4bit */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        /* 32-bit size_t: PACK() constant sits in bits 16..31,
                         * move it to the top half of the u64 */
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* all 16 bytes consumed */

                /* next byte, low nibble first */
                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* write Z back to Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
/*
 * Streamed GHASH over len bytes of inp (len must be a positive multiple
 * of 16): for each 16-byte block, XOR it into Xi and multiply by the
 * hash subkey via the 4-bit table, leaving the running hash in Xi.
 * The live path (#if 1) is the same per-block algorithm as
 * gcm_gmult_4bit with the XOR of the input block fused into the nibble
 * extraction; the disabled #else path is a reference sketch of an
 * "8-bit per iteration" variant using extra shifted tables.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        /* nibbles of Xi[15] ^ inp[15] seed the accumulator */
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift right 4 bits and reduce the ejected nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* 16 bytes of this block done */

                /* next byte of Xi ^ inp, low nibble first */
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    __fips_constseg
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* store the block's result back into Xi, big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);         /* caller guarantees len%16==0, len>0 */
}
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * trashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
/*
 * Table-free, bit-serial GF(2^128) multiply: Xi (big-endian byte order)
 * is multiplied by H and the product written back into Xi.  Xi is read
 * one machine word at a time, converted to host order; for each bit of
 * Xi (MSB first) the running copy of H is conditionally accumulated into
 * Z (branch-free, via an all-ones/all-zeros mask) and then advanced by
 * one REDUCE1BIT step.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
        u128 V,Z = { 0,0 };
        long X;
        int  i,j;
        const long *xi = (const long *)Xi;
        const union { long one; char little; } is_endian = {1};

        V.hi = H[0];    /* H is in host byte order, no byte swapping */
        V.lo = H[1];

        for (j=0; j<16/sizeof(long); ++j) {
                /* load next word of Xi, byte-swapped to host order if needed */
                if (is_endian.little) {
                        if (sizeof(long)==8) {
#ifdef BSWAP8
                                X = (long)(BSWAP8(xi[j]));
#else
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
                        }
                        else {
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)GETU32(p);
                        }
                }
                else
                        X = xi[j];

                for (i=0; i<8*sizeof(long); ++i, X<<=1) {
                        /* M is all-ones when the current top bit of X is set,
                         * all-zeros otherwise (arithmetic shift of sign bit) */
                        u64 M = (u64)(X>>(8*sizeof(long)-1));
                        Z.hi ^= V.hi&M;
                        Z.lo ^= V.lo&M;

                        REDUCE1BIT(V);
                }
        }

        /* write Z back to Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && defined(GHASH_ASM)
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 #   define GHASH_ASM_X86
663 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665
666 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
667 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668 #  endif
669 # elif defined(__arm__) || defined(__arm)
670 #  include "arm_arch.h"
671 #  if __ARM_ARCH__>=7
672 #   define GHASH_ASM_ARM
673 #   define GCM_FUNCREF_4BIT
674 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
675 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 #  endif
677 # elif defined(__sparc__) || defined(__sparc)
678 #  include "sparc_arch.h"
679 #  define GHASH_ASM_SPARC
680 #  define GCM_FUNCREF_4BIT
681 extern unsigned int OPENSSL_sparcv9cap_P[];
682 void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
683 void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
684 void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
685 # endif
686 #endif
687
688 #ifdef GCM_FUNCREF_4BIT
689 # undef  GCM_MUL
690 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
691 # ifdef GHASH
692 #  undef  GHASH
693 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
694 # endif
695 #endif
696
/*
 * Initialise a GCM context: zero the whole structure, record the block
 * cipher and key, derive the hash subkey H by encrypting the all-zero
 * block (converted to host byte order on little-endian machines), then
 * build the GHASH table and select the fastest gmult/ghash routines
 * available for this build configuration and CPU.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E_K(0^128): ctx->H.c is all-zero after the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                /* carry-less multiplication available: use CLMUL routines */
                gcm_init_clmul(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_clmul;
                ctx->ghash = gcm_ghash_clmul;
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_SPARC)
        if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
                gcm_init_vis3(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_vis3;
                ctx->ghash = gcm_ghash_vis3;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
776
/*
 * Set the IV for a new message and reset all per-message state
 * (GHASH accumulator Xi, counter block Yi, AAD/message lengths,
 * partial-block residues).  Per NIST SP 800-38D: a 96-bit IV is used
 * directly as Y0 with the 32-bit counter set to 1; any other IV
 * length is compressed with GHASH(IV || 0-pad || [len(IV)]_64).
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1, no GHASH required */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* absorb full 16-byte IV blocks into Yi via GHASH */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		/* final partial block is implicitly zero-padded */
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits, per the spec */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, Y0) is retained for the final tag computation */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
846
/*
 * Feed additional authenticated data (AAD) into GHASH.  All AAD must be
 * supplied before the first encrypt/decrypt call for the same message.
 * May be called multiple times; a partially filled block is carried in
 * ctx->ares and finalized by the first encrypt/decrypt call.
 * Returns 0 on success, -1 if the 2^61-byte AAD limit is exceeded,
 * -2 if message data has already been processed.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	alen += len;
	/* second clause catches 64-bit wrap-around of alen */
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* top up a partial block left over from a previous call */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* hash as many whole 16-byte blocks as possible in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* stash residual bytes in Xi; multiplied in when the block fills */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
903
904 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
905                 const unsigned char *in, unsigned char *out,
906                 size_t len)
907 {
908         const union { long one; char little; } is_endian = {1};
909         unsigned int n, ctr;
910         size_t i;
911         u64        mlen  = ctx->len.u[1];
912         block128_f block = ctx->block;
913         void      *key   = ctx->key;
914 #ifdef GCM_FUNCREF_4BIT
915         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
916 # ifdef GHASH
917         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
918                                 const u8 *inp,size_t len)       = ctx->ghash;
919 # endif
920 #endif
921
922 #if 0
923         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
924 #endif
925         mlen += len;
926         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
927                 return -1;
928         ctx->len.u[1] = mlen;
929
930         if (ctx->ares) {
931                 /* First call to encrypt finalizes GHASH(AAD) */
932                 GCM_MUL(ctx,Xi);
933                 ctx->ares = 0;
934         }
935
936         if (is_endian.little)
937                 ctr = GETU32(ctx->Yi.c+12);
938         else
939                 ctr = ctx->Yi.d[3];
940
941         n = ctx->mres;
942 #if !defined(OPENSSL_SMALL_FOOTPRINT)
943         if (16%sizeof(size_t) == 0) do {        /* always true actually */
944                 if (n) {
945                         while (n && len) {
946                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
947                                 --len;
948                                 n = (n+1)%16;
949                         }
950                         if (n==0) GCM_MUL(ctx,Xi);
951                         else {
952                                 ctx->mres = n;
953                                 return 0;
954                         }
955                 }
956 #if defined(STRICT_ALIGNMENT)
957                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
958                         break;
959 #endif
960 #if defined(GHASH) && defined(GHASH_CHUNK)
961                 while (len>=GHASH_CHUNK) {
962                     size_t j=GHASH_CHUNK;
963
964                     while (j) {
965                         size_t *out_t=(size_t *)out, *ivec_t=(size_t *)ivec;
966                         const size_t *in_t=(const size_t *)in;
967                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
968                         ++ctr;
969                         if (is_endian.little)
970                                 PUTU32(ctx->Yi.c+12,ctr);
971                         else
972                                 ctx->Yi.d[3] = ctr;
973                         for (i=0; i<16/sizeof(size_t); ++i)
974                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
975                         out += 16;
976                         in  += 16;
977                         j   -= 16;
978                     }
979                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
980                     len -= GHASH_CHUNK;
981                 }
982                 if ((i = (len&(size_t)-16))) {
983                     size_t j=i;
984
985                     while (len>=16) {
986                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
987                         ++ctr;
988                         if (is_endian.little)
989                                 PUTU32(ctx->Yi.c+12,ctr);
990                         else
991                                 ctx->Yi.d[3] = ctr;
992                         for (i=0; i<16; i+=sizeof(size_t))
993                                 *(size_t *)(out+i) =
994                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
995                         out += 16;
996                         in  += 16;
997                         len -= 16;
998                     }
999                     GHASH(ctx,out-j,j);
1000                 }
1001 #else
1002                 while (len>=16) {
1003                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1004                         ++ctr;
1005                         if (is_endian.little)
1006                                 PUTU32(ctx->Yi.c+12,ctr);
1007                         else
1008                                 ctx->Yi.d[3] = ctr;
1009                         for (i=0; i<16; i+=sizeof(size_t))
1010                                 *(size_t *)(ctx->Xi.c+i) ^=
1011                                 *(size_t *)(out+i) =
1012                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1013                         GCM_MUL(ctx,Xi);
1014                         out += 16;
1015                         in  += 16;
1016                         len -= 16;
1017                 }
1018 #endif
1019                 if (len) {
1020                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1021                         ++ctr;
1022                         if (is_endian.little)
1023                                 PUTU32(ctx->Yi.c+12,ctr);
1024                         else
1025                                 ctx->Yi.d[3] = ctr;
1026                         while (len--) {
1027                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1028                                 ++n;
1029                         }
1030                 }
1031
1032                 ctx->mres = n;
1033                 return 0;
1034         } while(0);
1035 #endif
1036         for (i=0;i<len;++i) {
1037                 if (n==0) {
1038                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1039                         ++ctr;
1040                         if (is_endian.little)
1041                                 PUTU32(ctx->Yi.c+12,ctr);
1042                         else
1043                                 ctx->Yi.d[3] = ctr;
1044                 }
1045                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1046                 n = (n+1)%16;
1047                 if (n==0)
1048                         GCM_MUL(ctx,Xi);
1049         }
1050
1051         ctx->mres = n;
1052         return 0;
1053 }
1054
/*
 * CTR-mode decrypt |len| bytes from |in| to |out|.  The ciphertext is
 * folded into the GHASH accumulator ctx->Xi BEFORE decryption (GCM
 * authenticates ciphertext, not plaintext).  May be called repeatedly
 * for one message; a partial key-stream block is carried in ctx->mres.
 * Returns 0 on success, -1 if the total message length would exceed
 * the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* second clause catches 64-bit wrap-around of mlen */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* consume key stream left over from previous call;
			 * note: hash ciphertext byte c before overwriting */
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-wise path requires aligned pointers on this target */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* hash the whole ciphertext chunk first, then decrypt */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block: key stream residue kept in mres */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* byte-at-a-time fallback (small footprint / unaligned input) */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1207
/*
 * Same contract as CRYPTO_gcm128_encrypt, but bulk CTR encryption is
 * delegated to the caller-supplied |stream| routine (a ctr128_f that
 * processes whole 16-byte blocks); only the tail partial block falls
 * back to ctx->block.  Returns 0 on success, -1 if the total message
 * length would exceed the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* second clause catches 64-bit wrap-around of mlen */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* consume key stream left over from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* stream-encrypt a whole chunk, then hash the ciphertext */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* tail partial block: single block call, residue kept in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1306
/*
 * Same contract as CRYPTO_gcm128_decrypt, but bulk CTR decryption is
 * delegated to the caller-supplied |stream| routine.  Ciphertext is
 * hashed into ctx->Xi before it is decrypted.  Returns 0 on success,
 * -1 if the total message length would exceed the GCM limit of
 * 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* second clause catches 64-bit wrap-around of mlen */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* consume key stream left over from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* hash the ciphertext chunk first, then stream-decrypt it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* restore j and in for the stream call below */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* tail partial block: single block call, residue kept in mres */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1412
1413 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1414                         size_t len)
1415 {
1416         const union { long one; char little; } is_endian = {1};
1417         u64 alen = ctx->len.u[0]<<3;
1418         u64 clen = ctx->len.u[1]<<3;
1419 #ifdef GCM_FUNCREF_4BIT
1420         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1421 #endif
1422
1423         if (ctx->mres || ctx->ares)
1424                 GCM_MUL(ctx,Xi);
1425
1426         if (is_endian.little) {
1427 #ifdef BSWAP8
1428                 alen = BSWAP8(alen);
1429                 clen = BSWAP8(clen);
1430 #else
1431                 u8 *p = ctx->len.c;
1432
1433                 ctx->len.u[0] = alen;
1434                 ctx->len.u[1] = clen;
1435
1436                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1437                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1438 #endif
1439         }
1440
1441         ctx->Xi.u[0] ^= alen;
1442         ctx->Xi.u[1] ^= clen;
1443         GCM_MUL(ctx,Xi);
1444
1445         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1446         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1447
1448         if (tag && len<=sizeof(ctx->Xi))
1449                 return memcmp(ctx->Xi.c,tag,len);
1450         else
1451                 return -1;
1452 }
1453
1454 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1455 {
1456         CRYPTO_gcm128_finish(ctx, NULL, 0);
1457         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1458 }
1459
1460 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1461 {
1462         GCM128_CONTEXT *ret;
1463
1464         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1465                 CRYPTO_gcm128_init(ret,key,block);
1466
1467         return ret;
1468 }
1469
1470 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1471 {
1472         if (ctx) {
1473                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1474                 OPENSSL_free(ctx);
1475         }
1476 }
1477
1478 #if defined(SELFTEST)
1479 #include <stdio.h>
1480 #include <openssl/aes.h>
1481
/*
 * Known-answer test vectors.  Presumably the standard AES-GCM test
 * cases from McGrew & Viega, "The Galois/Counter Mode of Operation
 * (GCM)" -- TODO confirm against that document.  Kn = key, Pn =
 * plaintext, An = AAD, IVn = IV, Cn = expected ciphertext, Tn =
 * expected tag for test case n; #defines alias inputs reused across
 * cases.
 */
/* Test Case 1 */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};

/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};

/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9 */
#define A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1582
1583 /* Test Case 10 */
1584 #define K10 K9
1585 #define IV10 IV9
1586 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1587                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1588                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1589                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1590                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1591                         0xab,0xad,0xda,0xd2},
1592                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1593                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1594                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1595                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1596                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1597
1598 /* Test Case 11 */
1599 #define K11 K10
1600 #define P11 P10
1601 #define A11 A10
1602 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1603                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1604                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1605                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1606                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1607                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1608
1609 /* Test Case 12 */
1610 #define K12 K11
1611 #define P12 P11
1612 #define A12 A11
1613 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1614                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1615                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1616                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1617                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1618                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1619                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1620                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1621                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1622
/*
 * Test Case 13: AES-256 (32-byte all-zero key), empty plaintext and
 * AAD (NULL pointers), 12-byte all-zero IV — only the authentication
 * tag is verified.
 */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/*
 * Test Case 14: as Test Case 13 but with one all-zero 16-byte
 * plaintext block.
 */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/*
 * Test Case 15: AES-256, 64-byte plaintext, standard 12-byte IV,
 * no AAD (A15 is A14, i.e. NULL).
 */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/*
 * Test Case 16: AES-256, 60-byte plaintext plus 20-byte AAD,
 * standard 12-byte IV (shared with Test Case 15).
 */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/*
 * Test Case 17: as Test Case 16 but with an 8-byte (64-bit) IV.
 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/*
 * Test Case 18: as Test Case 16 but with a 60-byte IV.
 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1693
/*
 * Expand one round-trip self-test for vector set |n| (K<n>, IV<n>,
 * A<n>, P<n>, C<n>, T<n>).  First P<n> is encrypted and the output is
 * compared against C<n>, with the tag checked via
 * CRYPTO_gcm128_finish() against T<n>; then C<n> is decrypted and the
 * output compared against P<n>, with the tag checked again.  Vector
 * pointers that are NULL (empty plaintext/AAD/ciphertext cases) are
 * skipped by the `if (A##n)` style guards.  Each failing direction
 * increments |ret|.  Relies on |ctx|, |key| and |ret| being declared
 * at the expansion site.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1713
1714 int main()
1715 {
1716         GCM128_CONTEXT ctx;
1717         AES_KEY key;
1718         int ret=0;
1719
1720         TEST_CASE(1);
1721         TEST_CASE(2);
1722         TEST_CASE(3);
1723         TEST_CASE(4);
1724         TEST_CASE(5);
1725         TEST_CASE(6);
1726         TEST_CASE(7);
1727         TEST_CASE(8);
1728         TEST_CASE(9);
1729         TEST_CASE(10);
1730         TEST_CASE(11);
1731         TEST_CASE(12);
1732         TEST_CASE(13);
1733         TEST_CASE(14);
1734         TEST_CASE(15);
1735         TEST_CASE(16);
1736         TEST_CASE(17);
1737         TEST_CASE(18);
1738
1739 #ifdef OPENSSL_CPUID_OBJ
1740         {
1741         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1742         union { u64 u; u8 c[1024]; } buf;
1743         int i;
1744
1745         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1746         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1747         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1748
1749         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1750         start = OPENSSL_rdtsc();
1751         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1752         gcm_t = OPENSSL_rdtsc() - start;
1753
1754         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1755                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1756                         (block128_f)AES_encrypt);
1757         start = OPENSSL_rdtsc();
1758         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1759                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1760                         (block128_f)AES_encrypt);
1761         ctr_t = OPENSSL_rdtsc() - start;
1762
1763         printf("%.2f-%.2f=%.2f\n",
1764                         gcm_t/(double)sizeof(buf),
1765                         ctr_t/(double)sizeof(buf),
1766                         (gcm_t-ctr_t)/(double)sizeof(buf));
1767 #ifdef GHASH
1768         {
1769         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1770                                 const u8 *inp,size_t len)       = ctx.ghash;
1771
1772         GHASH((&ctx),buf.c,sizeof(buf));
1773         start = OPENSSL_rdtsc();
1774         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1775         gcm_t = OPENSSL_rdtsc() - start;
1776         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1777         }
1778 #endif
1779         }
1780 #endif
1781
1782         return ret;
1783 }
1784 #endif