Allow for dynamic base in Win64 FIPS module.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): shift the 128-bit value V (members .hi and .lo) right
 * by one bit and, when the bit shifted out of V.lo was set, XOR 0xe1
 * into the most significant byte of V.hi.
 *
 * Both branches compute the same result; the second one builds the
 * constant from a 32-bit temporary and is taken when size_t is not
 * 8 bytes wide.
 *
 * V is evaluated several times, so it must be a side-effect-free
 * lvalue. Every use is parenthesized so that arguments such as *p
 * expand correctly.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that subsequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         __fips_constseg
152         static const size_t rem_8bit[256] = {
153                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
154                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
155                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
156                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
157                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
158                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
159                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
160                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
161                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
162                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
163                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
164                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
165                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
166                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
167                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
168                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
169                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
170                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
171                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
172                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
173                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
174                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
175                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
176                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
177                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
178                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
179                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
180                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
181                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
182                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
183                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
184                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
185                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
186                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
187                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
188                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
189                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
190                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
191                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
192                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
193                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
194                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
195                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
196                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
197                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
198                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
199                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
200                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
201                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
202                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
203                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
204                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
205                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
206                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
207                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
208                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
209                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
210                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
211                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
212                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
213                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
214                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
215                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
216                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
217
218         while (1) {
219                 Z.hi ^= Htable[n].hi;
220                 Z.lo ^= Htable[n].lo;
221
222                 if ((u8 *)Xi==xi)       break;
223
224                 n = *(--xi);
225
226                 rem  = (size_t)Z.lo&0xff;
227                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
228                 Z.hi = (Z.hi>>8);
229                 if (sizeof(size_t)==8)
230                         Z.hi ^= rem_8bit[rem];
231                 else
232                         Z.hi ^= (u64)rem_8bit[rem]<<32;
233         }
234
235         if (is_endian.little) {
236 #ifdef BSWAP8
237                 Xi[0] = BSWAP8(Z.hi);
238                 Xi[1] = BSWAP8(Z.lo);
239 #else
240                 u8 *p = (u8 *)Xi;
241                 u32 v;
242                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
243                 v = (u32)(Z.hi);        PUTU32(p+4,v);
244                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
245                 v = (u32)(Z.lo);        PUTU32(p+12,v);
246 #endif
247         }
248         else {
249                 Xi[0] = Z.hi;
250                 Xi[1] = Z.lo;
251         }
252 }
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
257 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
258 {
259         u128 V;
260 #if defined(OPENSSL_SMALL_FOOTPRINT)
261         int  i;
262 #endif
263
264         Htable[0].hi = 0;
265         Htable[0].lo = 0;
266         V.hi = H[0];
267         V.lo = H[1];
268
269 #if defined(OPENSSL_SMALL_FOOTPRINT)
270         for (Htable[8]=V, i=4; i>0; i>>=1) {
271                 REDUCE1BIT(V);
272                 Htable[i] = V;
273         }
274
275         for (i=2; i<16; i<<=1) {
276                 u128 *Hi = Htable+i;
277                 int   j;
278                 for (V=*Hi, j=1; j<i; ++j) {
279                         Hi[j].hi = V.hi^Htable[j].hi;
280                         Hi[j].lo = V.lo^Htable[j].lo;
281                 }
282         }
283 #else
284         Htable[8] = V;
285         REDUCE1BIT(V);
286         Htable[4] = V;
287         REDUCE1BIT(V);
288         Htable[2] = V;
289         REDUCE1BIT(V);
290         Htable[1] = V;
291         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
292         V=Htable[4];
293         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
294         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
295         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
296         V=Htable[8];
297         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
298         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
299         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
300         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
301         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
302         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
303         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
304 #endif
305 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
306         /*
307          * ARM assembler expects specific dword order in Htable.
308          */
309         {
310         int j;
311         const union { long one; char little; } is_endian = {1};
312
313         if (is_endian.little)
314                 for (j=0;j<16;++j) {
315                         V = Htable[j];
316                         Htable[j].hi = V.lo;
317                         Htable[j].lo = V.hi;
318                 }
319         else
320                 for (j=0;j<16;++j) {
321                         V = Htable[j];
322                         Htable[j].hi = V.lo<<32|V.lo>>32;
323                         Htable[j].lo = V.hi<<32|V.hi>>32;
324                 }
325         }
326 #endif
327 }
328
329 #ifndef GHASH_ASM
330 __fips_constseg
331 static const size_t rem_4bit[16] = {
332         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
333         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
334         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
335         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
338 {
339         u128 Z;
340         int cnt = 15;
341         size_t rem, nlo, nhi;
342         const union { long one; char little; } is_endian = {1};
343
344         nlo  = ((const u8 *)Xi)[15];
345         nhi  = nlo>>4;
346         nlo &= 0xf;
347
348         Z.hi = Htable[nlo].hi;
349         Z.lo = Htable[nlo].lo;
350
351         while (1) {
352                 rem  = (size_t)Z.lo&0xf;
353                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
354                 Z.hi = (Z.hi>>4);
355                 if (sizeof(size_t)==8)
356                         Z.hi ^= rem_4bit[rem];
357                 else
358                         Z.hi ^= (u64)rem_4bit[rem]<<32;
359
360                 Z.hi ^= Htable[nhi].hi;
361                 Z.lo ^= Htable[nhi].lo;
362
363                 if (--cnt<0)            break;
364
365                 nlo  = ((const u8 *)Xi)[cnt];
366                 nhi  = nlo>>4;
367                 nlo &= 0xf;
368
369                 rem  = (size_t)Z.lo&0xf;
370                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
371                 Z.hi = (Z.hi>>4);
372                 if (sizeof(size_t)==8)
373                         Z.hi ^= rem_4bit[rem];
374                 else
375                         Z.hi ^= (u64)rem_4bit[rem]<<32;
376
377                 Z.hi ^= Htable[nlo].hi;
378                 Z.lo ^= Htable[nlo].lo;
379         }
380
381         if (is_endian.little) {
382 #ifdef BSWAP8
383                 Xi[0] = BSWAP8(Z.hi);
384                 Xi[1] = BSWAP8(Z.lo);
385 #else
386                 u8 *p = (u8 *)Xi;
387                 u32 v;
388                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
389                 v = (u32)(Z.hi);        PUTU32(p+4,v);
390                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
391                 v = (u32)(Z.lo);        PUTU32(p+12,v);
392 #endif
393         }
394         else {
395                 Xi[0] = Z.hi;
396                 Xi[1] = Z.lo;
397         }
398 }
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
408 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
409                                 const u8 *inp,size_t len)
410 {
411     u128 Z;
412     int cnt;
413     size_t rem, nlo, nhi;
414     const union { long one; char little; } is_endian = {1};
415
416 #if 1
417     do {
418         cnt  = 15;
419         nlo  = ((const u8 *)Xi)[15];
420         nlo ^= inp[15];
421         nhi  = nlo>>4;
422         nlo &= 0xf;
423
424         Z.hi = Htable[nlo].hi;
425         Z.lo = Htable[nlo].lo;
426
427         while (1) {
428                 rem  = (size_t)Z.lo&0xf;
429                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
430                 Z.hi = (Z.hi>>4);
431                 if (sizeof(size_t)==8)
432                         Z.hi ^= rem_4bit[rem];
433                 else
434                         Z.hi ^= (u64)rem_4bit[rem]<<32;
435
436                 Z.hi ^= Htable[nhi].hi;
437                 Z.lo ^= Htable[nhi].lo;
438
439                 if (--cnt<0)            break;
440
441                 nlo  = ((const u8 *)Xi)[cnt];
442                 nlo ^= inp[cnt];
443                 nhi  = nlo>>4;
444                 nlo &= 0xf;
445
446                 rem  = (size_t)Z.lo&0xf;
447                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
448                 Z.hi = (Z.hi>>4);
449                 if (sizeof(size_t)==8)
450                         Z.hi ^= rem_4bit[rem];
451                 else
452                         Z.hi ^= (u64)rem_4bit[rem]<<32;
453
454                 Z.hi ^= Htable[nlo].hi;
455                 Z.lo ^= Htable[nlo].lo;
456         }
457 #else
458     /*
459      * Extra 256+16 bytes per-key plus 512 bytes shared tables
460      * [should] give ~50% improvement... One could have PACK()-ed
461      * the rem_8bit even here, but the priority is to minimize
462      * cache footprint...
463      */ 
464     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
465     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
466     __fips_constseg
467     static const unsigned short rem_8bit[256] = {
468         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
500     /*
501      * This pre-processing phase slows down procedure by approximately
502      * same time as it makes each loop spin faster. In other words
503      * single block performance is approximately same as straightforward
504      * "4-bit" implementation, and then it goes only faster...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
515         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516                 nlo  = ((const u8 *)Xi)[cnt];
517                 nlo ^= inp[cnt];
518                 nhi  = nlo>>4;
519                 nlo &= 0xf;
520
521                 Z.hi ^= Htable[nlo].hi;
522                 Z.lo ^= Htable[nlo].lo;
523
524                 rem = (size_t)Z.lo&0xff;
525
526                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
527                 Z.hi = (Z.hi>>8);
528
529                 Z.hi ^= Hshr4[nhi].hi;
530                 Z.lo ^= Hshr4[nhi].lo;
531                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532         }
533
534         nlo  = ((const u8 *)Xi)[0];
535         nlo ^= inp[0];
536         nhi  = nlo>>4;
537         nlo &= 0xf;
538
539         Z.hi ^= Htable[nlo].hi;
540         Z.lo ^= Htable[nlo].lo;
541
542         rem = (size_t)Z.lo&0xf;
543
544         Z.lo = (Z.hi<<60)|(Z.lo>>4);
545         Z.hi = (Z.hi>>4);
546
547         Z.hi ^= Htable[nhi].hi;
548         Z.lo ^= Htable[nhi].lo;
549         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
550 #endif
551
552         if (is_endian.little) {
553 #ifdef BSWAP8
554                 Xi[0] = BSWAP8(Z.hi);
555                 Xi[1] = BSWAP8(Z.lo);
556 #else
557                 u8 *p = (u8 *)Xi;
558                 u32 v;
559                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
560                 v = (u32)(Z.hi);        PUTU32(p+4,v);
561                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
562                 v = (u32)(Z.lo);        PUTU32(p+12,v);
563 #endif
564         }
565         else {
566                 Xi[0] = Z.hi;
567                 Xi[1] = Z.lo;
568         }
569     } while (inp+=16, len-=16);
570 }
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * thrashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && defined(GHASH_ASM)
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 #   define GHASH_ASM_X86
663 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665
666 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
667 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668 #  endif
669 # elif defined(__arm__) || defined(__arm)
670 #  include "arm_arch.h"
671 #  if __ARM_ARCH__>=7
672 #   define GHASH_ASM_ARM
673 #   define GCM_FUNCREF_4BIT
674 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
675 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 #  endif
677 # endif
678 #endif
679
680 #ifdef GCM_FUNCREF_4BIT
681 # undef  GCM_MUL
682 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
683 # ifdef GHASH
684 #  undef  GHASH
685 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
686 # endif
687 #endif
688
689 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
690 {
691         const union { long one; char little; } is_endian = {1};
692
693         memset(ctx,0,sizeof(*ctx));
694         ctx->block = block;
695         ctx->key   = key;
696
697         (*block)(ctx->H.c,ctx->H.c,key);
698
699         if (is_endian.little) {
700                 /* H is stored in host byte order */
701 #ifdef BSWAP8
702                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
703                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
704 #else
705                 u8 *p = ctx->H.c;
706                 u64 hi,lo;
707                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
708                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
709                 ctx->H.u[0] = hi;
710                 ctx->H.u[1] = lo;
711 #endif
712         }
713
714 #if     TABLE_BITS==8
715         gcm_init_8bit(ctx->Htable,ctx->H.u);
716 #elif   TABLE_BITS==4
717 # if    defined(GHASH_ASM_X86_OR_64)
718 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
719         if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
720             OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
721                 gcm_init_clmul(ctx->Htable,ctx->H.u);
722                 ctx->gmult = gcm_gmult_clmul;
723                 ctx->ghash = gcm_ghash_clmul;
724                 return;
725         }
726 #  endif
727         gcm_init_4bit(ctx->Htable,ctx->H.u);
728 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
729         if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
730                 ctx->gmult = gcm_gmult_4bit_mmx;
731                 ctx->ghash = gcm_ghash_4bit_mmx;
732         } else {
733                 ctx->gmult = gcm_gmult_4bit_x86;
734                 ctx->ghash = gcm_ghash_4bit_x86;
735         }
736 #  else
737         ctx->gmult = gcm_gmult_4bit;
738         ctx->ghash = gcm_ghash_4bit;
739 #  endif
740 # elif  defined(GHASH_ASM_ARM)
741         if (OPENSSL_armcap_P & ARMV7_NEON) {
742                 ctx->gmult = gcm_gmult_neon;
743                 ctx->ghash = gcm_ghash_neon;
744         } else {
745                 gcm_init_4bit(ctx->Htable,ctx->H.u);
746                 ctx->gmult = gcm_gmult_4bit;
747                 ctx->ghash = gcm_ghash_4bit;
748         }
749 # else
750         gcm_init_4bit(ctx->Htable,ctx->H.u);
751 # endif
752 #endif
753 }
754
755 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
756 {
757         const union { long one; char little; } is_endian = {1};
758         unsigned int ctr;
759 #ifdef GCM_FUNCREF_4BIT
760         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
761 #endif
762
763         ctx->Yi.u[0]  = 0;
764         ctx->Yi.u[1]  = 0;
765         ctx->Xi.u[0]  = 0;
766         ctx->Xi.u[1]  = 0;
767         ctx->len.u[0] = 0;      /* AAD length */
768         ctx->len.u[1] = 0;      /* message length */
769         ctx->ares = 0;
770         ctx->mres = 0;
771
772         if (len==12) {
773                 memcpy(ctx->Yi.c,iv,12);
774                 ctx->Yi.c[15]=1;
775                 ctr=1;
776         }
777         else {
778                 size_t i;
779                 u64 len0 = len;
780
781                 while (len>=16) {
782                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
783                         GCM_MUL(ctx,Yi);
784                         iv += 16;
785                         len -= 16;
786                 }
787                 if (len) {
788                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
789                         GCM_MUL(ctx,Yi);
790                 }
791                 len0 <<= 3;
792                 if (is_endian.little) {
793 #ifdef BSWAP8
794                         ctx->Yi.u[1]  ^= BSWAP8(len0);
795 #else
796                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
797                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
798                         ctx->Yi.c[10] ^= (u8)(len0>>40);
799                         ctx->Yi.c[11] ^= (u8)(len0>>32);
800                         ctx->Yi.c[12] ^= (u8)(len0>>24);
801                         ctx->Yi.c[13] ^= (u8)(len0>>16);
802                         ctx->Yi.c[14] ^= (u8)(len0>>8);
803                         ctx->Yi.c[15] ^= (u8)(len0);
804 #endif
805                 }
806                 else
807                         ctx->Yi.u[1]  ^= len0;
808
809                 GCM_MUL(ctx,Yi);
810
811                 if (is_endian.little)
812                         ctr = GETU32(ctx->Yi.c+12);
813                 else
814                         ctr = ctx->Yi.d[3];
815         }
816
817         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
818         ++ctr;
819         if (is_endian.little)
820                 PUTU32(ctx->Yi.c+12,ctr);
821         else
822                 ctx->Yi.d[3] = ctr;
823 }
824
825 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
826 {
827         size_t i;
828         unsigned int n;
829         u64 alen = ctx->len.u[0];
830 #ifdef GCM_FUNCREF_4BIT
831         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
832 # ifdef GHASH
833         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
834                                 const u8 *inp,size_t len)       = ctx->ghash;
835 # endif
836 #endif
837
838         if (ctx->len.u[1]) return -2;
839
840         alen += len;
841         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
842                 return -1;
843         ctx->len.u[0] = alen;
844
845         n = ctx->ares;
846         if (n) {
847                 while (n && len) {
848                         ctx->Xi.c[n] ^= *(aad++);
849                         --len;
850                         n = (n+1)%16;
851                 }
852                 if (n==0) GCM_MUL(ctx,Xi);
853                 else {
854                         ctx->ares = n;
855                         return 0;
856                 }
857         }
858
859 #ifdef GHASH
860         if ((i = (len&(size_t)-16))) {
861                 GHASH(ctx,aad,i);
862                 aad += i;
863                 len -= i;
864         }
865 #else
866         while (len>=16) {
867                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
868                 GCM_MUL(ctx,Xi);
869                 aad += 16;
870                 len -= 16;
871         }
872 #endif
873         if (len) {
874                 n = (unsigned int)len;
875                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
876         }
877
878         ctx->ares = n;
879         return 0;
880 }
881
882 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
883                 const unsigned char *in, unsigned char *out,
884                 size_t len)
885 {
886         const union { long one; char little; } is_endian = {1};
887         unsigned int n, ctr;
888         size_t i;
889         u64        mlen  = ctx->len.u[1];
890         block128_f block = ctx->block;
891         void      *key   = ctx->key;
892 #ifdef GCM_FUNCREF_4BIT
893         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
894 # ifdef GHASH
895         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
896                                 const u8 *inp,size_t len)       = ctx->ghash;
897 # endif
898 #endif
899
900 #if 0
901         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
902 #endif
903         mlen += len;
904         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
905                 return -1;
906         ctx->len.u[1] = mlen;
907
908         if (ctx->ares) {
909                 /* First call to encrypt finalizes GHASH(AAD) */
910                 GCM_MUL(ctx,Xi);
911                 ctx->ares = 0;
912         }
913
914         if (is_endian.little)
915                 ctr = GETU32(ctx->Yi.c+12);
916         else
917                 ctr = ctx->Yi.d[3];
918
919         n = ctx->mres;
920 #if !defined(OPENSSL_SMALL_FOOTPRINT)
921         if (16%sizeof(size_t) == 0) do {        /* always true actually */
922                 if (n) {
923                         while (n && len) {
924                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
925                                 --len;
926                                 n = (n+1)%16;
927                         }
928                         if (n==0) GCM_MUL(ctx,Xi);
929                         else {
930                                 ctx->mres = n;
931                                 return 0;
932                         }
933                 }
934 #if defined(STRICT_ALIGNMENT)
935                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
936                         break;
937 #endif
938 #if defined(GHASH) && defined(GHASH_CHUNK)
939                 while (len>=GHASH_CHUNK) {
940                     size_t j=GHASH_CHUNK;
941
942                     while (j) {
943                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
944                         ++ctr;
945                         if (is_endian.little)
946                                 PUTU32(ctx->Yi.c+12,ctr);
947                         else
948                                 ctx->Yi.d[3] = ctr;
949                         for (i=0; i<16; i+=sizeof(size_t))
950                                 *(size_t *)(out+i) =
951                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
952                         out += 16;
953                         in  += 16;
954                         j   -= 16;
955                     }
956                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
957                     len -= GHASH_CHUNK;
958                 }
959                 if ((i = (len&(size_t)-16))) {
960                     size_t j=i;
961
962                     while (len>=16) {
963                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
964                         ++ctr;
965                         if (is_endian.little)
966                                 PUTU32(ctx->Yi.c+12,ctr);
967                         else
968                                 ctx->Yi.d[3] = ctr;
969                         for (i=0; i<16; i+=sizeof(size_t))
970                                 *(size_t *)(out+i) =
971                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
972                         out += 16;
973                         in  += 16;
974                         len -= 16;
975                     }
976                     GHASH(ctx,out-j,j);
977                 }
978 #else
979                 while (len>=16) {
980                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
981                         ++ctr;
982                         if (is_endian.little)
983                                 PUTU32(ctx->Yi.c+12,ctr);
984                         else
985                                 ctx->Yi.d[3] = ctr;
986                         for (i=0; i<16; i+=sizeof(size_t))
987                                 *(size_t *)(ctx->Xi.c+i) ^=
988                                 *(size_t *)(out+i) =
989                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
990                         GCM_MUL(ctx,Xi);
991                         out += 16;
992                         in  += 16;
993                         len -= 16;
994                 }
995 #endif
996                 if (len) {
997                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
998                         ++ctr;
999                         if (is_endian.little)
1000                                 PUTU32(ctx->Yi.c+12,ctr);
1001                         else
1002                                 ctx->Yi.d[3] = ctr;
1003                         while (len--) {
1004                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1005                                 ++n;
1006                         }
1007                 }
1008
1009                 ctx->mres = n;
1010                 return 0;
1011         } while(0);
1012 #endif
1013         for (i=0;i<len;++i) {
1014                 if (n==0) {
1015                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1016                         ++ctr;
1017                         if (is_endian.little)
1018                                 PUTU32(ctx->Yi.c+12,ctr);
1019                         else
1020                                 ctx->Yi.d[3] = ctr;
1021                 }
1022                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1023                 n = (n+1)%16;
1024                 if (n==0)
1025                         GCM_MUL(ctx,Xi);
1026         }
1027
1028         ctx->mres = n;
1029         return 0;
1030 }
1031
1032 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1033                 const unsigned char *in, unsigned char *out,
1034                 size_t len)
1035 {
1036         const union { long one; char little; } is_endian = {1};
1037         unsigned int n, ctr;
1038         size_t i;
1039         u64        mlen  = ctx->len.u[1];
1040         block128_f block = ctx->block;
1041         void      *key   = ctx->key;
1042 #ifdef GCM_FUNCREF_4BIT
1043         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1044 # ifdef GHASH
1045         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1046                                 const u8 *inp,size_t len)       = ctx->ghash;
1047 # endif
1048 #endif
1049
1050         mlen += len;
1051         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1052                 return -1;
1053         ctx->len.u[1] = mlen;
1054
1055         if (ctx->ares) {
1056                 /* First call to decrypt finalizes GHASH(AAD) */
1057                 GCM_MUL(ctx,Xi);
1058                 ctx->ares = 0;
1059         }
1060
1061         if (is_endian.little)
1062                 ctr = GETU32(ctx->Yi.c+12);
1063         else
1064                 ctr = ctx->Yi.d[3];
1065
1066         n = ctx->mres;
1067 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1068         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1069                 if (n) {
1070                         while (n && len) {
1071                                 u8 c = *(in++);
1072                                 *(out++) = c^ctx->EKi.c[n];
1073                                 ctx->Xi.c[n] ^= c;
1074                                 --len;
1075                                 n = (n+1)%16;
1076                         }
1077                         if (n==0) GCM_MUL (ctx,Xi);
1078                         else {
1079                                 ctx->mres = n;
1080                                 return 0;
1081                         }
1082                 }
1083 #if defined(STRICT_ALIGNMENT)
1084                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1085                         break;
1086 #endif
1087 #if defined(GHASH) && defined(GHASH_CHUNK)
1088                 while (len>=GHASH_CHUNK) {
1089                     size_t j=GHASH_CHUNK;
1090
1091                     GHASH(ctx,in,GHASH_CHUNK);
1092                     while (j) {
1093                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1094                         ++ctr;
1095                         if (is_endian.little)
1096                                 PUTU32(ctx->Yi.c+12,ctr);
1097                         else
1098                                 ctx->Yi.d[3] = ctr;
1099                         for (i=0; i<16; i+=sizeof(size_t))
1100                                 *(size_t *)(out+i) =
1101                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1102                         out += 16;
1103                         in  += 16;
1104                         j   -= 16;
1105                     }
1106                     len -= GHASH_CHUNK;
1107                 }
1108                 if ((i = (len&(size_t)-16))) {
1109                     GHASH(ctx,in,i);
1110                     while (len>=16) {
1111                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1112                         ++ctr;
1113                         if (is_endian.little)
1114                                 PUTU32(ctx->Yi.c+12,ctr);
1115                         else
1116                                 ctx->Yi.d[3] = ctr;
1117                         for (i=0; i<16; i+=sizeof(size_t))
1118                                 *(size_t *)(out+i) =
1119                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1120                         out += 16;
1121                         in  += 16;
1122                         len -= 16;
1123                     }
1124                 }
1125 #else
1126                 while (len>=16) {
1127                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1128                         ++ctr;
1129                         if (is_endian.little)
1130                                 PUTU32(ctx->Yi.c+12,ctr);
1131                         else
1132                                 ctx->Yi.d[3] = ctr;
1133                         for (i=0; i<16; i+=sizeof(size_t)) {
1134                                 size_t c = *(size_t *)(in+i);
1135                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1136                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1137                         }
1138                         GCM_MUL(ctx,Xi);
1139                         out += 16;
1140                         in  += 16;
1141                         len -= 16;
1142                 }
1143 #endif
1144                 if (len) {
1145                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1146                         ++ctr;
1147                         if (is_endian.little)
1148                                 PUTU32(ctx->Yi.c+12,ctr);
1149                         else
1150                                 ctx->Yi.d[3] = ctr;
1151                         while (len--) {
1152                                 u8 c = in[n];
1153                                 ctx->Xi.c[n] ^= c;
1154                                 out[n] = c^ctx->EKi.c[n];
1155                                 ++n;
1156                         }
1157                 }
1158
1159                 ctx->mres = n;
1160                 return 0;
1161         } while(0);
1162 #endif
1163         for (i=0;i<len;++i) {
1164                 u8 c;
1165                 if (n==0) {
1166                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1167                         ++ctr;
1168                         if (is_endian.little)
1169                                 PUTU32(ctx->Yi.c+12,ctr);
1170                         else
1171                                 ctx->Yi.d[3] = ctr;
1172                 }
1173                 c = in[i];
1174                 out[i] = c^ctx->EKi.c[n];
1175                 ctx->Xi.c[n] ^= c;
1176                 n = (n+1)%16;
1177                 if (n==0)
1178                         GCM_MUL(ctx,Xi);
1179         }
1180
1181         ctx->mres = n;
1182         return 0;
1183 }
1184
1185 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1186                 const unsigned char *in, unsigned char *out,
1187                 size_t len, ctr128_f stream)
1188 {
1189         const union { long one; char little; } is_endian = {1};
1190         unsigned int n, ctr;
1191         size_t i;
1192         u64   mlen = ctx->len.u[1];
1193         void *key  = ctx->key;
1194 #ifdef GCM_FUNCREF_4BIT
1195         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1196 # ifdef GHASH
1197         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1198                                 const u8 *inp,size_t len)       = ctx->ghash;
1199 # endif
1200 #endif
1201
1202         mlen += len;
1203         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1204                 return -1;
1205         ctx->len.u[1] = mlen;
1206
1207         if (ctx->ares) {
1208                 /* First call to encrypt finalizes GHASH(AAD) */
1209                 GCM_MUL(ctx,Xi);
1210                 ctx->ares = 0;
1211         }
1212
1213         if (is_endian.little)
1214                 ctr = GETU32(ctx->Yi.c+12);
1215         else
1216                 ctr = ctx->Yi.d[3];
1217
1218         n = ctx->mres;
1219         if (n) {
1220                 while (n && len) {
1221                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1222                         --len;
1223                         n = (n+1)%16;
1224                 }
1225                 if (n==0) GCM_MUL(ctx,Xi);
1226                 else {
1227                         ctx->mres = n;
1228                         return 0;
1229                 }
1230         }
1231 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1232         while (len>=GHASH_CHUNK) {
1233                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1234                 ctr += GHASH_CHUNK/16;
1235                 if (is_endian.little)
1236                         PUTU32(ctx->Yi.c+12,ctr);
1237                 else
1238                         ctx->Yi.d[3] = ctr;
1239                 GHASH(ctx,out,GHASH_CHUNK);
1240                 out += GHASH_CHUNK;
1241                 in  += GHASH_CHUNK;
1242                 len -= GHASH_CHUNK;
1243         }
1244 #endif
1245         if ((i = (len&(size_t)-16))) {
1246                 size_t j=i/16;
1247
1248                 (*stream)(in,out,j,key,ctx->Yi.c);
1249                 ctr += (unsigned int)j;
1250                 if (is_endian.little)
1251                         PUTU32(ctx->Yi.c+12,ctr);
1252                 else
1253                         ctx->Yi.d[3] = ctr;
1254                 in  += i;
1255                 len -= i;
1256 #if defined(GHASH)
1257                 GHASH(ctx,out,i);
1258                 out += i;
1259 #else
1260                 while (j--) {
1261                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1262                         GCM_MUL(ctx,Xi);
1263                         out += 16;
1264                 }
1265 #endif
1266         }
1267         if (len) {
1268                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1269                 ++ctr;
1270                 if (is_endian.little)
1271                         PUTU32(ctx->Yi.c+12,ctr);
1272                 else
1273                         ctx->Yi.d[3] = ctr;
1274                 while (len--) {
1275                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1276                         ++n;
1277                 }
1278         }
1279
1280         ctx->mres = n;
1281         return 0;
1282 }
1283
1284 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1285                 const unsigned char *in, unsigned char *out,
1286                 size_t len,ctr128_f stream)
1287 {
1288         const union { long one; char little; } is_endian = {1};
1289         unsigned int n, ctr;
1290         size_t i;
1291         u64   mlen = ctx->len.u[1];
1292         void *key  = ctx->key;
1293 #ifdef GCM_FUNCREF_4BIT
1294         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1295 # ifdef GHASH
1296         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1297                                 const u8 *inp,size_t len)       = ctx->ghash;
1298 # endif
1299 #endif
1300
1301         mlen += len;
1302         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1303                 return -1;
1304         ctx->len.u[1] = mlen;
1305
1306         if (ctx->ares) {
1307                 /* First call to decrypt finalizes GHASH(AAD) */
1308                 GCM_MUL(ctx,Xi);
1309                 ctx->ares = 0;
1310         }
1311
1312         if (is_endian.little)
1313                 ctr = GETU32(ctx->Yi.c+12);
1314         else
1315                 ctr = ctx->Yi.d[3];
1316
1317         n = ctx->mres;
1318         if (n) {
1319                 while (n && len) {
1320                         u8 c = *(in++);
1321                         *(out++) = c^ctx->EKi.c[n];
1322                         ctx->Xi.c[n] ^= c;
1323                         --len;
1324                         n = (n+1)%16;
1325                 }
1326                 if (n==0) GCM_MUL (ctx,Xi);
1327                 else {
1328                         ctx->mres = n;
1329                         return 0;
1330                 }
1331         }
1332 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1333         while (len>=GHASH_CHUNK) {
1334                 GHASH(ctx,in,GHASH_CHUNK);
1335                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1336                 ctr += GHASH_CHUNK/16;
1337                 if (is_endian.little)
1338                         PUTU32(ctx->Yi.c+12,ctr);
1339                 else
1340                         ctx->Yi.d[3] = ctr;
1341                 out += GHASH_CHUNK;
1342                 in  += GHASH_CHUNK;
1343                 len -= GHASH_CHUNK;
1344         }
1345 #endif
1346         if ((i = (len&(size_t)-16))) {
1347                 size_t j=i/16;
1348
1349 #if defined(GHASH)
1350                 GHASH(ctx,in,i);
1351 #else
1352                 while (j--) {
1353                         size_t k;
1354                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1355                         GCM_MUL(ctx,Xi);
1356                         in += 16;
1357                 }
1358                 j   = i/16;
1359                 in -= i;
1360 #endif
1361                 (*stream)(in,out,j,key,ctx->Yi.c);
1362                 ctr += (unsigned int)j;
1363                 if (is_endian.little)
1364                         PUTU32(ctx->Yi.c+12,ctr);
1365                 else
1366                         ctx->Yi.d[3] = ctr;
1367                 out += i;
1368                 in  += i;
1369                 len -= i;
1370         }
1371         if (len) {
1372                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1373                 ++ctr;
1374                 if (is_endian.little)
1375                         PUTU32(ctx->Yi.c+12,ctr);
1376                 else
1377                         ctx->Yi.d[3] = ctr;
1378                 while (len--) {
1379                         u8 c = in[n];
1380                         ctx->Xi.c[n] ^= c;
1381                         out[n] = c^ctx->EKi.c[n];
1382                         ++n;
1383                 }
1384         }
1385
1386         ctx->mres = n;
1387         return 0;
1388 }
1389
1390 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1391                         size_t len)
1392 {
1393         const union { long one; char little; } is_endian = {1};
1394         u64 alen = ctx->len.u[0]<<3;
1395         u64 clen = ctx->len.u[1]<<3;
1396 #ifdef GCM_FUNCREF_4BIT
1397         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1398 #endif
1399
1400         if (ctx->mres)
1401                 GCM_MUL(ctx,Xi);
1402
1403         if (is_endian.little) {
1404 #ifdef BSWAP8
1405                 alen = BSWAP8(alen);
1406                 clen = BSWAP8(clen);
1407 #else
1408                 u8 *p = ctx->len.c;
1409
1410                 ctx->len.u[0] = alen;
1411                 ctx->len.u[1] = clen;
1412
1413                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1414                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1415 #endif
1416         }
1417
1418         ctx->Xi.u[0] ^= alen;
1419         ctx->Xi.u[1] ^= clen;
1420         GCM_MUL(ctx,Xi);
1421
1422         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1423         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1424
1425         if (tag && len<=sizeof(ctx->Xi))
1426                 return memcmp(ctx->Xi.c,tag,len);
1427         else
1428                 return -1;
1429 }
1430
1431 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1432 {
1433         CRYPTO_gcm128_finish(ctx, NULL, 0);
1434         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1435 }
1436
1437 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1438 {
1439         GCM128_CONTEXT *ret;
1440
1441         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1442                 CRYPTO_gcm128_init(ret,key,block);
1443
1444         return ret;
1445 }
1446
1447 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1448 {
1449         if (ctx) {
1450                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1451                 OPENSSL_free(ctx);
1452         }
1453 }
1454
1455 #if defined(SELFTEST)
1456 #include <stdio.h>
1457 #include <openssl/aes.h>
1458
1459 /* Test Case 1 */
1460 static const u8 K1[16],
1461                 *P1=NULL,
1462                 *A1=NULL,
1463                 IV1[12],
1464                 *C1=NULL,
1465                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1466
1467 /* Test Case 2 */
1468 #define K2 K1
1469 #define A2 A1
1470 #define IV2 IV1
1471 static const u8 P2[16],
1472                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1473                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1474
1475 /* Test Case 3 */
1476 #define A3 A2
1477 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1478                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1479                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1480                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1481                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1482                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1483                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1484                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1485                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1486                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1487                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1488
1489 /* Test Case 4 */
1490 #define K4 K3
1491 #define IV4 IV3
1492 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1493                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1494                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1495                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1496                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1497                         0xab,0xad,0xda,0xd2},
1498                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1499                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1500                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1501                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1502                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1503
1504 /* Test Case 5 */
1505 #define K5 K4
1506 #define P5 P4
1507 #define A5 A4
1508 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1509                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1510                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1511                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1512                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1513                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1514
1515 /* Test Case 6 */
1516 #define K6 K5
1517 #define P6 P5
1518 #define A6 A5
1519 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1520                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1521                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1522                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1523                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1524                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1525                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1526                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1527                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1528
1529 /* Test Case 7 */
1530 static const u8 K7[24],
1531                 *P7=NULL,
1532                 *A7=NULL,
1533                 IV7[12],
1534                 *C7=NULL,
1535                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1536
1537 /* Test Case 8 */
1538 #define K8 K7
1539 #define IV8 IV7
1540 #define A8 A7
1541 static const u8 P8[16],
1542                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1543                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1544
1545 /* Test Case 9 */
1546 #define A9 A8
1547 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1548                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1549                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1550                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1551                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1552                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1553                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1554                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1555                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1556                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1557                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1558                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1559
1560 /* Test Case 10 */
1561 #define K10 K9
1562 #define IV10 IV9
1563 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1564                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1565                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1566                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1567                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1568                         0xab,0xad,0xda,0xd2},
1569                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1570                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1571                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1572                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1573                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1574
1575 /* Test Case 11 */
1576 #define K11 K10
1577 #define P11 P10
1578 #define A11 A10
1579 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1580                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1581                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1582                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1583                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1584                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1585
1586 /* Test Case 12 */
1587 #define K12 K11
1588 #define P12 P11
1589 #define A12 A11
1590 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1591                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1592                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1593                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1594                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1595                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1596                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1597                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1598                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1599
1600 /* Test Case 13 */
1601 static const u8 K13[32],
1602                 *P13=NULL,
1603                 *A13=NULL,
1604                 IV13[12],
1605                 *C13=NULL,
1606                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1607
1608 /* Test Case 14 */
1609 #define K14 K13
1610 #define A14 A13
1611 static const u8 P14[16],
1612                 IV14[12],
1613                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1614                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1615
1616 /* Test Case 15 */
1617 #define A15 A14
1618 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1619                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1620                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1621                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1622                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1623                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1624                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1625                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1626                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1627                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1628                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1629                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1630
1631 /* Test Case 16 */
1632 #define K16 K15
1633 #define IV16 IV15
1634 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1635                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1636                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1637                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1638                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1639                         0xab,0xad,0xda,0xd2},
1640                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1641                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1642                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1643                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1644                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1645
1646 /* Test Case 17 */
1647 #define K17 K16
1648 #define P17 P16
1649 #define A17 A16
1650 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1651                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1652                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1653                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1654                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1655                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1656
1657 /* Test Case 18 */
1658 #define K18 K17
1659 #define P18 P17
1660 #define A18 A17
1661 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1662                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1663                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1664                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1665                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1666                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1667                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1668                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1669                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1670
/*
 * TEST_CASE(n): run test vector n through one encrypt pass and one decrypt
 * pass, using the objects K<n>, IV<n>, A<n>, P<n>, C<n> and T<n>.
 *
 * The expansion relies on three variables in the caller's scope: `ctx'
 * (GCM128_CONTEXT), `key' (AES_KEY) and `ret' (int failure counter).
 *
 * - A<n>, P<n> and C<n> may be NULL pointers, in which case the matching
 *   aad/encrypt/decrypt/compare step is skipped.
 * - The scratch buffer is sized by sizeof(P<n>) and that size is used as
 *   the data length for both passes.
 * - A pass fails when CRYPTO_gcm128_finish() returns non-zero for the
 *   16-byte tag T<n>, or when the output differs from the expected data;
 *   each failing pass increments `ret' and prints a message.
 */
#define TEST_CASE(n)    do {                                            \
        u8 out[sizeof(P##n)];                                           \
                                                                        \
        /* encrypt pass */                                              \
        memset(out,0,sizeof(out));                                      \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (P##n) {                                                     \
                CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));       \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (C##n && memcmp(out,C##n,sizeof(out)))) {                   \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n",n);                 \
        }                                                               \
                                                                        \
        /* decrypt pass: same key schedule, IV set again */             \
        memset(out,0,sizeof(out));                                      \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (C##n) {                                                     \
                CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));       \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (P##n && memcmp(out,P##n,sizeof(out)))) {                   \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n",n);                 \
        }                                                               \
        } while(0)
1690
1691 int main()
1692 {
1693         GCM128_CONTEXT ctx;
1694         AES_KEY key;
1695         int ret=0;
1696
1697         TEST_CASE(1);
1698         TEST_CASE(2);
1699         TEST_CASE(3);
1700         TEST_CASE(4);
1701         TEST_CASE(5);
1702         TEST_CASE(6);
1703         TEST_CASE(7);
1704         TEST_CASE(8);
1705         TEST_CASE(9);
1706         TEST_CASE(10);
1707         TEST_CASE(11);
1708         TEST_CASE(12);
1709         TEST_CASE(13);
1710         TEST_CASE(14);
1711         TEST_CASE(15);
1712         TEST_CASE(16);
1713         TEST_CASE(17);
1714         TEST_CASE(18);
1715
1716 #ifdef OPENSSL_CPUID_OBJ
1717         {
1718         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1719         union { u64 u; u8 c[1024]; } buf;
1720         int i;
1721
1722         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1723         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1724         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1725
1726         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1727         start = OPENSSL_rdtsc();
1728         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1729         gcm_t = OPENSSL_rdtsc() - start;
1730
1731         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1732                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1733                         (block128_f)AES_encrypt);
1734         start = OPENSSL_rdtsc();
1735         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1736                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1737                         (block128_f)AES_encrypt);
1738         ctr_t = OPENSSL_rdtsc() - start;
1739
1740         printf("%.2f-%.2f=%.2f\n",
1741                         gcm_t/(double)sizeof(buf),
1742                         ctr_t/(double)sizeof(buf),
1743                         (gcm_t-ctr_t)/(double)sizeof(buf));
1744 #ifdef GHASH
1745         GHASH(&ctx,buf.c,sizeof(buf));
1746         start = OPENSSL_rdtsc();
1747         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1748         gcm_t = OPENSSL_rdtsc() - start;
1749         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1750 #endif
1751         }
1752 #endif
1753
1754         return ret;
1755 }
1756 #endif