875f6cab7d68d68432ad0ea4fdbd7cccdb953c2a
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Redefine GETU32/PUTU32 as a single 32-bit access combined with a byte
 * swap, because alignment is ensured for the buffers they are applied to
 * in this file.
 *
 * The PUTU32 expansion is wrapped in parentheses so that it is one
 * self-contained expression wherever it is used (for example as an
 * operand of ?: or of the comma operator), not only as a statement.
 */
#undef  GETU32
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
70
/* Place the 16-bit constant s in the topmost 16 bits of a size_t. */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))

/*
 * Shift the 128-bit value V (members .hi and .lo) right by one bit and,
 * if the bit shifted out of V.lo was set, XOR the constant 0xe1 into the
 * most significant byte of V.hi. The mask T is built arithmetically from
 * the low bit, so no data-dependent branch is taken.
 *
 * Two variants are provided and selected at compile time by
 * sizeof(size_t): one computes the mask in 64 bits, the other in 32 bits.
 *
 * V must be a modifiable lvalue; it is evaluated more than once, so it
 * must not have side effects. Every use of V is parenthesized so that
 * any lvalue expression can be passed safely.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8; 8 is effectively reserved for testing
 * purposes. TABLE_BITS>1 selects the lookup-table-driven implementations
 * referred to as "Shoup's" in the GCM specification. In other words,
 * OpenSSL does not cover the whole spectrum of possible table-driven
 * implementations. Why? In the non-"Shoup's" case the memory access
 * pattern is segmented in such a manner that it's trivial to see that
 * cache-timing information can reveal a fair portion of the intermediate
 * hash value. Given that the ciphertext is always available to an
 * attacker, it's possible for them to attempt to deduce the secret
 * parameter H and, if successful, tamper with messages [which is
 * trivial in CTR mode]. In the "Shoup's" case it's not as trivial, but
 * there is no reason to believe that it's resistant to cache-timing
 * attacks. And the thing about the "8-bit" implementation is that it
 * consumes 16 (sixteen) times more memory: 4KB per individual key + 1KB
 * shared. On the pro side, it should be twice as fast as the "4-bit"
 * version. For gcc-generated x86[_64] code, the "8-bit" version was
 * observed to run ~75% faster, closer to 100% for commercial
 * compilers... Yet the "4-bit" procedure is preferred, because it's
 * believed to provide a better security-performance balance and adequate
 * all-round performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example, on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate for performance reasons.
 */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
327
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * Extra 256+16 bytes per-key plus 512 bytes shared tables
458      * [should] give ~50% improvement... One could have PACK()-ed
459      * the rem_8bit even here, but the priority is to minimize
460      * cache footprint...
461      */ 
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows down procedure by approximately
499      * same time as it makes each loop spin faster. In other words
500      * single block performance is approximately same as straightforward
501      * "4-bit" implementation, and then it goes only faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
/*
 * GCM_MUL(ctx,Xi): multiply the 128-bit member of *ctx named by the
 * second argument by H in place, using the 4-bit table. The second
 * argument is pasted as a member name, so it must be a plain identifier
 * such as Xi or Yi.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* GHASH(ctx,in,len): hash len bytes at in into ctx->Xi. */
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it's
 * still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
582
583 #else   /* TABLE_BITS */
584
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
/*
 * Declarations for the assembler-backed GHASH variants. They are only
 * visible when the 4-bit table layout is selected and GHASH_ASM is
 * defined. GCM_FUNCREF_4BIT marks builds in which the routine to use is
 * chosen at run time and called through pointers kept in the context.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
/* capability words tested by CRYPTO_gcm128_init */
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16],
                     const u8 *inp, size_t len);

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16],
                        const u8 *inp, size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16],
                        const u8 *inp, size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
/* capability word tested by CRYPTO_gcm128_init */
extern unsigned int OPENSSL_armcap;

void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16],
                    const u8 *inp, size_t len);
#  endif
# endif
#endif
678
679 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
680 {
681         const union { long one; char little; } is_endian = {1};
682
683         memset(ctx,0,sizeof(*ctx));
684         ctx->block = block;
685         ctx->key   = key;
686
687         (*block)(ctx->H.c,ctx->H.c,key);
688
689         if (is_endian.little) {
690                 /* H is stored in host byte order */
691 #ifdef BSWAP8
692                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
693                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
694 #else
695                 u8 *p = ctx->H.c;
696                 u64 hi,lo;
697                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
698                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
699                 ctx->H.u[0] = hi;
700                 ctx->H.u[1] = lo;
701 #endif
702         }
703
704 #if     TABLE_BITS==8
705         gcm_init_8bit(ctx->Htable,ctx->H.u);
706 #elif   TABLE_BITS==4
707 # if    defined(GHASH_ASM_X86_OR_64)
708 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
709         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
710                 gcm_init_clmul(ctx->Htable,ctx->H.u);
711                 ctx->gmult = gcm_gmult_clmul;
712                 ctx->ghash = gcm_ghash_clmul;
713                 return;
714         }
715 #  endif
716         gcm_init_4bit(ctx->Htable,ctx->H.u);
717 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
718         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
719                 ctx->gmult = gcm_gmult_4bit_mmx;
720                 ctx->ghash = gcm_ghash_4bit_mmx;
721         } else {
722                 ctx->gmult = gcm_gmult_4bit_x86;
723                 ctx->ghash = gcm_ghash_4bit_x86;
724         }
725 #  else
726         ctx->gmult = gcm_gmult_4bit;
727         ctx->ghash = gcm_ghash_4bit;
728 #  endif
729 # elif  defined(GHASH_ASM_ARM)
730         if (OPENSSL_armcap & 1) {
731                 ctx->gmult = gcm_gmult_neon;
732                 ctx->ghash = gcm_ghash_neon;
733         } else {
734                 gcm_init_4bit(ctx->Htable,ctx->H.u);
735                 ctx->gmult = gcm_gmult_4bit;
736                 ctx->ghash = gcm_ghash_4bit;
737         }
738 # else
739         gcm_init_4bit(ctx->Htable,ctx->H.u);
740 # endif
741 #endif
742 }
743
744 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
745 {
746         const union { long one; char little; } is_endian = {1};
747         unsigned int ctr;
748 #ifdef GCM_FUNCREF_4BIT
749         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
750 #endif
751
752         ctx->Yi.u[0]  = 0;
753         ctx->Yi.u[1]  = 0;
754         ctx->Xi.u[0]  = 0;
755         ctx->Xi.u[1]  = 0;
756         ctx->len.u[0] = 0;      /* AAD length */
757         ctx->len.u[1] = 0;      /* message length */
758         ctx->ares = 0;
759         ctx->mres = 0;
760
761         if (len==12) {
762                 memcpy(ctx->Yi.c,iv,12);
763                 ctx->Yi.c[15]=1;
764                 ctr=1;
765         }
766         else {
767                 size_t i;
768                 u64 len0 = len;
769
770                 while (len>=16) {
771                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
772                         GCM_MUL(ctx,Yi);
773                         iv += 16;
774                         len -= 16;
775                 }
776                 if (len) {
777                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
778                         GCM_MUL(ctx,Yi);
779                 }
780                 len0 <<= 3;
781                 if (is_endian.little) {
782 #ifdef BSWAP8
783                         ctx->Yi.u[1]  ^= BSWAP8(len0);
784 #else
785                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
786                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
787                         ctx->Yi.c[10] ^= (u8)(len0>>40);
788                         ctx->Yi.c[11] ^= (u8)(len0>>32);
789                         ctx->Yi.c[12] ^= (u8)(len0>>24);
790                         ctx->Yi.c[13] ^= (u8)(len0>>16);
791                         ctx->Yi.c[14] ^= (u8)(len0>>8);
792                         ctx->Yi.c[15] ^= (u8)(len0);
793 #endif
794                 }
795                 else
796                         ctx->Yi.u[1]  ^= len0;
797
798                 GCM_MUL(ctx,Yi);
799
800                 if (is_endian.little)
801                         ctr = GETU32(ctx->Yi.c+12);
802                 else
803                         ctr = ctx->Yi.d[3];
804         }
805
806         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
807         ++ctr;
808         if (is_endian.little)
809                 PUTU32(ctx->Yi.c+12,ctr);
810         else
811                 ctx->Yi.d[3] = ctr;
812 }
813
814 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
815 {
816         size_t i;
817         unsigned int n;
818         u64 alen = ctx->len.u[0];
819 #ifdef GCM_FUNCREF_4BIT
820         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
821 # ifdef GHASH
822         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
823                                 const u8 *inp,size_t len) = ctx->ghash;
824 # endif
825 #endif
826
827         if (ctx->len.u[1]) return -2;
828
829         alen += len;
830         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
831                 return -1;
832         ctx->len.u[0] = alen;
833
834         n = ctx->ares;
835         if (n) {
836                 while (n && len) {
837                         ctx->Xi.c[n] ^= *(aad++);
838                         --len;
839                         n = (n+1)%16;
840                 }
841                 if (n==0) GCM_MUL(ctx,Xi);
842                 else {
843                         ctx->ares = n;
844                         return 0;
845                 }
846         }
847
848 #ifdef GHASH
849         if ((i = (len&(size_t)-16))) {
850                 GHASH(ctx,aad,i);
851                 aad += i;
852                 len -= i;
853         }
854 #else
855         while (len>=16) {
856                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
857                 GCM_MUL(ctx,Xi);
858                 aad += 16;
859                 len -= 16;
860         }
861 #endif
862         if (len) {
863                 n = (unsigned int)len;
864                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
865         }
866
867         ctx->ares = n;
868         return 0;
869 }
870
/*
 * CRYPTO_gcm128_encrypt: encrypt len bytes from in to out and fold the
 * resulting ciphertext into the running hash ctx->Xi.
 *
 * Keystream blocks are produced by running ctx->block over the counter
 * block ctx->Yi into ctx->EKi; after each block the 32-bit counter held in
 * the last four bytes of Yi is incremented.  The function may be called
 * repeatedly: ctx->mres records how many bytes of the current keystream
 * block have already been used.
 *
 * Returns 0 on success, or -1 (leaving ctx untouched) if the accumulated
 * message length would exceed 2^36-32 bytes or wrap around.
 */
871 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
872                 const unsigned char *in, unsigned char *out,
873                 size_t len)
874 {
875         const union { long one; char little; } is_endian = {1};
876         unsigned int n, ctr;
877         size_t i;
878         u64 mlen = ctx->len.u[1];
            /* with GCM_FUNCREF_4BIT, GCM_MUL/GHASH presumably expand to
             * these local names - confirm against the macro definitions */
879 #ifdef GCM_FUNCREF_4BIT
880         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
881 # ifdef GHASH
882         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
883                                 const u8 *inp,size_t len) = ctx->ghash;
884 # endif
885 #endif
886
887 #if 0
888         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
889 #endif
            /* enforce the total message length limit before touching state */
890         mlen += len;
891         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
892                 return -1;
893         ctx->len.u[1] = mlen;
894
895         if (ctx->ares) {
896                 /* First call to encrypt finalizes GHASH(AAD) */
897                 GCM_MUL(ctx,Xi);
898                 ctx->ares = 0;
899         }
900
            /* load the current 32-bit block counter from the tail of Yi */
901         if (is_endian.little)
902                 ctr = GETU32(ctx->Yi.c+12);
903         else
904                 ctr = ctx->Yi.d[3];
905
906         n = ctx->mres;
            /* fast path: whole blocks are XORed one size_t at a time */
907 #if !defined(OPENSSL_SMALL_FOOTPRINT)
908         if (16%sizeof(size_t) == 0) do {        /* always true actually */
909                 if (n) {
                            /* use up keystream left in EKi from the previous call */
910                         while (n && len) {
911                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
912                                 --len;
913                                 n = (n+1)%16;
914                         }
915                         if (n==0) GCM_MUL(ctx,Xi);
916                         else {
                                    /* input exhausted mid-block */
917                                 ctx->mres = n;
918                                 return 0;
919                         }
920                 }
                    /* word accesses below need aligned pointers here; otherwise
                     * leave the do-block and use the byte loop at the bottom */
921 #if defined(STRICT_ALIGNMENT)
922                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
923                         break;
924 #endif
925 #if defined(GHASH) && defined(GHASH_CHUNK)
                    /* encrypt GHASH_CHUNK bytes, then hash the ciphertext just
                     * written with a single GHASH call */
926                 while (len>=GHASH_CHUNK) {
927                     size_t j=GHASH_CHUNK;
928
929                     while (j) {
930                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
931                         ++ctr;
932                         if (is_endian.little)
933                                 PUTU32(ctx->Yi.c+12,ctr);
934                         else
935                                 ctx->Yi.d[3] = ctr;
936                         for (i=0; i<16; i+=sizeof(size_t))
937                                 *(size_t *)(out+i) =
938                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
939                         out += 16;
940                         in  += 16;
941                         j   -= 16;
942                     }
943                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
944                     len -= GHASH_CHUNK;
945                 }
                    /* remaining whole blocks (fewer than GHASH_CHUNK bytes);
                     * j keeps the byte count because i is reused as loop index */
946                 if ((i = (len&(size_t)-16))) {
947                     size_t j=i;
948
949                     while (len>=16) {
950                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
951                         ++ctr;
952                         if (is_endian.little)
953                                 PUTU32(ctx->Yi.c+12,ctr);
954                         else
955                                 ctx->Yi.d[3] = ctr;
956                         for (i=0; i<16; i+=sizeof(size_t))
957                                 *(size_t *)(out+i) =
958                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
959                         out += 16;
960                         in  += 16;
961                         len -= 16;
962                     }
963                     GHASH(ctx,out-j,j);
964                 }
965 #else
                    /* no bulk GHASH: encrypt and hash one block at a time */
966                 while (len>=16) {
967                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
968                         ++ctr;
969                         if (is_endian.little)
970                                 PUTU32(ctx->Yi.c+12,ctr);
971                         else
972                                 ctx->Yi.d[3] = ctr;
973                         for (i=0; i<16; i+=sizeof(size_t))
974                                 *(size_t *)(ctx->Xi.c+i) ^=
975                                 *(size_t *)(out+i) =
976                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
977                         GCM_MUL(ctx,Xi);
978                         out += 16;
979                         in  += 16;
980                         len -= 16;
981                 }
982 #endif
                    /* trailing partial block: generate one more keystream block
                     * and XOR the ciphertext into Xi without multiplying; n is 0
                     * on entry and ends up as the number of bytes consumed */
983                 if (len) {
984                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
985                         ++ctr;
986                         if (is_endian.little)
987                                 PUTU32(ctx->Yi.c+12,ctr);
988                         else
989                                 ctx->Yi.d[3] = ctr;
990                         while (len--) {
991                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
992                                 ++n;
993                         }
994                 }
995
996                 ctx->mres = n;
997                 return 0;
998         } while(0);
999 #endif
             /* byte-at-a-time path: the only path under OPENSSL_SMALL_FOOTPRINT,
              * otherwise reached via the STRICT_ALIGNMENT break above */
1000         for (i=0;i<len;++i) {
1001                 if (n==0) {
                             /* start of a block: produce fresh keystream */
1002                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1003                         ++ctr;
1004                         if (is_endian.little)
1005                                 PUTU32(ctx->Yi.c+12,ctr);
1006                         else
1007                                 ctx->Yi.d[3] = ctr;
1008                 }
1009                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1010                 n = (n+1)%16;
1011                 if (n==0)
1012                         GCM_MUL(ctx,Xi);
1013         }
1014
1015         ctx->mres = n;
1016         return 0;
1017 }
1018
/*
 * CRYPTO_gcm128_decrypt: decrypt len bytes from in to out and fold the
 * input (ciphertext) bytes into the running hash ctx->Xi.
 *
 * Keystream handling mirrors the encrypt routine: ctx->block is run over
 * ctx->Yi into ctx->EKi, the 32-bit counter in the last four bytes of Yi
 * is incremented per block, and ctx->mres records how much of the current
 * keystream block has been used so the call can be repeated.
 *
 * Returns 0 on success, or -1 (leaving ctx untouched) if the accumulated
 * message length would exceed 2^36-32 bytes or wrap around.  The tag is
 * not checked here.
 */
1019 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1020                 const unsigned char *in, unsigned char *out,
1021                 size_t len)
1022 {
1023         const union { long one; char little; } is_endian = {1};
1024         unsigned int n, ctr;
1025         size_t i;
1026         u64 mlen = ctx->len.u[1];
             /* with GCM_FUNCREF_4BIT, GCM_MUL/GHASH presumably expand to
              * these local names - confirm against the macro definitions */
1027 #ifdef GCM_FUNCREF_4BIT
1028         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1029 # ifdef GHASH
1030         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1031                                 const u8 *inp,size_t len) = ctx->ghash;
1032 # endif
1033 #endif
1034
             /* enforce the total message length limit before touching state */
1035         mlen += len;
1036         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1037                 return -1;
1038         ctx->len.u[1] = mlen;
1039
1040         if (ctx->ares) {
1041                 /* First call to decrypt finalizes GHASH(AAD) */
1042                 GCM_MUL(ctx,Xi);
1043                 ctx->ares = 0;
1044         }
1045
             /* load the current 32-bit block counter from the tail of Yi */
1046         if (is_endian.little)
1047                 ctr = GETU32(ctx->Yi.c+12);
1048         else
1049                 ctr = ctx->Yi.d[3];
1050
1051         n = ctx->mres;
             /* fast path: whole blocks are XORed one size_t at a time */
1052 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1053         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1054                 if (n) {
                             /* use up keystream left in EKi from the previous call;
                              * the input byte is saved in c before out is written */
1055                         while (n && len) {
1056                                 u8 c = *(in++);
1057                                 *(out++) = c^ctx->EKi.c[n];
1058                                 ctx->Xi.c[n] ^= c;
1059                                 --len;
1060                                 n = (n+1)%16;
1061                         }
1062                         if (n==0) GCM_MUL (ctx,Xi);
1063                         else {
                                     /* input exhausted mid-block */
1064                                 ctx->mres = n;
1065                                 return 0;
1066                         }
1067                 }
                     /* word accesses below need aligned pointers here; otherwise
                      * leave the do-block and use the byte loop at the bottom */
1068 #if defined(STRICT_ALIGNMENT)
1069                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1070                         break;
1071 #endif
1072 #if defined(GHASH) && defined(GHASH_CHUNK)
                     /* hash GHASH_CHUNK input bytes first, then decrypt them;
                      * NOTE(review): hashing before the XOR presumably keeps
                      * in-place use (in == out) correct - confirm */
1073                 while (len>=GHASH_CHUNK) {
1074                     size_t j=GHASH_CHUNK;
1075
1076                     GHASH(ctx,in,GHASH_CHUNK);
1077                     while (j) {
1078                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1079                         ++ctr;
1080                         if (is_endian.little)
1081                                 PUTU32(ctx->Yi.c+12,ctr);
1082                         else
1083                                 ctx->Yi.d[3] = ctr;
1084                         for (i=0; i<16; i+=sizeof(size_t))
1085                                 *(size_t *)(out+i) =
1086                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1087                         out += 16;
1088                         in  += 16;
1089                         j   -= 16;
1090                     }
1091                     len -= GHASH_CHUNK;
1092                 }
                     /* remaining whole blocks: hash them all, then decrypt */
1093                 if ((i = (len&(size_t)-16))) {
1094                     GHASH(ctx,in,i);
1095                     while (len>=16) {
1096                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1097                         ++ctr;
1098                         if (is_endian.little)
1099                                 PUTU32(ctx->Yi.c+12,ctr);
1100                         else
1101                                 ctx->Yi.d[3] = ctr;
1102                         for (i=0; i<16; i+=sizeof(size_t))
1103                                 *(size_t *)(out+i) =
1104                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1105                         out += 16;
1106                         in  += 16;
1107                         len -= 16;
1108                     }
1109                 }
1110 #else
                     /* no bulk GHASH: decrypt and hash one block at a time,
                      * reading each input word into c before out is written */
1111                 while (len>=16) {
1112                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1113                         ++ctr;
1114                         if (is_endian.little)
1115                                 PUTU32(ctx->Yi.c+12,ctr);
1116                         else
1117                                 ctx->Yi.d[3] = ctr;
1118                         for (i=0; i<16; i+=sizeof(size_t)) {
1119                                 size_t c = *(size_t *)(in+i);
1120                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1121                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1122                         }
1123                         GCM_MUL(ctx,Xi);
1124                         out += 16;
1125                         in  += 16;
1126                         len -= 16;
1127                 }
1128 #endif
                     /* trailing partial block: generate one more keystream block
                      * and XOR the input into Xi without multiplying; n is 0 on
                      * entry and ends up as the number of bytes consumed */
1129                 if (len) {
1130                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1131                         ++ctr;
1132                         if (is_endian.little)
1133                                 PUTU32(ctx->Yi.c+12,ctr);
1134                         else
1135                                 ctx->Yi.d[3] = ctr;
1136                         while (len--) {
1137                                 u8 c = in[n];
1138                                 ctx->Xi.c[n] ^= c;
1139                                 out[n] = c^ctx->EKi.c[n];
1140                                 ++n;
1141                         }
1142                 }
1143
1144                 ctx->mres = n;
1145                 return 0;
1146         } while(0);
1147 #endif
             /* byte-at-a-time path: the only path under OPENSSL_SMALL_FOOTPRINT,
              * otherwise reached via the STRICT_ALIGNMENT break above */
1148         for (i=0;i<len;++i) {
1149                 u8 c;
1150                 if (n==0) {
                             /* start of a block: produce fresh keystream */
1151                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1152                         ++ctr;
1153                         if (is_endian.little)
1154                                 PUTU32(ctx->Yi.c+12,ctr);
1155                         else
1156                                 ctx->Yi.d[3] = ctr;
1157                 }
1158                 c = in[i];
1159                 out[i] = c^ctx->EKi.c[n];
1160                 ctx->Xi.c[n] ^= c;
1161                 n = (n+1)%16;
1162                 if (n==0)
1163                         GCM_MUL(ctx,Xi);
1164         }
1165
1166         ctx->mres = n;
1167         return 0;
1168 }
1169
1170 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1171                 const unsigned char *in, unsigned char *out,
1172                 size_t len, ctr128_f stream)
1173 {
1174         const union { long one; char little; } is_endian = {1};
1175         unsigned int n, ctr;
1176         size_t i;
1177         u64 mlen = ctx->len.u[1];
1178 #ifdef GCM_FUNCREF_4BIT
1179         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1180 # ifdef GHASH
1181         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1182                                 const u8 *inp,size_t len) = ctx->ghash;
1183 # endif
1184 #endif
1185
1186         mlen += len;
1187         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1188                 return -1;
1189         ctx->len.u[1] = mlen;
1190
1191         if (ctx->ares) {
1192                 /* First call to encrypt finalizes GHASH(AAD) */
1193                 GCM_MUL(ctx,Xi);
1194                 ctx->ares = 0;
1195         }
1196
1197         if (is_endian.little)
1198                 ctr = GETU32(ctx->Yi.c+12);
1199         else
1200                 ctr = ctx->Yi.d[3];
1201
1202         n = ctx->mres;
1203         if (n) {
1204                 while (n && len) {
1205                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1206                         --len;
1207                         n = (n+1)%16;
1208                 }
1209                 if (n==0) GCM_MUL(ctx,Xi);
1210                 else {
1211                         ctx->mres = n;
1212                         return 0;
1213                 }
1214         }
1215 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1216         while (len>=GHASH_CHUNK) {
1217                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1218                 ctr += GHASH_CHUNK/16;
1219                 if (is_endian.little)
1220                         PUTU32(ctx->Yi.c+12,ctr);
1221                 else
1222                         ctx->Yi.d[3] = ctr;
1223                 GHASH(ctx,out,GHASH_CHUNK);
1224                 out += GHASH_CHUNK;
1225                 in  += GHASH_CHUNK;
1226                 len -= GHASH_CHUNK;
1227         }
1228 #endif
1229         if ((i = (len&(size_t)-16))) {
1230                 size_t j=i/16;
1231
1232                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1233                 ctr += (unsigned int)j;
1234                 if (is_endian.little)
1235                         PUTU32(ctx->Yi.c+12,ctr);
1236                 else
1237                         ctx->Yi.d[3] = ctr;
1238                 in  += i;
1239                 len -= i;
1240 #if defined(GHASH)
1241                 GHASH(ctx,out,i);
1242                 out += i;
1243 #else
1244                 while (j--) {
1245                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1246                         GCM_MUL(ctx,Xi);
1247                         out += 16;
1248                 }
1249 #endif
1250         }
1251         if (len) {
1252                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1253                 ++ctr;
1254                 if (is_endian.little)
1255                         PUTU32(ctx->Yi.c+12,ctr);
1256                 else
1257                         ctx->Yi.d[3] = ctr;
1258                 while (len--) {
1259                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1260                         ++n;
1261                 }
1262         }
1263
1264         ctx->mres = n;
1265         return 0;
1266 }
1267
1268 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1269                 const unsigned char *in, unsigned char *out,
1270                 size_t len,ctr128_f stream)
1271 {
1272         const union { long one; char little; } is_endian = {1};
1273         unsigned int n, ctr;
1274         size_t i;
1275         u64 mlen = ctx->len.u[1];
1276 #ifdef GCM_FUNCREF_4BIT
1277         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1278 # ifdef GHASH
1279         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1280                                 const u8 *inp,size_t len) = ctx->ghash;
1281 # endif
1282 #endif
1283
1284         mlen += len;
1285         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1286                 return -1;
1287         ctx->len.u[1] = mlen;
1288
1289         if (ctx->ares) {
1290                 /* First call to decrypt finalizes GHASH(AAD) */
1291                 GCM_MUL(ctx,Xi);
1292                 ctx->ares = 0;
1293         }
1294
1295         if (is_endian.little)
1296                 ctr = GETU32(ctx->Yi.c+12);
1297         else
1298                 ctr = ctx->Yi.d[3];
1299
1300         n = ctx->mres;
1301         if (n) {
1302                 while (n && len) {
1303                         u8 c = *(in++);
1304                         *(out++) = c^ctx->EKi.c[n];
1305                         ctx->Xi.c[n] ^= c;
1306                         --len;
1307                         n = (n+1)%16;
1308                 }
1309                 if (n==0) GCM_MUL (ctx,Xi);
1310                 else {
1311                         ctx->mres = n;
1312                         return 0;
1313                 }
1314         }
1315 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1316         while (len>=GHASH_CHUNK) {
1317                 GHASH(ctx,in,GHASH_CHUNK);
1318                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1319                 ctr += GHASH_CHUNK/16;
1320                 if (is_endian.little)
1321                         PUTU32(ctx->Yi.c+12,ctr);
1322                 else
1323                         ctx->Yi.d[3] = ctr;
1324                 out += GHASH_CHUNK;
1325                 in  += GHASH_CHUNK;
1326                 len -= GHASH_CHUNK;
1327         }
1328 #endif
1329         if ((i = (len&(size_t)-16))) {
1330                 size_t j=i/16;
1331
1332 #if defined(GHASH)
1333                 GHASH(ctx,in,i);
1334 #else
1335                 while (j--) {
1336                         size_t k;
1337                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1338                         GCM_MUL(ctx,Xi);
1339                         in += 16;
1340                 }
1341                 j   = i/16;
1342                 in -= i;
1343 #endif
1344                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1345                 ctr += (unsigned int)j;
1346                 if (is_endian.little)
1347                         PUTU32(ctx->Yi.c+12,ctr);
1348                 else
1349                         ctx->Yi.d[3] = ctr;
1350                 out += i;
1351                 in  += i;
1352                 len -= i;
1353         }
1354         if (len) {
1355                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1356                 ++ctr;
1357                 if (is_endian.little)
1358                         PUTU32(ctx->Yi.c+12,ctr);
1359                 else
1360                         ctx->Yi.d[3] = ctr;
1361                 while (len--) {
1362                         u8 c = in[n];
1363                         ctx->Xi.c[n] ^= c;
1364                         out[n] = c^ctx->EKi.c[n];
1365                         ++n;
1366                 }
1367         }
1368
1369         ctx->mres = n;
1370         return 0;
1371 }
1372
1373 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1374                         size_t len)
1375 {
1376         const union { long one; char little; } is_endian = {1};
1377         u64 alen = ctx->len.u[0]<<3;
1378         u64 clen = ctx->len.u[1]<<3;
1379 #ifdef GCM_FUNCREF_4BIT
1380         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1381 #endif
1382
1383         if (ctx->mres)
1384                 GCM_MUL(ctx,Xi);
1385
1386         if (is_endian.little) {
1387 #ifdef BSWAP8
1388                 alen = BSWAP8(alen);
1389                 clen = BSWAP8(clen);
1390 #else
1391                 u8 *p = ctx->len.c;
1392
1393                 ctx->len.u[0] = alen;
1394                 ctx->len.u[1] = clen;
1395
1396                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1397                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1398 #endif
1399         }
1400
1401         ctx->Xi.u[0] ^= alen;
1402         ctx->Xi.u[1] ^= clen;
1403         GCM_MUL(ctx,Xi);
1404
1405         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1406         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1407
1408         if (tag && len<=sizeof(ctx->Xi))
1409                 return memcmp(ctx->Xi.c,tag,len);
1410         else
1411                 return -1;
1412 }
1413
1414 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1415 {
1416         CRYPTO_gcm128_finish(ctx, NULL, 0);
1417         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1418 }
1419
1420 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1421 {
1422         GCM128_CONTEXT *ret;
1423
1424         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1425                 CRYPTO_gcm128_init(ret,key,block);
1426
1427         return ret;
1428 }
1429
1430 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1431 {
1432         if (ctx) {
1433                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1434                 OPENSSL_free(ctx);
1435         }
1436 }
1437
1438 #if defined(SELFTEST)
1439 #include <stdio.h>
1440 #include <openssl/aes.h>
1441
1442 /* Test Case 1 */
     /*
      * K1 (16 bytes) and IV1 (12 bytes) have no initializer and are therefore
      * all zero; P1, A1 and C1 are NULL; T1 is 16 bytes.  Presumably
      * K/P/A/IV/C/T stand for key, plaintext, AAD, IV, ciphertext and tag -
      * the code consuming these vectors is outside this view.
      */
1443 static const u8 K1[16],
1444                 *P1=NULL,
1445                 *A1=NULL,
1446                 IV1[12],
1447                 *C1=NULL,
1448                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1449
1450 /* Test Case 2 */
     /*
      * Reuses K1, A1 and IV1 through the K2/A2/IV2 aliases.  P2 is 16 bytes
      * with no initializer (all zero); C2 and T2 are 16 bytes each.
      */
1451 #define K2 K1
1452 #define A2 A1
1453 #define IV2 IV1
1454 static const u8 P2[16],
1455                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1456                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1457
1458 /* Test Case 3 */
     /*
      * A3 aliases A2.  K3 is 16 bytes, P3 and C3 are 64 bytes each, IV3 is
      * 12 bytes and T3 is 16 bytes.
      */
1459 #define A3 A2
1460 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1461                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1462                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1463                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1464                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1465                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1466                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1467                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1468                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1469                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1470                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1471
1472 /* Test Case 4 */
     /*
      * Reuses K3 and IV3 through the K4/IV4 aliases.  P4 and C4 are 60 bytes
      * each (not a multiple of 16), A4 is 20 bytes and T4 is 16 bytes.
      */
1473 #define K4 K3
1474 #define IV4 IV3
1475 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1476                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1477                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1478                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1479                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1480                         0xab,0xad,0xda,0xd2},
1481                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1482                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1483                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1484                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1485                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1486
1487 /* Test Case 5 */
     /*
      * Reuses K4, P4 and A4 through the K5/P5/A5 aliases.  IV5 is 8 bytes,
      * i.e. not the 12-byte length that CRYPTO_gcm128_setiv special-cases;
      * C5 is 60 bytes and T5 is 16 bytes.
      */
1488 #define K5 K4
1489 #define P5 P4
1490 #define A5 A4
1491 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1492                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1493                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1494                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1495                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1496                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1497
1498 /* Test Case 6 */
     /*
      * Reuses K5, P5 and A5 through the K6/P6/A6 aliases.  IV6 is 60 bytes,
      * longer than one 16-byte block; C6 is 60 bytes and T6 is 16 bytes.
      */
1499 #define K6 K5
1500 #define P6 P5
1501 #define A6 A5
1502 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1503                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1504                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1505                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1506                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1507                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1508                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1509                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1510                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1511
1512 /* Test Case 7 */
     /*
      * Same shape as Test Case 1 but with a 24-byte key: K7 (24 bytes) and
      * IV7 (12 bytes) have no initializer and are therefore all zero;
      * P7, A7 and C7 are NULL; T7 is 16 bytes.
      */
1513 static const u8 K7[24],
1514                 *P7=NULL,
1515                 *A7=NULL,
1516                 IV7[12],
1517                 *C7=NULL,
1518                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1519
1520 /* Test Case 8 */
     /*
      * Reuses K7, IV7 and A7 through the K8/IV8/A8 aliases.  P8 is 16 bytes
      * with no initializer (all zero); C8 and T8 are 16 bytes each.
      */
1521 #define K8 K7
1522 #define IV8 IV7
1523 #define A8 A7
1524 static const u8 P8[16],
1525                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1526                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1527
1528 /* Test Case 9 */
     /*
      * A9 aliases A8.  K9 is 24 bytes, P9 and C9 are 64 bytes each, IV9 is
      * 12 bytes and T9 is 16 bytes.
      */
1529 #define A9 A8
1530 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1531                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1532                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1533                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1534                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1535                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1536                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1537                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1538                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1539                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1540                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1541                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1542
1543 /* Test Case 10 */
     /*
      * Reuses K9 and IV9 through the K10/IV10 aliases.  P10 and C10 are
      * 60 bytes each (not a multiple of 16), A10 is 20 bytes and T10 is
      * 16 bytes.
      */
1544 #define K10 K9
1545 #define IV10 IV9
1546 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1547                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1548                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1549                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1550                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1551                         0xab,0xad,0xda,0xd2},
1552                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1553                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1554                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1555                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1556                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1557
1558 /* Test Case 11 */
1559 #define K11 K10
1560 #define P11 P10
1561 #define A11 A10
1562 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1563                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1564                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1565                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1566                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1567                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1568
1569 /* Test Case 12 */
1570 #define K12 K11
1571 #define P12 P11
1572 #define A12 A11
1573 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1574                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1575                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1576                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1577                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1578                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1579                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1580                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1581                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1582
1583 /* Test Case 13 */
1584 static const u8 K13[32],
1585                 *P13=NULL,
1586                 *A13=NULL,
1587                 IV13[12],
1588                 *C13=NULL,
1589                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1590
1591 /* Test Case 14 */
1592 #define K14 K13
1593 #define A14 A13
1594 static const u8 P14[16],
1595                 IV14[12],
1596                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1597                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1598
1599 /* Test Case 15 */
1600 #define A15 A14
1601 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1602                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1603                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1604                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1605                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1606                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1607                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1608                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1609                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1610                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1611                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1612                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1613
1614 /* Test Case 16 */
1615 #define K16 K15
1616 #define IV16 IV15
1617 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1618                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1619                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1620                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1621                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1622                         0xab,0xad,0xda,0xd2},
1623                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1624                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1625                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1626                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1627                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1628
1629 /* Test Case 17 */
1630 #define K17 K16
1631 #define P17 P16
1632 #define A17 A16
1633 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1634                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1635                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1636                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1637                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1638                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1639
1640 /* Test Case 18 */
1641 #define K18 K17
1642 #define P18 P17
1643 #define A18 A17
1644 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1645                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1646                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1647                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1648                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1649                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1650                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1651                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1652                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1653
/*
 * TEST_CASE(n) - run GCM test vector number n in both directions.
 *
 * Expects K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> to be visible, together
 * with `ctx', `key' and `ret' in the enclosing scope.  A<n>, P<n> and C<n>
 * may be NULL pointers, in which case the corresponding step is skipped.
 * All lengths are taken with sizeof, so the scratch buffer and every
 * processed length follow the declared size of the vector objects.
 *
 * For each direction the 16-byte tag T<n> is checked first; only when it
 * matches is the produced data compared against the expected buffer.  A
 * failing direction increments `ret' and prints one diagnostic line.
 */
#define TEST_CASE(n)    do {                                            \
        u8 res[sizeof(P##n)];                                           \
        int failed;                                                     \
                                                                        \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
                                                                        \
        /* encrypt direction: P -> res, expect C and T */               \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        memset(res,0,sizeof(res));                                      \
        if (A##n)                                                       \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        if (P##n)                                                       \
                CRYPTO_gcm128_encrypt(&ctx,P##n,res,sizeof(res));       \
        failed = (CRYPTO_gcm128_finish(&ctx,T##n,16) != 0);             \
        if (!failed && C##n)                                            \
                failed = (memcmp(res,C##n,sizeof(res)) != 0);           \
        if (failed) {                                                   \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n",n);                 \
        }                                                               \
                                                                        \
        /* decrypt direction: C -> res, expect P and T */               \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        memset(res,0,sizeof(res));                                      \
        if (A##n)                                                       \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        if (C##n)                                                       \
                CRYPTO_gcm128_decrypt(&ctx,C##n,res,sizeof(res));       \
        failed = (CRYPTO_gcm128_finish(&ctx,T##n,16) != 0);             \
        if (!failed && P##n)                                            \
                failed = (memcmp(res,P##n,sizeof(res)) != 0);           \
        if (failed) {                                                   \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n",n);                 \
        }                                                               \
        } while(0)
1673
1674 int main()
1675 {
1676         GCM128_CONTEXT ctx;
1677         AES_KEY key;
1678         int ret=0;
1679
1680         TEST_CASE(1);
1681         TEST_CASE(2);
1682         TEST_CASE(3);
1683         TEST_CASE(4);
1684         TEST_CASE(5);
1685         TEST_CASE(6);
1686         TEST_CASE(7);
1687         TEST_CASE(8);
1688         TEST_CASE(9);
1689         TEST_CASE(10);
1690         TEST_CASE(11);
1691         TEST_CASE(12);
1692         TEST_CASE(13);
1693         TEST_CASE(14);
1694         TEST_CASE(15);
1695         TEST_CASE(16);
1696         TEST_CASE(17);
1697         TEST_CASE(18);
1698
1699 #ifdef OPENSSL_CPUID_OBJ
1700         {
1701         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1702         union { u64 u; u8 c[1024]; } buf;
1703         int i;
1704
1705         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1706         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1707         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1708
1709         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1710         start = OPENSSL_rdtsc();
1711         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1712         gcm_t = OPENSSL_rdtsc() - start;
1713
1714         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1715                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1716                         (block128_f)AES_encrypt);
1717         start = OPENSSL_rdtsc();
1718         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1719                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1720                         (block128_f)AES_encrypt);
1721         ctr_t = OPENSSL_rdtsc() - start;
1722
1723         printf("%.2f-%.2f=%.2f\n",
1724                         gcm_t/(double)sizeof(buf),
1725                         ctr_t/(double)sizeof(buf),
1726                         (gcm_t-ctr_t)/(double)sizeof(buf));
1727 #ifdef GHASH
1728         GHASH(&ctx,buf.c,sizeof(buf));
1729         start = OPENSSL_rdtsc();
1730         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1731         gcm_t = OPENSSL_rdtsc() - start;
1732         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1733 #endif
1734         }
1735 #endif
1736
1737         return ret;
1738 }
1739 #endif