/*
 * Backport of GCM support from HEAD. Minimal support at present: no assembly.
 * [openssl.git] / crypto / modes / gcm128.c
 */
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK places a 16-bit reduction constant in the top 16 bits of a size_t,
 * so the rem_4bit/rem_8bit tables below can be XOR-ed into the high word
 * of the accumulator with one operation on both 32- and 64-bit targets.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT shifts the 128-bit value V right by one bit and reduces it
 * modulo the GCM polynomial (bit-reflected representation 0xe1 in the
 * top byte).  The conditional XOR is expressed as a mask derived from
 * the shifted-out bit, so the operation is branch-free.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * Multiply Xi (16 bytes, big-endian as stored) by the hash subkey H in
 * GF(2^128), using the 8-bit per-key table built by gcm_init_8bit.
 * Xi is processed one byte at a time from its last byte to its first;
 * each step XORs in the table entry for the current byte and then
 * shifts the accumulator right by 8 bits, folding the shifted-out byte
 * back in via the shared rem_8bit reduction table.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from last byte */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        /*
         * rem_8bit[b] is the reduction constant for an 8-bit tail b,
         * pre-shifted into the top 16 bits of a size_t by PACK().
         */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                Z.hi ^= Htable[n].hi;   /* Z ^= n * H */
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* all 16 bytes consumed */

                n = *(--xi);            /* next byte toward Xi[0] */

                rem  = (size_t)Z.lo&0xff;       /* byte about to be shifted out */
                Z.lo = (Z.hi<<56)|(Z.lo>>8);    /* Z >>= 8 across the 128 bits */
                Z.hi = (Z.hi>>8);
                /* fold shifted-out byte back in; constant sits in top 16
                 * bits of size_t, hence the extra <<32 on 32-bit builds */
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* Store the result back into Xi in big-endian byte order. */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
/*
 * Populate the 16-entry ("4-bit Shoup's") lookup table: Htable[i] holds
 * the GF(2^128) product i*H for every 4-bit multiplier i, where H is
 * the hash subkey in host byte order.  Two variants: a compact loop
 * (OPENSSL_SMALL_FOOTPRINT) and an unrolled sequence; both compute the
 * powers of two by repeated REDUCE1BIT and fill the rest by linearity.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* Htable[8] = H; halving the index is one modular right shift. */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        /* Composite entries by linearity: (2^k + j)*H = 2^k*H ^ j*H. */
        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* Unrolled version of the same computation. */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;    /* swap the two dwords */
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        /* rotate each dword by 32 bits */
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
327
328 #ifndef GHASH_ASM
/*
 * rem_4bit[i] is the 16-bit reduction constant for a 4-bit tail i,
 * pre-shifted into the top 16 bits of a size_t by PACK() so it can be
 * XOR-ed straight into the high word of the accumulator.
 */
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
/*
 * Multiply Xi (16 bytes, big-endian as stored) by the hash subkey H in
 * GF(2^128) using the 4-bit table from gcm_init_4bit.  Each byte of Xi
 * is processed as two nibbles (low nibble first, then high), from the
 * last byte toward the first; after each nibble the accumulator is
 * shifted right by 4 bits and the shifted-out nibble folded back in via
 * rem_4bit.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        /* Split the last byte into its two nibbles. */
        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* Z = (Z >> 4) reduced, then XOR in nhi*H. */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* all 16 bytes consumed */

                /* Next byte of Xi, again as two nibbles. */
                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* Store the result back into Xi in big-endian byte order. */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
/*
 * Streamed GHASH: for each 16-byte block of inp, XOR it into Xi and
 * multiply by H (4-bit table method), leaving the running hash in Xi.
 * len must be a non-zero multiple of 16 (the do/while consumes exactly
 * 16 bytes per iteration).  The "#if 1" branch is the variant actually
 * compiled; the "#else" branch is an experimental hybrid kept as
 * reference (see comment inside).
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        /* Xi ^= block, then one gmult pass — same steps as
         * gcm_gmult_4bit but with inp[] folded in on the fly. */
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        /* Bytes 15..1: shift by a whole byte per step, reducing the
         * 8-bit tail via the shared rem_8bit table. */
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        /* Byte 0: final 4-bit shift/reduce step. */
        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* Store the running hash back into Xi in big-endian order. */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578  * trashing effect. In other words idea is to hash data while it's
579  * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582
583 #else   /* TABLE_BITS */
584
/*
 * Bit-serial GF(2^128) multiply of Xi by H (TABLE_BITS==1 fallback):
 * no lookup tables, 128 iterations of mask-and-XOR plus one REDUCE1BIT
 * per message bit.  Xi is read one long at a time, converted to host
 * bit order first so the most significant bit is processed first.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
        u128 V,Z = { 0,0 };
        long X;
        int  i,j;
        const long *xi = (const long *)Xi;
        const union { long one; char little; } is_endian = {1};

        V.hi = H[0];    /* H is in host byte order, no byte swapping */
        V.lo = H[1];

        for (j=0; j<16/sizeof(long); ++j) {
                /* Load the next long of Xi in host byte order. */
                if (is_endian.little) {
                        if (sizeof(long)==8) {
#ifdef BSWAP8
                                X = (long)(BSWAP8(xi[j]));
#else
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
                        }
                        else {
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)GETU32(p);
                        }
                }
                else
                        X = xi[j];

                for (i=0; i<8*sizeof(long); ++i, X<<=1) {
                        /*
                         * M is all-ones when the top bit of X is set, else
                         * zero, so the XOR below is branch-free.
                         * NOTE(review): X>>(width-1) on a signed long relies
                         * on arithmetic right shift (implementation-defined),
                         * and X<<=1 on a negative X is formally UB; in
                         * practice every supported compiler produces the
                         * intended result — confirm before reuse elsewhere.
                         */
                        u64 M = (u64)(X>>(8*sizeof(long)-1));
                        Z.hi ^= V.hi&M;
                        Z.lo ^= V.lo&M;

                        REDUCE1BIT(V);
                }
        }

        /* Store the result back into Xi in big-endian byte order. */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
645 #if     TABLE_BITS==4 && defined(GHASH_ASM)
646 # if    !defined(I386_ONLY) && \
647         (defined(__i386)        || defined(__i386__)    || \
648          defined(__x86_64)      || defined(__x86_64__)  || \
649          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
650 #  define GHASH_ASM_X86_OR_64
651 #  define GCM_FUNCREF_4BIT
652 extern unsigned int OPENSSL_ia32cap_P[2];
653
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 #   define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 #  endif
666 # elif defined(__arm__) || defined(__arm)
667 #  include "arm_arch.h"
668 #  if __ARM_ARCH__>=7
669 #   define GHASH_ASM_ARM
670 #   define GCM_FUNCREF_4BIT
671 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
672 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
673 #  endif
674 # endif
675 #endif
676
677 #ifdef GCM_FUNCREF_4BIT
678 # undef  GCM_MUL
679 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
680 # ifdef GHASH
681 #  undef  GHASH
682 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
683 # endif
684 #endif
685
/*
 * Initialize a GCM context for the given block cipher and key: compute
 * the hash subkey H = E(K, 0^128), convert it to host byte order, and
 * precompute the GHASH lookup table, selecting the fastest GHASH
 * implementation available for this CPU (PCLMULQDQ, MMX, NEON or the
 * portable 4-/8-bit table code, depending on build configuration).
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E(K, 0^128); ctx->H.c is all-zero after the memset above. */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                /* Carry-less multiply hardware: no table build needed
                 * beyond gcm_init_clmul's own layout. */
                gcm_init_clmul(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_clmul;
                ctx->ghash = gcm_ghash_clmul;
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
751
/* Begin a new GCM operation: reset the running GHASH/counter state and
 * derive the pre-counter block Y0 from the IV.  A 96-bit IV is used
 * directly with the 32-bit counter set to 1 (the recommended fast path);
 * any other IV length is absorbed through GHASH together with its bit
 * length.  EK0 = E(K,Y0) is computed here and kept for the final tag
 * masking in CRYPTO_gcm128_finish. */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* Reset all per-message state. */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;		/* no partial AAD block pending */
	ctx->mres = 0;		/* no partial message block pending */

	if (len==12) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1, no GHASH needed */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* other lengths: Y0 = GHASH(IV padded to blocks || [bitlen]) */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			/* final partial block, implicitly zero-padded */
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
			/* fold the big-endian 64-bit length into Yi[8..15] */
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* extract the 32-bit counter from the derived Y0 */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 is used to mask the tag; then advance Yi to the first
	 * counter block used for encryption. */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
821
/* Feed additional authenticated data (AAD) into the GHASH state.  May be
 * called multiple times, but only before any encrypt/decrypt call.
 * Returns 0 on success, -2 if message data has already been processed,
 * -1 if the accumulated AAD length exceeds 2^61 bytes (2^64 bits) or the
 * length counter overflows.  A trailing partial block is kept pending in
 * Xi with its byte count in ctx->ares. */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;	/* AAD must precede message data */

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* complete a partial block left over from a previous call */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-hash all whole blocks at once */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* stash any trailing partial block in Xi; multiplied later */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
878
/* Encrypt len bytes from in to out (may be called repeatedly; a partial
 * last block is carried across calls via ctx->mres) and fold the
 * resulting ciphertext into the GHASH state.  Returns 0 on success, -1
 * if the total message length would exceed 2^36-32 bytes (the GCM
 * limit for a 32-bit counter) or the length counter overflows. */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pick up the 32-bit counter from Yi in host order */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;	/* bytes of keystream EKi already consumed */
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* finish the partial block left from the previous call */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-sized XOR below needs aligned pointers; fall back
		 * to the byte-wise loop after the do/while otherwise */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* fastest path: encrypt a whole chunk, then GHASH the
		 * produced ciphertext in one bulk call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, then one bulk GHASH */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no bulk GHASH: hash each ciphertext block as it is made */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block: XOR into Xi but defer the
		 * multiplication until the block is completed or finish */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-at-a-time path (small footprint / unaligned) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1028
/* Decrypt len bytes from in to out.  Mirrors CRYPTO_gcm128_encrypt,
 * except that GHASH is computed over the incoming CIPHERTEXT (so it is
 * folded in before, or from, the input rather than the output).
 * Returns 0 on success, -1 if the total message length would exceed
 * 2^36-32 bytes or the length counter overflows. */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;	/* bytes of keystream EKi already consumed */
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* finish the partial block left from the previous call;
		 * the ciphertext byte must be saved before out is written
		 * in case in and out alias */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-sized XOR below needs aligned pointers */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* hash the ciphertext chunk first, then decrypt it */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* no bulk GHASH: hash each ciphertext block in place */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block; multiplication is deferred */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-at-a-time path (small footprint / unaligned) */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1181
/* Like CRYPTO_gcm128_encrypt, but the bulk CTR encryption is delegated
 * to a caller-supplied ctr128_f 'stream' routine that processes whole
 * 16-byte blocks with a 32-bit big-endian counter (e.g. an accelerated
 * AES-CTR implementation).  ctx->block is still used for the final
 * partial block.  Returns 0 on success, -1 on length overflow. */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;	/* bytes of keystream EKi already consumed */
	if (n) {
		/* finish the partial block from the previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* stream-encrypt a chunk, then bulk-GHASH the ciphertext */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* trailing partial block via the plain block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1280
/* Like CRYPTO_gcm128_decrypt, but the bulk CTR decryption is delegated
 * to a caller-supplied ctr128_f 'stream' routine.  GHASH is computed
 * over the incoming ciphertext before it is decrypted.  ctx->block is
 * still used for the final partial block.  Returns 0 on success, -1 on
 * length overflow. */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;	/* bytes of keystream EKi already consumed */
	if (n) {
		/* finish the partial block from the previous call */
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk-GHASH the ciphertext chunk, then stream-decrypt it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* rewind: the stream call below consumes in from the start */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* trailing partial block via the plain block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1386
1387 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1388                         size_t len)
1389 {
1390         const union { long one; char little; } is_endian = {1};
1391         u64 alen = ctx->len.u[0]<<3;
1392         u64 clen = ctx->len.u[1]<<3;
1393 #ifdef GCM_FUNCREF_4BIT
1394         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1395 #endif
1396
1397         if (ctx->mres)
1398                 GCM_MUL(ctx,Xi);
1399
1400         if (is_endian.little) {
1401 #ifdef BSWAP8
1402                 alen = BSWAP8(alen);
1403                 clen = BSWAP8(clen);
1404 #else
1405                 u8 *p = ctx->len.c;
1406
1407                 ctx->len.u[0] = alen;
1408                 ctx->len.u[1] = clen;
1409
1410                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1411                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1412 #endif
1413         }
1414
1415         ctx->Xi.u[0] ^= alen;
1416         ctx->Xi.u[1] ^= clen;
1417         GCM_MUL(ctx,Xi);
1418
1419         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1420         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1421
1422         if (tag && len<=sizeof(ctx->Xi))
1423                 return memcmp(ctx->Xi.c,tag,len);
1424         else
1425                 return -1;
1426 }
1427
1428 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1429 {
1430         CRYPTO_gcm128_finish(ctx, NULL, 0);
1431         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1432 }
1433
1434 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1435 {
1436         GCM128_CONTEXT *ret;
1437
1438         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1439                 CRYPTO_gcm128_init(ret,key,block);
1440
1441         return ret;
1442 }
1443
1444 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1445 {
1446         if (ctx) {
1447                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1448                 OPENSSL_free(ctx);
1449         }
1450 }
1451
1452 #if defined(SELFTEST)
1453 #include <stdio.h>
1454 #include <openssl/aes.h>
1455
/* Test Case 1 */
/* AES-128 all-zero key, empty plaintext and AAD, 96-bit zero IV;
 * only the expected tag T1 is checked.  (Apparently the standard GCM
 * test vectors; cf. McGrew & Viega, "The Galois/Counter Mode".) */
static const u8 K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1463
/* Test Case 2 */
/* Same key/IV as case 1, but a single all-zero plaintext block. */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8 P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1471
/* Test Case 3 */
/* AES-128, 64-byte plaintext, no AAD, 96-bit IV. */
#define A3 A2
static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1485
/* Test Case 4 */
/* AES-128, 60-byte plaintext (exercises a partial final block),
 * 20 bytes of AAD, 96-bit IV. */
#define K4 K3
#define IV4 IV3
static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1500
/* Test Case 5 */
/* Same as case 4 but with a short (64-bit) IV, exercising the
 * GHASH-based IV derivation path in CRYPTO_gcm128_setiv. */
#define K5 K4
#define P5 P4
#define A5 A4
static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1511
/* Test Case 6 */
/* Same as case 4 but with a long (60-byte) IV, again exercising the
 * GHASH-based IV derivation path. */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1525
/* Test Case 7 */
/* AES-192 (24-byte key) all-zero key, empty plaintext and AAD,
 * 96-bit zero IV. */
static const u8 K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1533
/* Test Case 8 */
/* AES-192, single all-zero plaintext block. */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1541
/* Test Case 9 */
/* AES-192, 64-byte plaintext, no AAD, 96-bit IV. */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1556
1557 /* Test Case 10 */
1558 #define K10 K9
1559 #define IV10 IV9
1560 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1561                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1562                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1563                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1564                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1565                         0xab,0xad,0xda,0xd2},
1566                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1567                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1568                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1569                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1570                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1571
1572 /* Test Case 11 */
1573 #define K11 K10
1574 #define P11 P10
1575 #define A11 A10
1576 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1577                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1578                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1579                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1580                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1581                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1582
1583 /* Test Case 12 */
1584 #define K12 K11
1585 #define P12 P11
1586 #define A12 A11
1587 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1588                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1589                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1590                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1591                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1592                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1593                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1594                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1595                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1596
1597 /* Test Case 13 */
1598 static const u8 K13[32],
1599                 *P13=NULL,
1600                 *A13=NULL,
1601                 IV13[12],
1602                 *C13=NULL,
1603                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1604
1605 /* Test Case 14 */
1606 #define K14 K13
1607 #define A14 A13
1608 static const u8 P14[16],
1609                 IV14[12],
1610                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1611                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1612
1613 /* Test Case 15 */
1614 #define A15 A14
1615 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1616                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1617                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1618                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1619                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1620                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1621                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1622                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1623                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1624                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1625                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1626                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1627
1628 /* Test Case 16 */
1629 #define K16 K15
1630 #define IV16 IV15
1631 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1632                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1633                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1634                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1635                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1636                         0xab,0xad,0xda,0xd2},
1637                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1638                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1639                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1640                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1641                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1642
1643 /* Test Case 17 */
1644 #define K17 K16
1645 #define P17 P16
1646 #define A17 A16
1647 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1648                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1649                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1650                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1651                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1652                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1653
1654 /* Test Case 18 */
1655 #define K18 K17
1656 #define P18 P17
1657 #define A18 A17
1658 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1659                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1660                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1661                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1662                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1663                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1664                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1665                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1666                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1667
/*
 * Run GCM test vector number n in both directions:
 *   - encrypt P<n> and compare the result against C<n> and the tag T<n>,
 *   - decrypt C<n> and compare the result against P<n> and the tag T<n>.
 * Any mismatch increments `ret` and prints a diagnostic.  Relies on the
 * K<n>/IV<n>/A<n>/P<n>/C<n>/T<n> naming convention above and on `ctx`,
 * `key` and `ret` being in scope at the expansion site.  P<n>, A<n> and
 * C<n> may be NULL for the empty-input cases, hence the `if (X##n)`
 * guards; for NULL P<n>, sizeof(P##n) is only the pointer size, which is
 * harmless because both the encrypt call and the memcmp are skipped.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        /* fail if either the 16-byte tag or the ciphertext is wrong */ \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        /* second pass: same vector in the decrypt direction */ \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1687
1688 int main()
1689 {
1690         GCM128_CONTEXT ctx;
1691         AES_KEY key;
1692         int ret=0;
1693
1694         TEST_CASE(1);
1695         TEST_CASE(2);
1696         TEST_CASE(3);
1697         TEST_CASE(4);
1698         TEST_CASE(5);
1699         TEST_CASE(6);
1700         TEST_CASE(7);
1701         TEST_CASE(8);
1702         TEST_CASE(9);
1703         TEST_CASE(10);
1704         TEST_CASE(11);
1705         TEST_CASE(12);
1706         TEST_CASE(13);
1707         TEST_CASE(14);
1708         TEST_CASE(15);
1709         TEST_CASE(16);
1710         TEST_CASE(17);
1711         TEST_CASE(18);
1712
1713 #ifdef OPENSSL_CPUID_OBJ
1714         {
1715         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1716         union { u64 u; u8 c[1024]; } buf;
1717         int i;
1718
1719         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1720         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1721         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1722
1723         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1724         start = OPENSSL_rdtsc();
1725         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1726         gcm_t = OPENSSL_rdtsc() - start;
1727
1728         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1729                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1730                         (block128_f)AES_encrypt);
1731         start = OPENSSL_rdtsc();
1732         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1733                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1734                         (block128_f)AES_encrypt);
1735         ctr_t = OPENSSL_rdtsc() - start;
1736
1737         printf("%.2f-%.2f=%.2f\n",
1738                         gcm_t/(double)sizeof(buf),
1739                         ctr_t/(double)sizeof(buf),
1740                         (gcm_t-ctr_t)/(double)sizeof(buf));
1741 #ifdef GHASH
1742         GHASH(&ctx,buf.c,sizeof(buf));
1743         start = OPENSSL_rdtsc();
1744         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1745         gcm_t = OPENSSL_rdtsc() - start;
1746         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1747 #endif
1748         }
1749 #endif
1750
1751         return ret;
1752 }
1753 #endif