Two more symbol renames.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK positions a 16-bit constant in the MOST significant 16 bits of
 * a size_t, so the same rem_4bit/rem_8bit tables work for both 32- and
 * 64-bit size_t (callers shift the value into place accordingly).
 */
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): divide the 128-bit field element V by x, i.e. shift
 * V right by one bit and, if a 1 was shifted out of the low end, fold
 * in the GCM reduction polynomial 0xE1 || 0^120.  The constant-time
 * mask (0-(V.lo&1)) avoids a data-dependent branch; the sizeof test
 * picks a 64- or 32-bit constant at compile time.
 */
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * gcm_gmult_8bit: Xi := Xi * H in GF(2^128), using the 256-entry table
 * built by gcm_init_8bit.  Xi is stored big-endian (wire order); the
 * loop consumes its 16 bytes starting from the least significant end.
 */
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
        /*
         * rem_8bit[r]: reduction constants XORed into the top of Z.hi
         * after each 8-bit right shift of Z; PACK positions each value
         * for the native size_t width (see the sizeof test below).
         */
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
        /*
         * One byte of Xi per iteration: accumulate Htable[n] into Z,
         * then (between bytes) shift Z right by 8 bits and fold the
         * shifted-out byte back in through rem_8bit.
         */
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
        /* Store Z back to Xi in big-endian order. */
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
/*
 * Build the 16-entry Shoup table for the 4-bit GHASH variant:
 * Htable[i] = i*H in GF(2^128) for i = 0..15.  H (host byte order)
 * lands in Htable[8]; REDUCE1BIT derives entries 4, 2 and 1; the rest
 * follow by XOR-linearity.  The OPENSSL_SMALL_FOOTPRINT branch does
 * the same with loops; the default branch is fully unrolled.
 */
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
        /* Little-endian: swap hi/lo dwords; big-endian: rotate each dword
         * by 32 bits — matching the layout the ARM GHASH assembly reads. */
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
327
328 #ifndef GHASH_ASM
/*
 * rem_4bit[r]: reduction constants XORed into the top of Z.hi after
 * each 4-bit right shift in the 4-bit GHASH routines below.  PACK
 * positions each 16-bit value for the native size_t width; 32-bit
 * builds additionally shift by 32 at the point of use.
 */
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
/*
 * gcm_gmult_4bit: Xi := Xi * H in GF(2^128) via the 16-entry Shoup
 * table from gcm_init_4bit.  Xi is big-endian; its 16 bytes are
 * consumed from the least significant end, two 4-bit nibbles per
 * byte (low nibble `nlo`, high nibble `nhi`), with a rem_4bit
 * reduction after each 4-bit shift of the accumulator Z.
 */
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
        /* Each pass folds in one high nibble then (after fetching the
         * next byte) one low nibble; loop exits after byte 0's nibbles. */
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
        /* Store Z back to Xi in big-endian order. */
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
/*
 * gcm_ghash_4bit: hash `len` bytes from `inp` into Xi, one 16-byte
 * block at a time:  Xi = (Xi ^ block) * H, using the 4-bit tables.
 * NOTE(review): the do/while tail (inp+=16, len-=16) assumes len is a
 * non-zero multiple of 16 — confirm against callers (see GHASH uses).
 * The `#if 1` branch is the plain streamed 4-bit implementation; the
 * `#else` branch is an experimental variant with extra per-key tables,
 * kept for reference.
 */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
        /* XOR the input block into Xi nibble-wise on the fly, starting
         * with byte 15, then run the same shift/reduce loop as
         * gcm_gmult_4bit. */
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * Extra 256+16 bytes per-key plus 512 bytes shared tables
458      * [should] give ~50% improvement... One could have PACK()-ed
459      * the rem_8bit even here, but the priority is to minimize
460      * cache footprint...
461      */ 
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows down procedure by approximately
499      * same time as it makes each loop spin faster. In other words
500      * single block performance is approximately same as straightforward
501      * "4-bit" implementation, and then it goes only faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
        /* Store the accumulated Z back to Xi in big-endian order before
         * advancing to the next 16-byte block. */
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578  * trashing effect. In other words idea is to hash data while it's
579  * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582
583 #else   /* TABLE_BITS */
584
/*
 * Bit-serial GHASH multiply (TABLE_BITS==1): no lookup tables, minimal
 * footprint.  Xi is read one `long` at a time and converted to host
 * order; each bit of Xi (MSB first) conditionally XORs the running
 * multiple V of H into Z via a branch-free mask, with V divided by x
 * (REDUCE1BIT) after every bit.
 */
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
        /* Load the next long of Xi, byte-swapping on little-endian
         * targets (via BSWAP8 or GETU32 composition). */
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
        /* M is all-ones when the current top bit of X is set, else 0. */
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
        /* Store Z back to Xi in big-endian order. */
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
645 #if     TABLE_BITS==4 && defined(GHASH_ASM)
646 # if    !defined(I386_ONLY) && \
647         (defined(__i386)        || defined(__i386__)    || \
648          defined(__x86_64)      || defined(__x86_64__)  || \
649          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
650 #  define GHASH_ASM_X86_OR_64
651 #  define GCM_FUNCREF_4BIT
652 extern unsigned int OPENSSL_ia32cap_P[2];
653
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 #   define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 #  endif
666 # elif defined(__arm__) || defined(__arm)
667 #  include "arm_arch.h"
668 #  if __ARM_ARCH__>=7
669 #   define GHASH_ASM_ARM
670 #   define GCM_FUNCREF_4BIT
671 extern unsigned int OPENSSL_armcap;
672
673 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675 #  endif
676 # endif
677 #endif
678
679 #ifdef GCM_FUNCREF_4BIT
680 # undef  GCM_MUL
681 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
682 # ifdef GHASH
683 #  undef  GHASH
684 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
685 # endif
686 #endif
687
/*
 * Initialize a GCM128_CONTEXT: zero the context, record the cipher
 * callback and key, derive the hash key H by encrypting the all-zero
 * block, convert H to host byte order, then build the GHASH table and
 * — where assembly variants are compiled in — select gmult/ghash
 * function pointers from runtime CPU capability flags
 * (OPENSSL_ia32cap_P / OPENSSL_armcap).
 */
688 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
689 {
690         const union { long one; char little; } is_endian = {1};
691
692         memset(ctx,0,sizeof(*ctx));
693         ctx->block = block;
694         ctx->key   = key;
695
        /* H = E_K(0^128); ctx->H.c starts zeroed by the memset above. */
696         (*block)(ctx->H.c,ctx->H.c,key);
697
698         if (is_endian.little) {
699                 /* H is stored in host byte order */
700 #ifdef BSWAP8
701                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
702                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
703 #else
704                 u8 *p = ctx->H.c;
705                 u64 hi,lo;
706                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
707                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
708                 ctx->H.u[0] = hi;
709                 ctx->H.u[1] = lo;
710 #endif
711         }
712
713 #if     TABLE_BITS==8
714         gcm_init_8bit(ctx->Htable,ctx->H.u);
715 #elif   TABLE_BITS==4
716 # if    defined(GHASH_ASM_X86_OR_64)
717 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
718         if (OPENSSL_ia32cap_P[1]&(1<<1)) {      /* check PCLMULQDQ bit */
719                 gcm_init_clmul(ctx->Htable,ctx->H.u);
720                 ctx->gmult = gcm_gmult_clmul;
721                 ctx->ghash = gcm_ghash_clmul;
722                 return;
723         }
724 #  endif
725         gcm_init_4bit(ctx->Htable,ctx->H.u);
726 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
727         if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
728                 ctx->gmult = gcm_gmult_4bit_mmx;
729                 ctx->ghash = gcm_ghash_4bit_mmx;
730         } else {
731                 ctx->gmult = gcm_gmult_4bit_x86;
732                 ctx->ghash = gcm_ghash_4bit_x86;
733         }
734 #  else
735         ctx->gmult = gcm_gmult_4bit;
736         ctx->ghash = gcm_ghash_4bit;
737 #  endif
738 # elif  defined(GHASH_ASM_ARM)
        /* NOTE(review): bit 0 of OPENSSL_armcap appears to gate the NEON
         * path here — confirm against arm_arch.h capability flags. */
739         if (OPENSSL_armcap & 1) {
740                 ctx->gmult = gcm_gmult_neon;
741                 ctx->ghash = gcm_ghash_neon;
742         } else {
743                 gcm_init_4bit(ctx->Htable,ctx->H.u);
744                 ctx->gmult = gcm_gmult_4bit;
745                 ctx->ghash = gcm_ghash_4bit;
746         }
747 # else
748         gcm_init_4bit(ctx->Htable,ctx->H.u);
749 # endif
750 #endif
751 }
752
/*
 * Reset per-message state and derive the pre-counter block J0 from the
 * IV (NIST SP 800-38D).  A 96-bit IV is used verbatim with the 32-bit
 * counter field set to 1; any other length is GHASHed together with its
 * 64-bit bit-length.  Also caches EK0 = E(K, J0), which the tag
 * computation in CRYPTO_gcm128_finish XORs into the final GHASH value.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* 96-bit IV: J0 = IV || 0^31 || 1, no GHASH needed */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* other lengths: J0 = GHASH(IV || 0-pad || [len(IV)]_64) */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			/* trailing partial block, implicitly zero-padded */
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			/* byte-wise big-endian fold-in when no bswap */
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* initial counter value lives in the last 32 bits of J0 */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, J0), then advance Yi to the first data counter */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
822
/*
 * Feed additional authenticated data into GHASH.  May be called
 * repeatedly, but only before any encrypt/decrypt call: returns -2 once
 * message bytes have been processed (ctx->len.u[1] != 0).  Returns -1
 * if the cumulative AAD length would exceed 2^61 bytes (2^64 bits, the
 * GCM limit), 0 on success.  A trailing partial block is stashed in Xi
 * and its byte count carried in ctx->ares until completed or finalized.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;

	/* update AAD length; second clause catches 64-bit wrap-around */
	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		/* finish the partial block left over from a previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* hash all whole 16-byte blocks with the bulk routine */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	if (len) {
		/* XOR the tail into Xi now; it is multiplied in later */
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
879
/*
 * Encrypt |len| bytes in CTR mode and fold the resulting ciphertext
 * into GHASH.  May be called repeatedly; a partial final block is
 * carried in ctx->mres and completed on the next call.  Returns 0 on
 * success, -1 when the total message length would exceed 2^36-32 bytes
 * (the GCM per-message limit imposed by the 32-bit block counter).
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	/* update message length; second clause catches 64-bit wrap */
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the running 32-bit counter out of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* consume leftover keystream from previous call */
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-sized loads below need aligned pointers */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* encrypt a whole chunk, then GHASH it in one bulk call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, hashed in one trailing call */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no bulk GHASH: multiply after every block */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			/* trailing partial block; residue kept in mres */
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* small-footprint / unaligned fallback: strictly byte-wise */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1029
/*
 * Decrypt |len| bytes in CTR mode.  Mirror image of
 * CRYPTO_gcm128_encrypt except that GHASH is fed the *ciphertext*
 * (i.e. the input), so hashing happens before/alongside decryption.
 * Returns 0 on success, -1 when the total message length would exceed
 * 2^36-32 bytes.  Partial-block residue is carried in ctx->mres.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* update message length; second clause catches 64-bit wrap */
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the running 32-bit counter out of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* consume leftover keystream from previous call */
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-sized loads below need aligned pointers */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* hash the ciphertext chunk first, then decrypt it */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* no bulk GHASH: multiply after every block */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			/* trailing partial block; residue kept in mres */
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* small-footprint / unaligned fallback: strictly byte-wise */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1182
/*
 * Same contract as CRYPTO_gcm128_encrypt, but bulk keystream generation
 * is delegated to |stream|, a counter-mode routine that processes whole
 * 16-byte blocks at a time (e.g. hardware-assisted AES-CTR).  Leftover
 * bytes still go through ctx->block directly.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* update message length; second clause catches 64-bit wrap */
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the running 32-bit counter out of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* consume leftover keystream from previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* encrypt chunk via the stream cipher, then bulk-GHASH it */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	if (len) {
		/* trailing partial block handled with a plain block call */
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1281
/*
 * Same contract as CRYPTO_gcm128_decrypt, but bulk keystream generation
 * is delegated to |stream| (a counter-mode routine processing whole
 * 16-byte blocks).  Ciphertext is GHASHed before it is overwritten in
 * case in == out (in-place decryption).
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* update message length; second clause catches 64-bit wrap */
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the running 32-bit counter out of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* consume leftover keystream from previous call */
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk-GHASH the ciphertext chunk before decrypting it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* rewind so the stream call below sees the same input */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	if (len) {
		/* trailing partial block handled with a plain block call */
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1387
1388 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1389                         size_t len)
1390 {
1391         const union { long one; char little; } is_endian = {1};
1392         u64 alen = ctx->len.u[0]<<3;
1393         u64 clen = ctx->len.u[1]<<3;
1394 #ifdef GCM_FUNCREF_4BIT
1395         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1396 #endif
1397
1398         if (ctx->mres)
1399                 GCM_MUL(ctx,Xi);
1400
1401         if (is_endian.little) {
1402 #ifdef BSWAP8
1403                 alen = BSWAP8(alen);
1404                 clen = BSWAP8(clen);
1405 #else
1406                 u8 *p = ctx->len.c;
1407
1408                 ctx->len.u[0] = alen;
1409                 ctx->len.u[1] = clen;
1410
1411                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1412                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1413 #endif
1414         }
1415
1416         ctx->Xi.u[0] ^= alen;
1417         ctx->Xi.u[1] ^= clen;
1418         GCM_MUL(ctx,Xi);
1419
1420         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1421         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1422
1423         if (tag && len<=sizeof(ctx->Xi))
1424                 return memcmp(ctx->Xi.c,tag,len);
1425         else
1426                 return -1;
1427 }
1428
1429 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1430 {
1431         CRYPTO_gcm128_finish(ctx, NULL, 0);
1432         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1433 }
1434
1435 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1436 {
1437         GCM128_CONTEXT *ret;
1438
1439         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1440                 CRYPTO_gcm128_init(ret,key,block);
1441
1442         return ret;
1443 }
1444
1445 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1446 {
1447         if (ctx) {
1448                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1449                 OPENSSL_free(ctx);
1450         }
1451 }
1452
1453 #if defined(SELFTEST)
1454 #include <stdio.h>
1455 #include <openssl/aes.h>
1456
1457 /* Test Case 1 */
1458 static const u8 K1[16],
1459                 *P1=NULL,
1460                 *A1=NULL,
1461                 IV1[12],
1462                 *C1=NULL,
1463                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1464
1465 /* Test Case 2 */
1466 #define K2 K1
1467 #define A2 A1
1468 #define IV2 IV1
1469 static const u8 P2[16],
1470                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1471                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1472
1473 /* Test Case 3 */
1474 #define A3 A2
1475 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1476                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1477                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1478                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1479                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1480                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1481                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1482                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1483                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1484                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1485                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1486
1487 /* Test Case 4 */
1488 #define K4 K3
1489 #define IV4 IV3
1490 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1491                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1492                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1493                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1494                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1495                         0xab,0xad,0xda,0xd2},
1496                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1497                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1498                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1499                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1500                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1501
1502 /* Test Case 5 */
1503 #define K5 K4
1504 #define P5 P4
1505 #define A5 A4
1506 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1507                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1508                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1509                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1510                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1511                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1512
1513 /* Test Case 6 */
1514 #define K6 K5
1515 #define P6 P5
1516 #define A6 A5
1517 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1518                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1519                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1520                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1521                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1522                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1523                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1524                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1525                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1526
1527 /* Test Case 7 */
1528 static const u8 K7[24],
1529                 *P7=NULL,
1530                 *A7=NULL,
1531                 IV7[12],
1532                 *C7=NULL,
1533                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1534
1535 /* Test Case 8 */
1536 #define K8 K7
1537 #define IV8 IV7
1538 #define A8 A7
1539 static const u8 P8[16],
1540                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1541                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1542
/* Test Case 9 */
/* 192-bit key, 96-bit IV, 64-byte plaintext, no AAD (A9 aliases the
 * NULL A8/A7); C9/T9 are the expected ciphertext and tag. */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1557
/* Test Case 10 */
/* Same 192-bit key and 96-bit IV as case 9, but a 60-byte (partial final
 * block) plaintext and 20 bytes of AAD; C10/T10 are the expected results. */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1572
/* Test Case 11 */
/* Same key/plaintext/AAD as case 10 with a short 64-bit IV, which
 * exercises the non-96-bit IV (GHASH-derived Y0) code path. */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1583
/* Test Case 12 */
/* Same key/plaintext/AAD as case 11 with a long 60-byte IV (another
 * non-96-bit IV variant). */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1597
/* Test Case 13 */
/* All-zero 256-bit key and all-zero 96-bit IV, empty plaintext/AAD/
 * ciphertext; only the authentication tag T13 is checked. */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1605
/* Test Case 14 */
/* Zero 256-bit key (shared with case 13), one all-zero 16-byte plaintext
 * block and zero 96-bit IV; C14/T14 are the expected ciphertext and tag. */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1613
/* Test Case 15 */
/* 256-bit key, 96-bit IV, 64-byte plaintext, no AAD (A15 aliases the
 * NULL chain); C15/T15 are the expected ciphertext and tag. */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1628
/* Test Case 16 */
/* Same 256-bit key and 96-bit IV as case 15, but a 60-byte (partial
 * final block) plaintext and 20 bytes of AAD. */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1643
/* Test Case 17 */
/* Same key/plaintext/AAD as case 16 with a short 64-bit IV
 * (non-96-bit IV path, 256-bit key). */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1654
/* Test Case 18 */
/* Same key/plaintext/AAD as case 17 with a long 60-byte IV
 * (non-96-bit IV path, 256-bit key). */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1668
/*
 * Run one GCM known-answer test, both directions.  Pastes K<n>, IV<n>,
 * A<n>, P<n>, C<n>, T<n> from the vector tables above: encrypts P<n>
 * and checks the output against C<n> and the tag against T<n>, then
 * decrypts C<n> and checks the recovered plaintext against P<n>.
 * NULL P/A/C pointers (empty-input cases) are skipped via the if()
 * guards.  Relies on ctx, key and ret declared in the caller (main);
 * each failing direction bumps ret and prints a diagnostic.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1688
1689 int main()
1690 {
1691         GCM128_CONTEXT ctx;
1692         AES_KEY key;
1693         int ret=0;
1694
1695         TEST_CASE(1);
1696         TEST_CASE(2);
1697         TEST_CASE(3);
1698         TEST_CASE(4);
1699         TEST_CASE(5);
1700         TEST_CASE(6);
1701         TEST_CASE(7);
1702         TEST_CASE(8);
1703         TEST_CASE(9);
1704         TEST_CASE(10);
1705         TEST_CASE(11);
1706         TEST_CASE(12);
1707         TEST_CASE(13);
1708         TEST_CASE(14);
1709         TEST_CASE(15);
1710         TEST_CASE(16);
1711         TEST_CASE(17);
1712         TEST_CASE(18);
1713
1714 #ifdef OPENSSL_CPUID_OBJ
1715         {
1716         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1717         union { u64 u; u8 c[1024]; } buf;
1718         int i;
1719
1720         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1721         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1722         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1723
1724         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1725         start = OPENSSL_rdtsc();
1726         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1727         gcm_t = OPENSSL_rdtsc() - start;
1728
1729         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1730                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1731                         (block128_f)AES_encrypt);
1732         start = OPENSSL_rdtsc();
1733         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1734                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1735                         (block128_f)AES_encrypt);
1736         ctr_t = OPENSSL_rdtsc() - start;
1737
1738         printf("%.2f-%.2f=%.2f\n",
1739                         gcm_t/(double)sizeof(buf),
1740                         ctr_t/(double)sizeof(buf),
1741                         (gcm_t-ctr_t)/(double)sizeof(buf));
1742 #ifdef GHASH
1743         GHASH(&ctx,buf.c,sizeof(buf));
1744         start = OPENSSL_rdtsc();
1745         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1746         gcm_t = OPENSSL_rdtsc() - start;
1747         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1748 #endif
1749         }
1750 #endif
1751
1752         return ret;
1753 }
1754 #endif