[openssl.git] / crypto / modes / gcm128.c @ 3f6b70df4b3607658408353324e3076a9d018732
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include <openssl/crypto.h>
51 #include "modes_lcl.h"
52 #include <string.h>
53
54 #ifndef MODES_DEBUG
55 # ifndef NDEBUG
56 #  define NDEBUG
57 # endif
58 #endif
59 #include <assert.h>
60
61 typedef struct { u64 hi,lo; } u128;
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
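/*
 * PACK() positions a 16-bit constant in the most significant bits of a
 * size_t, which is where the rem_4bit/rem_8bit reduction tables below
 * need it. REDUCE1BIT(V) multiplies V by x in GCM's bit-reflected
 * representation of GF(2^128): the 128-bit value is shifted right by one
 * bit and, if a 1 bit was shifted out, the reduction constant
 * 0xE1 || 0^120 (derived from x^128+x^7+x^2+x+1) is XOR-ed into the top.
 */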
84
85 #ifdef  TABLE_BITS
86 #undef  TABLE_BITS
87 #endif
88 /*
89  * Although the permitted values for TABLE_BITS are 8, 4 and 1, it should
90  * never be set to 8; 8 is effectively reserved for testing purposes.
91  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
92  * to as "Shoup's" in the GCM specification, so OpenSSL does not cover
93  * the whole spectrum of possible table-driven implementations. Why? In
94  * the non-"Shoup's" case the memory access pattern is segmented in such
95  * a manner that cache-timing information can trivially reveal a fair
96  * portion of the intermediate hash value. Given that the ciphertext is
97  * always available to an attacker, this allows an attempt to deduce the
98  * secret parameter H and, if successful, to tamper with messages
99  * [which is entirely trivial in CTR mode]. In the "Shoup's" case this is
100  * not as easy, but there is no reason to believe it is resistant to
101  * cache-timing attacks either. The catch with the "8-bit" implementation
102  * is that it consumes 16 (sixteen) times more memory, 4KB per individual
103  * key + 1KB shared. On the plus side it should be about twice as fast as
104  * the "4-bit" version; for gcc-generated x86[_64] code the "8-bit"
105  * version was observed to run ~75% faster, closer to 100% for commercial
106  * compilers... Yet the "4-bit" procedure is preferred, because it is
107  * believed to provide a better security-performance balance and adequate
108  * all-round performance. "All-round" refers to things like:
109  *
110  * - shorter setup time effectively improves overall timing for
111  *   handling short messages;
112  * - larger table allocations can become unbearable because of VM
113  *   subsystem penalties (for example, on Windows a large enough free()
114  *   results in VM working-set trimming, so a subsequent malloc() would
115  *   immediately incur working-set expansion);
116  * - a larger table has a larger cache footprint, which can affect the
117  *   performance of other code paths (not necessarily even in the same
118  *   thread in a Hyper-Threading world);
119  */
120 #define TABLE_BITS 4
121
122 #if     TABLE_BITS==8
123
124 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
125 {
126         int  i, j;
127         u128 V;
128
129         Htable[0].hi = 0;
130         Htable[0].lo = 0;
131         V.hi = H[0];
132         V.lo = H[1];
133
134         for (Htable[128]=V, i=64; i>0; i>>=1) {
135                 REDUCE1BIT(V);
136                 Htable[i] = V;
137         }
138
139         for (i=2; i<256; i<<=1) {
140                 u128 *Hi = Htable+i, H0 = *Hi;
141                 for (j=1; j<i; ++j) {
142                         Hi[j].hi = H0.hi^Htable[j].hi;
143                         Hi[j].lo = H0.lo^Htable[j].lo;
144                 }
145         }
146 }
147
148 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
149 {
150         u128 Z = { 0, 0};
151         const u8 *xi = (const u8 *)Xi+15;
152         size_t rem, n = *xi;
153         const union { long one; char little; } is_endian = {1};
154         static const size_t rem_8bit[256] = {
155                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
156                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
157                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
158                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
159                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
160                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
161                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
162                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
163                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
164                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
165                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
166                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
167                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
168                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
169                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
170                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
171                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
172                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
173                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
174                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
175                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
176                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
177                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
178                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
179                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
180                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
181                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
182                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
183                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
184                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
185                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
186                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
187                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
188                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
189                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
190                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
191                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
192                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
193                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
194                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
195                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
196                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
197                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
198                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
199                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
200                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
201                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
202                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
203                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
204                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
205                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
206                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
207                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
208                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
209                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
210                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
211                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
212                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
213                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
214                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
215                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
216                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
217                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
218                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
219
220         while (1) {
221                 Z.hi ^= Htable[n].hi;
222                 Z.lo ^= Htable[n].lo;
223
224                 if ((u8 *)Xi==xi)       break;
225
226                 n = *(--xi);
227
228                 rem  = (size_t)Z.lo&0xff;
229                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
230                 Z.hi = (Z.hi>>8);
231                 if (sizeof(size_t)==8)
232                         Z.hi ^= rem_8bit[rem];
233                 else
234                         Z.hi ^= (u64)rem_8bit[rem]<<32;
235         }
236
237         if (is_endian.little) {
238 #ifdef BSWAP8
239                 Xi[0] = BSWAP8(Z.hi);
240                 Xi[1] = BSWAP8(Z.lo);
241 #else
242                 u8 *p = (u8 *)Xi;
243                 u32 v;
244                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
245                 v = (u32)(Z.hi);        PUTU32(p+4,v);
246                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
247                 v = (u32)(Z.lo);        PUTU32(p+12,v);
248 #endif
249         }
250         else {
251                 Xi[0] = Z.hi;
252                 Xi[1] = Z.lo;
253         }
254 }
255 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
256
257 #elif   TABLE_BITS==4
258
259 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
260 {
261         u128 V;
262 #if defined(OPENSSL_SMALL_FOOTPRINT)
263         int  i;
264 #endif
265
266         Htable[0].hi = 0;
267         Htable[0].lo = 0;
268         V.hi = H[0];
269         V.lo = H[1];
270
271 #if defined(OPENSSL_SMALL_FOOTPRINT)
272         for (Htable[8]=V, i=4; i>0; i>>=1) {
273                 REDUCE1BIT(V);
274                 Htable[i] = V;
275         }
276
277         for (i=2; i<16; i<<=1) {
278                 u128 *Hi = Htable+i;
279                 int   j;
280                 for (V=*Hi, j=1; j<i; ++j) {
281                         Hi[j].hi = V.hi^Htable[j].hi;
282                         Hi[j].lo = V.lo^Htable[j].lo;
283                 }
284         }
285 #else
286         Htable[8] = V;
287         REDUCE1BIT(V);
288         Htable[4] = V;
289         REDUCE1BIT(V);
290         Htable[2] = V;
291         REDUCE1BIT(V);
292         Htable[1] = V;
293         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
294         V=Htable[4];
295         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
296         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
297         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
298         V=Htable[8];
299         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
300         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
301         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
302         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
303         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
304         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
305         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
306 #endif
307 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
308         /*
309          * The ARM assembler module expects a specific dword order in Htable.
310          */
311         {
312         int j;
313         const union { long one; char little; } is_endian = {1};
314
315         if (is_endian.little)
316                 for (j=0;j<16;++j) {
317                         V = Htable[j];
318                         Htable[j].hi = V.lo;
319                         Htable[j].lo = V.hi;
320                 }
321         else
322                 for (j=0;j<16;++j) {
323                         V = Htable[j];
324                         Htable[j].hi = V.lo<<32|V.lo>>32;
325                         Htable[j].lo = V.hi<<32|V.hi>>32;
326                 }
327         }
328 #endif
329 }
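/*
 * gcm_init_4bit() above fills Htable so that Htable[8] = H, Htable[4] = H*x,
 * Htable[2] = H*x^2 and Htable[1] = H*x^3 (all in GCM's reflected
 * representation), while every other Htable[n] is the XOR of the entries
 * for the set bits of n, i.e. the product of H with the 4-bit polynomial
 * encoded by n. gcm_gmult_4bit() below then processes Xi four bits at a
 * time starting from its last byte, alternating Htable lookups with 4-bit
 * right shifts of the accumulator and folding the shifted-out bits back in
 * via rem_4bit[].
 */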
330
331 #ifndef GHASH_ASM
332 static const size_t rem_4bit[16] = {
333         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
334         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
335         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
336         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
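/*
 * rem_4bit[n] is the reduction term contributed by a 4-bit value n shifted
 * off the low end of the accumulator: the dropped nibble multiplied by the
 * reduction polynomial, pre-positioned in the top 16 bits of a size_t by
 * PACK() so that it can be XOR-ed straight into Z.hi.
 */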
337
338 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
339 {
340         u128 Z;
341         int cnt = 15;
342         size_t rem, nlo, nhi;
343         const union { long one; char little; } is_endian = {1};
344
345         nlo  = ((const u8 *)Xi)[15];
346         nhi  = nlo>>4;
347         nlo &= 0xf;
348
349         Z.hi = Htable[nlo].hi;
350         Z.lo = Htable[nlo].lo;
351
352         while (1) {
353                 rem  = (size_t)Z.lo&0xf;
354                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
355                 Z.hi = (Z.hi>>4);
356                 if (sizeof(size_t)==8)
357                         Z.hi ^= rem_4bit[rem];
358                 else
359                         Z.hi ^= (u64)rem_4bit[rem]<<32;
360
361                 Z.hi ^= Htable[nhi].hi;
362                 Z.lo ^= Htable[nhi].lo;
363
364                 if (--cnt<0)            break;
365
366                 nlo  = ((const u8 *)Xi)[cnt];
367                 nhi  = nlo>>4;
368                 nlo &= 0xf;
369
370                 rem  = (size_t)Z.lo&0xf;
371                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
372                 Z.hi = (Z.hi>>4);
373                 if (sizeof(size_t)==8)
374                         Z.hi ^= rem_4bit[rem];
375                 else
376                         Z.hi ^= (u64)rem_4bit[rem]<<32;
377
378                 Z.hi ^= Htable[nlo].hi;
379                 Z.lo ^= Htable[nlo].lo;
380         }
381
382         if (is_endian.little) {
383 #ifdef BSWAP8
384                 Xi[0] = BSWAP8(Z.hi);
385                 Xi[1] = BSWAP8(Z.lo);
386 #else
387                 u8 *p = (u8 *)Xi;
388                 u32 v;
389                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
390                 v = (u32)(Z.hi);        PUTU32(p+4,v);
391                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
392                 v = (u32)(Z.lo);        PUTU32(p+12,v);
393 #endif
394         }
395         else {
396                 Xi[0] = Z.hi;
397                 Xi[1] = Z.lo;
398         }
399 }
400
401 #if !defined(OPENSSL_SMALL_FOOTPRINT)
402 /*
403  * Streamed variant of gcm_gmult_4bit; see CRYPTO_gcm128_[en|de]crypt
404  * for details... Compiler-generated code doesn't seem to give any
405  * performance improvement, at least not on x86[_64]. It's here
406  * mostly as a reference and a placeholder for possible future
407  * non-trivial optimization[s]...
408  */
409 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
410                                 const u8 *inp,size_t len)
411 {
412     u128 Z;
413     int cnt;
414     size_t rem, nlo, nhi;
415     const union { long one; char little; } is_endian = {1};
416
417 #if 1
418     do {
419         cnt  = 15;
420         nlo  = ((const u8 *)Xi)[15];
421         nlo ^= inp[15];
422         nhi  = nlo>>4;
423         nlo &= 0xf;
424
425         Z.hi = Htable[nlo].hi;
426         Z.lo = Htable[nlo].lo;
427
428         while (1) {
429                 rem  = (size_t)Z.lo&0xf;
430                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
431                 Z.hi = (Z.hi>>4);
432                 if (sizeof(size_t)==8)
433                         Z.hi ^= rem_4bit[rem];
434                 else
435                         Z.hi ^= (u64)rem_4bit[rem]<<32;
436
437                 Z.hi ^= Htable[nhi].hi;
438                 Z.lo ^= Htable[nhi].lo;
439
440                 if (--cnt<0)            break;
441
442                 nlo  = ((const u8 *)Xi)[cnt];
443                 nlo ^= inp[cnt];
444                 nhi  = nlo>>4;
445                 nlo &= 0xf;
446
447                 rem  = (size_t)Z.lo&0xf;
448                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
449                 Z.hi = (Z.hi>>4);
450                 if (sizeof(size_t)==8)
451                         Z.hi ^= rem_4bit[rem];
452                 else
453                         Z.hi ^= (u64)rem_4bit[rem]<<32;
454
455                 Z.hi ^= Htable[nlo].hi;
456                 Z.lo ^= Htable[nlo].lo;
457         }
458 #else
459     /*
460      * An extra 256+16 bytes per key plus 512 bytes of shared tables
461      * [should] give a ~50% improvement... One could have PACK()-ed
462      * rem_8bit even here, but the priority is to minimize the
463      * cache footprint...
464      */
465     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
466     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
467     static const unsigned short rem_8bit[256] = {
468         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
500     /*
501      * This pre-processing phase slows the procedure down by roughly as
502      * much as it speeds up each loop iteration. In other words, single-
503      * block performance is about the same as with the straightforward
504      * "4-bit" implementation, and it only gets faster from there...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
515         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516                 nlo  = ((const u8 *)Xi)[cnt];
517                 nlo ^= inp[cnt];
518                 nhi  = nlo>>4;
519                 nlo &= 0xf;
520
521                 Z.hi ^= Htable[nlo].hi;
522                 Z.lo ^= Htable[nlo].lo;
523
524                 rem = (size_t)Z.lo&0xff;
525
526                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
527                 Z.hi = (Z.hi>>8);
528
529                 Z.hi ^= Hshr4[nhi].hi;
530                 Z.lo ^= Hshr4[nhi].lo;
531                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532         }
533
534         nlo  = ((const u8 *)Xi)[0];
535         nlo ^= inp[0];
536         nhi  = nlo>>4;
537         nlo &= 0xf;
538
539         Z.hi ^= Htable[nlo].hi;
540         Z.lo ^= Htable[nlo].lo;
541
542         rem = (size_t)Z.lo&0xf;
543
544         Z.lo = (Z.hi<<60)|(Z.lo>>4);
545         Z.hi = (Z.hi>>4);
546
547         Z.hi ^= Htable[nhi].hi;
548         Z.lo ^= Htable[nhi].lo;
549         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
550 #endif
551
552         if (is_endian.little) {
553 #ifdef BSWAP8
554                 Xi[0] = BSWAP8(Z.hi);
555                 Xi[1] = BSWAP8(Z.lo);
556 #else
557                 u8 *p = (u8 *)Xi;
558                 u32 v;
559                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
560                 v = (u32)(Z.hi);        PUTU32(p+4,v);
561                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
562                 v = (u32)(Z.lo);        PUTU32(p+12,v);
563 #endif
564         }
565         else {
566                 Xi[0] = Z.hi;
567                 Xi[1] = Z.lo;
568         }
569     } while (inp+=16, len-=16);
570 }
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is a "stride" parameter meant to mitigate the cache
581  * thrashing effect. The idea is to hash data while it is still in
582  * the L1 cache after the encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 struct gcm128_context {
649         /* The following 6 names follow the GCM specification */
650         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
651                                                 Xi,H,len;
652         /* Pre-computed table used by gcm_gmult_* */
653 #if TABLE_BITS==8
654         u128 Htable[256];
655 #else
656         u128 Htable[16];
657         void (*gmult)(u64 Xi[2],const u128 Htable[16]);
658         void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
659 #endif
660         unsigned int mres, ares;
661         block128_f block;
662         void *key;
663 };
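/*
 * Field roles: Yi is the current counter block Y_i; EKi is the block cipher
 * output E(K,Y_i) used as keystream; EK0 is E(K,Y_0), which masks the final
 * tag; Xi is the running GHASH value; H is the hash subkey E(K,0^128) kept
 * in host byte order; len.u[0]/len.u[1] accumulate the AAD and message
 * lengths in bytes; ares/mres count the partially processed bytes of the
 * current AAD/message block; block is the raw block cipher and key its
 * schedule.
 */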
664
665 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
666         (defined(__i386)        || defined(__i386__)    || \
667          defined(__x86_64)      || defined(__x86_64__)  || \
668          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
669 # define GHASH_ASM_IAX
670 extern unsigned int OPENSSL_ia32cap_P[2];
671
672 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
673 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675
676 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
677 #  define GHASH_ASM_X86
678 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
679 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
680
681 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
682 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
683 # endif
684
685 # undef  GCM_MUL
686 # define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
687 # undef  GHASH
688 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
689 #endif
690
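/*
 * A note on the run-time dispatch in CRYPTO_gcm128_init() below: on x86 and
 * x86_64 builds with GHASH_ASM, bit 1 of OPENSSL_ia32cap_P[1] (CPUID ECX)
 * indicates PCLMULQDQ support and selects the carry-less-multiply routines,
 * while on 32-bit x86 bit 23 of OPENSSL_ia32cap_P[0] (CPUID EDX) indicates
 * MMX and selects the MMX flavour of the 4-bit code.
 */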
691 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
692 {
693         const union { long one; char little; } is_endian = {1};
694
695         memset(ctx,0,sizeof(*ctx));
696         ctx->block = block;
697         ctx->key   = key;
698
699         (*block)(ctx->H.c,ctx->H.c,key);
700
701         if (is_endian.little) {
702                 /* H is stored in host byte order */
703 #ifdef BSWAP8
704                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
705                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
706 #else
707                 u8 *p = ctx->H.c;
708                 u64 hi,lo;
709                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
710                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
711                 ctx->H.u[0] = hi;
712                 ctx->H.u[1] = lo;
713 #endif
714         }
715
716 #if     TABLE_BITS==8
717         gcm_init_8bit(ctx->Htable,ctx->H.u);
718 #elif   TABLE_BITS==4
719 # if    defined(GHASH_ASM_IAX)                  /* both x86 and x86_64 */
720         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
721                 gcm_init_clmul(ctx->Htable,ctx->H.u);
722                 ctx->gmult = gcm_gmult_clmul;
723                 ctx->ghash = gcm_ghash_clmul;
724                 return;
725         }
726         gcm_init_4bit(ctx->Htable,ctx->H.u);
727 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
728         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
729                 ctx->gmult = gcm_gmult_4bit_mmx;
730                 ctx->ghash = gcm_ghash_4bit_mmx;
731         } else {
732                 ctx->gmult = gcm_gmult_4bit_x86;
733                 ctx->ghash = gcm_ghash_4bit_x86;
734         }
735 #  else
736         ctx->gmult = gcm_gmult_4bit;
737         ctx->ghash = gcm_ghash_4bit;
738 #  endif
739 # else
740         gcm_init_4bit(ctx->Htable,ctx->H.u);
741 # endif
742 #endif
743 }
744
745 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
746 {
747         const union { long one; char little; } is_endian = {1};
748         unsigned int ctr;
749
750         ctx->Yi.u[0]  = 0;
751         ctx->Yi.u[1]  = 0;
752         ctx->Xi.u[0]  = 0;
753         ctx->Xi.u[1]  = 0;
754         ctx->len.u[0] = 0;      /* AAD length */
755         ctx->len.u[1] = 0;      /* message length */
756         ctx->ares = 0;
757         ctx->mres = 0;
758
759         if (len==12) {
760                 memcpy(ctx->Yi.c,iv,12);
761                 ctx->Yi.c[15]=1;
762                 ctr=1;
763         }
764         else {
765                 size_t i;
766                 u64 len0 = len;
767
768                 while (len>=16) {
769                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
770                         GCM_MUL(ctx,Yi);
771                         iv += 16;
772                         len -= 16;
773                 }
774                 if (len) {
775                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
776                         GCM_MUL(ctx,Yi);
777                 }
778                 len0 <<= 3;
779                 if (is_endian.little) {
780 #ifdef BSWAP8
781                         ctx->Yi.u[1]  ^= BSWAP8(len0);
782 #else
783                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
784                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
785                         ctx->Yi.c[10] ^= (u8)(len0>>40);
786                         ctx->Yi.c[11] ^= (u8)(len0>>32);
787                         ctx->Yi.c[12] ^= (u8)(len0>>24);
788                         ctx->Yi.c[13] ^= (u8)(len0>>16);
789                         ctx->Yi.c[14] ^= (u8)(len0>>8);
790                         ctx->Yi.c[15] ^= (u8)(len0);
791 #endif
792                 }
793                 else
794                         ctx->Yi.u[1]  ^= len0;
795
796                 GCM_MUL(ctx,Yi);
797
798                 if (is_endian.little)
799                         ctr = GETU32(ctx->Yi.c+12);
800                 else
801                         ctr = ctx->Yi.d[3];
802         }
803
804         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
805         ++ctr;
806         if (is_endian.little)
807                 PUTU32(ctx->Yi.c+12,ctr);
808         else
809                 ctx->Yi.d[3] = ctr;
810 }
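/*
 * As specified for GCM: a 96-bit IV is used directly as Y_0 with the 32-bit
 * counter field set to 1, while any other IV length is hashed instead,
 * Y_0 = GHASH(IV || 0-padding || [bit length of IV]_64). CRYPTO_gcm128_setiv()
 * above also pre-computes E(K,Y_0) into EK0 and leaves Yi holding Y_1, ready
 * for the first data block.
 */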
811
812 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
813 {
814         size_t i;
815         unsigned int n;
816         u64 alen = ctx->len.u[0];
817
818         if (ctx->len.u[1]) return -2;
819
820         alen += len;
821         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
822                 return -1;
823         ctx->len.u[0] = alen;
824
825         n = ctx->ares;
826         if (n) {
827                 while (n && len) {
828                         ctx->Xi.c[n] ^= *(aad++);
829                         --len;
830                         n = (n+1)%16;
831                 }
832                 if (n==0) GCM_MUL(ctx,Xi);
833                 else {
834                         ctx->ares = n;
835                         return 0;
836                 }
837         }
838
839 #ifdef GHASH
840         if ((i = (len&(size_t)-16))) {
841                 GHASH(ctx,aad,i);
842                 aad += i;
843                 len -= i;
844         }
845 #else
846         while (len>=16) {
847                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
848                 GCM_MUL(ctx,Xi);
849                 aad += 16;
850                 len -= 16;
851         }
852 #endif
853         if (len) {
854                 n = (unsigned int)len;
855                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
856         }
857
858         ctx->ares = n;
859         return 0;
860 }
861
862 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
863                 const unsigned char *in, unsigned char *out,
864                 size_t len)
865 {
866         const union { long one; char little; } is_endian = {1};
867         unsigned int n, ctr;
868         size_t i;
869         u64 mlen = ctx->len.u[1];
870
871 #if 0
872         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
873 #endif
874         mlen += len;
875         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
876                 return -1;
877         ctx->len.u[1] = mlen;
878
879         if (ctx->ares) {
880                 /* First call to encrypt finalizes GHASH(AAD) */
881                 GCM_MUL(ctx,Xi);
882                 ctx->ares = 0;
883         }
884
885         if (is_endian.little)
886                 ctr = GETU32(ctx->Yi.c+12);
887         else
888                 ctr = ctx->Yi.d[3];
889
890         n = ctx->mres;
891 #if !defined(OPENSSL_SMALL_FOOTPRINT)
892         if (16%sizeof(size_t) == 0) do {        /* always true actually */
893                 if (n) {
894                         while (n && len) {
895                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
896                                 --len;
897                                 n = (n+1)%16;
898                         }
899                         if (n==0) GCM_MUL(ctx,Xi);
900                         else {
901                                 ctx->mres = n;
902                                 return 0;
903                         }
904                 }
905 #if defined(STRICT_ALIGNMENT)
906                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
907                         break;
908 #endif
909 #if defined(GHASH) && defined(GHASH_CHUNK)
910                 while (len>=GHASH_CHUNK) {
911                     size_t j=GHASH_CHUNK;
912
913                     while (j) {
914                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
915                         ++ctr;
916                         if (is_endian.little)
917                                 PUTU32(ctx->Yi.c+12,ctr);
918                         else
919                                 ctx->Yi.d[3] = ctr;
920                         for (i=0; i<16; i+=sizeof(size_t))
921                                 *(size_t *)(out+i) =
922                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
923                         out += 16;
924                         in  += 16;
925                         j   -= 16;
926                     }
927                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
928                     len -= GHASH_CHUNK;
929                 }
930                 if ((i = (len&(size_t)-16))) {
931                     size_t j=i;
932
933                     while (len>=16) {
934                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
935                         ++ctr;
936                         if (is_endian.little)
937                                 PUTU32(ctx->Yi.c+12,ctr);
938                         else
939                                 ctx->Yi.d[3] = ctr;
940                         for (i=0; i<16; i+=sizeof(size_t))
941                                 *(size_t *)(out+i) =
942                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
943                         out += 16;
944                         in  += 16;
945                         len -= 16;
946                     }
947                     GHASH(ctx,out-j,j);
948                 }
949 #else
950                 while (len>=16) {
951                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
952                         ++ctr;
953                         if (is_endian.little)
954                                 PUTU32(ctx->Yi.c+12,ctr);
955                         else
956                                 ctx->Yi.d[3] = ctr;
957                         for (i=0; i<16; i+=sizeof(size_t))
958                                 *(size_t *)(ctx->Xi.c+i) ^=
959                                 *(size_t *)(out+i) =
960                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
961                         GCM_MUL(ctx,Xi);
962                         out += 16;
963                         in  += 16;
964                         len -= 16;
965                 }
966 #endif
967                 if (len) {
968                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
969                         ++ctr;
970                         if (is_endian.little)
971                                 PUTU32(ctx->Yi.c+12,ctr);
972                         else
973                                 ctx->Yi.d[3] = ctr;
974                         while (len--) {
975                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
976                                 ++n;
977                         }
978                 }
979
980                 ctx->mres = n;
981                 return 0;
982         } while(0);
983 #endif
984         for (i=0;i<len;++i) {
985                 if (n==0) {
986                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
987                         ++ctr;
988                         if (is_endian.little)
989                                 PUTU32(ctx->Yi.c+12,ctr);
990                         else
991                                 ctx->Yi.d[3] = ctr;
992                 }
993                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
994                 n = (n+1)%16;
995                 if (n==0)
996                         GCM_MUL(ctx,Xi);
997         }
998
999         ctx->mres = n;
1000         return 0;
1001 }
1002
1003 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1004                 const unsigned char *in, unsigned char *out,
1005                 size_t len)
1006 {
1007         const union { long one; char little; } is_endian = {1};
1008         unsigned int n, ctr;
1009         size_t i;
1010         u64 mlen = ctx->len.u[1];
1011
1012         mlen += len;
1013         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1014                 return -1;
1015         ctx->len.u[1] = mlen;
1016
1017         if (ctx->ares) {
1018                 /* First call to decrypt finalizes GHASH(AAD) */
1019                 GCM_MUL(ctx,Xi);
1020                 ctx->ares = 0;
1021         }
1022
1023         if (is_endian.little)
1024                 ctr = GETU32(ctx->Yi.c+12);
1025         else
1026                 ctr = ctx->Yi.d[3];
1027
1028         n = ctx->mres;
1029 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1030         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1031                 if (n) {
1032                         while (n && len) {
1033                                 u8 c = *(in++);
1034                                 *(out++) = c^ctx->EKi.c[n];
1035                                 ctx->Xi.c[n] ^= c;
1036                                 --len;
1037                                 n = (n+1)%16;
1038                         }
1039                         if (n==0) GCM_MUL (ctx,Xi);
1040                         else {
1041                                 ctx->mres = n;
1042                                 return 0;
1043                         }
1044                 }
1045 #if defined(STRICT_ALIGNMENT)
1046                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1047                         break;
1048 #endif
1049 #if defined(GHASH) && defined(GHASH_CHUNK)
1050                 while (len>=GHASH_CHUNK) {
1051                     size_t j=GHASH_CHUNK;
1052
1053                     GHASH(ctx,in,GHASH_CHUNK);
1054                     while (j) {
1055                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1056                         ++ctr;
1057                         if (is_endian.little)
1058                                 PUTU32(ctx->Yi.c+12,ctr);
1059                         else
1060                                 ctx->Yi.d[3] = ctr;
1061                         for (i=0; i<16; i+=sizeof(size_t))
1062                                 *(size_t *)(out+i) =
1063                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1064                         out += 16;
1065                         in  += 16;
1066                         j   -= 16;
1067                     }
1068                     len -= GHASH_CHUNK;
1069                 }
1070                 if ((i = (len&(size_t)-16))) {
1071                     GHASH(ctx,in,i);
1072                     while (len>=16) {
1073                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1074                         ++ctr;
1075                         if (is_endian.little)
1076                                 PUTU32(ctx->Yi.c+12,ctr);
1077                         else
1078                                 ctx->Yi.d[3] = ctr;
1079                         for (i=0; i<16; i+=sizeof(size_t))
1080                                 *(size_t *)(out+i) =
1081                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1082                         out += 16;
1083                         in  += 16;
1084                         len -= 16;
1085                     }
1086                 }
1087 #else
1088                 while (len>=16) {
1089                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1090                         ++ctr;
1091                         if (is_endian.little)
1092                                 PUTU32(ctx->Yi.c+12,ctr);
1093                         else
1094                                 ctx->Yi.d[3] = ctr;
1095                         for (i=0; i<16; i+=sizeof(size_t)) {
1096                                 size_t c = *(size_t *)(in+i);
1097                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1098                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1099                         }
1100                         GCM_MUL(ctx,Xi);
1101                         out += 16;
1102                         in  += 16;
1103                         len -= 16;
1104                 }
1105 #endif
1106                 if (len) {
1107                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1108                         ++ctr;
1109                         if (is_endian.little)
1110                                 PUTU32(ctx->Yi.c+12,ctr);
1111                         else
1112                                 ctx->Yi.d[3] = ctr;
1113                         while (len--) {
1114                                 u8 c = in[n];
1115                                 ctx->Xi.c[n] ^= c;
1116                                 out[n] = c^ctx->EKi.c[n];
1117                                 ++n;
1118                         }
1119                 }
1120
1121                 ctx->mres = n;
1122                 return 0;
1123         } while(0);
1124 #endif
1125         for (i=0;i<len;++i) {
1126                 u8 c;
1127                 if (n==0) {
1128                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1129                         ++ctr;
1130                         if (is_endian.little)
1131                                 PUTU32(ctx->Yi.c+12,ctr);
1132                         else
1133                                 ctx->Yi.d[3] = ctr;
1134                 }
1135                 c = in[i];
1136                 out[i] = c^ctx->EKi.c[n];
1137                 ctx->Xi.c[n] ^= c;
1138                 n = (n+1)%16;
1139                 if (n==0)
1140                         GCM_MUL(ctx,Xi);
1141         }
1142
1143         ctx->mres = n;
1144         return 0;
1145 }
1146
1147 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1148                 const unsigned char *in, unsigned char *out,
1149                 size_t len, ctr128_f stream)
1150 {
1151         const union { long one; char little; } is_endian = {1};
1152         unsigned int n, ctr;
1153         size_t i;
1154         u64 mlen = ctx->len.u[1];
1155
1156         mlen += len;
1157         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1158                 return -1;
1159         ctx->len.u[1] = mlen;
1160
1161         if (ctx->ares) {
1162                 /* First call to encrypt finalizes GHASH(AAD) */
1163                 GCM_MUL(ctx,Xi);
1164                 ctx->ares = 0;
1165         }
1166
1167         if (is_endian.little)
1168                 ctr = GETU32(ctx->Yi.c+12);
1169         else
1170                 ctr = ctx->Yi.d[3];
1171
1172         n = ctx->mres;
1173         if (n) {
1174                 while (n && len) {
1175                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1176                         --len;
1177                         n = (n+1)%16;
1178                 }
1179                 if (n==0) GCM_MUL(ctx,Xi);
1180                 else {
1181                         ctx->mres = n;
1182                         return 0;
1183                 }
1184         }
1185 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1186         while (len>=GHASH_CHUNK) {
1187                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1188                 ctr += GHASH_CHUNK/16;
1189                 if (is_endian.little)
1190                         PUTU32(ctx->Yi.c+12,ctr);
1191                 else
1192                         ctx->Yi.d[3] = ctr;
1193                 GHASH(ctx,out,GHASH_CHUNK);
1194                 out += GHASH_CHUNK;
1195                 in  += GHASH_CHUNK;
1196                 len -= GHASH_CHUNK;
1197         }
1198 #endif
1199         if ((i = (len&(size_t)-16))) {
1200                 size_t j=i/16;
1201
1202                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1203                 ctr += (unsigned int)j;
1204                 if (is_endian.little)
1205                         PUTU32(ctx->Yi.c+12,ctr);
1206                 else
1207                         ctx->Yi.d[3] = ctr;
1208                 in  += i;
1209                 len -= i;
1210 #if defined(GHASH)
1211                 GHASH(ctx,out,i);
1212                 out += i;
1213 #else
1214                 while (j--) {
1215                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1216                         GCM_MUL(ctx,Xi);
1217                         out += 16;
1218                 }
1219 #endif
1220         }
1221         if (len) {
1222                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1223                 ++ctr;
1224                 if (is_endian.little)
1225                         PUTU32(ctx->Yi.c+12,ctr);
1226                 else
1227                         ctx->Yi.d[3] = ctr;
1228                 while (len--) {
1229                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1230                         ++n;
1231                 }
1232         }
1233
1234         ctx->mres = n;
1235         return 0;
1236 }
1237
1238 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1239                 const unsigned char *in, unsigned char *out,
1240                 size_t len,ctr128_f stream)
1241 {
1242         const union { long one; char little; } is_endian = {1};
1243         unsigned int n, ctr;
1244         size_t i;
1245         u64 mlen = ctx->len.u[1];
1246
1247         mlen += len;
1248         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1249                 return -1;
1250         ctx->len.u[1] = mlen;
1251
1252         if (ctx->ares) {
1253                 /* First call to decrypt finalizes GHASH(AAD) */
1254                 GCM_MUL(ctx,Xi);
1255                 ctx->ares = 0;
1256         }
1257
1258         if (is_endian.little)
1259                 ctr = GETU32(ctx->Yi.c+12);
1260         else
1261                 ctr = ctx->Yi.d[3];
1262
1263         n = ctx->mres;
1264         if (n) {
1265                 while (n && len) {
1266                         u8 c = *(in++);
1267                         *(out++) = c^ctx->EKi.c[n];
1268                         ctx->Xi.c[n] ^= c;
1269                         --len;
1270                         n = (n+1)%16;
1271                 }
1272                 if (n==0) GCM_MUL (ctx,Xi);
1273                 else {
1274                         ctx->mres = n;
1275                         return 0;
1276                 }
1277         }
1278 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1279         while (len>=GHASH_CHUNK) {
1280                 GHASH(ctx,in,GHASH_CHUNK);
1281                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1282                 ctr += GHASH_CHUNK/16;
1283                 if (is_endian.little)
1284                         PUTU32(ctx->Yi.c+12,ctr);
1285                 else
1286                         ctx->Yi.d[3] = ctr;
1287                 out += GHASH_CHUNK;
1288                 in  += GHASH_CHUNK;
1289                 len -= GHASH_CHUNK;
1290         }
1291 #endif
1292         if ((i = (len&(size_t)-16))) {
1293                 size_t j=i/16;
1294
1295 #if defined(GHASH)
1296                 GHASH(ctx,in,i);
1297 #else
1298                 while (j--) {
1299                         size_t k;
1300                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1301                         GCM_MUL(ctx,Xi);
1302                         in += 16;
1303                 }
1304                 j   = i/16;
1305                 in -= i;
1306 #endif
1307                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1308                 ctr += (unsigned int)j;
1309                 if (is_endian.little)
1310                         PUTU32(ctx->Yi.c+12,ctr);
1311                 else
1312                         ctx->Yi.d[3] = ctr;
1313                 out += i;
1314                 in  += i;
1315                 len -= i;
1316         }
1317         if (len) {
1318                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1319                 ++ctr;
1320                 if (is_endian.little)
1321                         PUTU32(ctx->Yi.c+12,ctr);
1322                 else
1323                         ctx->Yi.d[3] = ctr;
1324                 while (len--) {
1325                         u8 c = in[n];
1326                         ctx->Xi.c[n] ^= c;
1327                         out[n] = c^ctx->EKi.c[n];
1328                         ++n;
1329                 }
1330         }
1331
1332         ctx->mres = n;
1333         return 0;
1334 }
1335
1336 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1337                         size_t len)
1338 {
1339         const union { long one; char little; } is_endian = {1};
1340         u64 alen = ctx->len.u[0]<<3;
1341         u64 clen = ctx->len.u[1]<<3;
1342
1343         if (ctx->mres)
1344                 GCM_MUL(ctx,Xi);
1345
1346         if (is_endian.little) {
1347 #ifdef BSWAP8
1348                 alen = BSWAP8(alen);
1349                 clen = BSWAP8(clen);
1350 #else
1351                 u8 *p = ctx->len.c;
1352
1353                 ctx->len.u[0] = alen;
1354                 ctx->len.u[1] = clen;
1355
1356                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1357                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1358 #endif
1359         }
1360
1361         ctx->Xi.u[0] ^= alen;
1362         ctx->Xi.u[1] ^= clen;
1363         GCM_MUL(ctx,Xi);
1364
1365         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1366         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1367
1368         if (tag && len<=sizeof(ctx->Xi))
1369                 return memcmp(ctx->Xi.c,tag,len);
1370         else
1371                 return -1;
1372 }
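/*
 * CRYPTO_gcm128_finish() returns 0 when the supplied tag matches the
 * computed one and non-zero otherwise (including when no tag is supplied or
 * len exceeds 16). The comparison above is a plain memcmp(), so callers that
 * require a constant-time check must provide it themselves.
 */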
1373
1374 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1375 {
1376         CRYPTO_gcm128_finish(ctx, NULL, 0);
1377         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1378 }
1379
1380 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1381 {
1382         GCM128_CONTEXT *ret;
1383
1384         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1385                 CRYPTO_gcm128_init(ret,key,block);
1386
1387         return ret;
1388 }
1389
1390 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1391 {
1392         if (ctx) {
1393                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1394                 OPENSSL_free(ctx);
1395         }
1396 }
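#if 0
/*
 * Illustrative sketch (not part of the original file) of how the API above
 * is typically driven with AES as the underlying block cipher. The function
 * name and the key/iv/aad/plaintext parameters are placeholders; sizes
 * assume AES-128 with a 96-bit IV and a full 16-byte tag.
 */
#include <openssl/aes.h>

static int example_aes_gcm_seal(const unsigned char key[16],
                                const unsigned char iv[12],
                                const unsigned char *aad, size_t aad_len,
                                const unsigned char *pt, unsigned char *ct,
                                size_t pt_len, unsigned char tag[16])
{
        AES_KEY aes;
        GCM128_CONTEXT gcm;

        if (AES_set_encrypt_key(key, 128, &aes) != 0)
                return -1;
        /* block128_f takes (in,out,key); AES_encrypt matches that shape */
        CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
        CRYPTO_gcm128_setiv(&gcm, iv, 12);
        if (aad_len && CRYPTO_gcm128_aad(&gcm, aad, aad_len))
                return -1;
        if (CRYPTO_gcm128_encrypt(&gcm, pt, ct, pt_len))
                return -1;
        CRYPTO_gcm128_tag(&gcm, tag, 16);       /* finalizes and copies tag */
        return 0;
}
#endif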
1397
1398 #if defined(SELFTEST)
1399 #include <stdio.h>
1400 #include <openssl/aes.h>
1401
1402 /* Test Case 1 */
1403 static const u8 K1[16],
1404                 *P1=NULL,
1405                 *A1=NULL,
1406                 IV1[12],
1407                 *C1=NULL,
1408                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1409
1410 /* Test Case 2 */
1411 #define K2 K1
1412 #define A2 A1
1413 #define IV2 IV1
1414 static const u8 P2[16],
1415                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1416                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1417
1418 /* Test Case 3 */
1419 #define A3 A2
1420 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1421                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1422                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1423                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1424                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1425                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1426                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1427                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1428                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1429                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1430                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1431
1432 /* Test Case 4 */
1433 #define K4 K3
1434 #define IV4 IV3
1435 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1436                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1437                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1438                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1439                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1440                         0xab,0xad,0xda,0xd2},
1441                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1442                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1443                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1444                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1445                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1446
1447 /* Test Case 5 */
1448 #define K5 K4
1449 #define P5 P4
1450 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1451                         0xab,0xad,0xda,0xd2},
1452                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1453                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1454                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1455                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1456                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1457                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1458
1459 /* Test Case 6 */
1460 #define K6 K5
1461 #define P6 P5
1462 #define A6 A5
1463 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1464                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1465                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1466                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1467                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1468                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1469                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1470                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1471                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1472
1473 /* Test Case 7 */
1474 static const u8 K7[24],
1475                 *P7=NULL,
1476                 *A7=NULL,
1477                 IV7[12],
1478                 *C7=NULL,
1479                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1480
1481 /* Test Case 8 */
1482 #define K8 K7
1483 #define IV8 IV7
1484 #define A8 A7
1485 static const u8 P8[16],
1486                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1487                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1488
1489 /* Test Case 9 */
1490 #define A9 A8
1491 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1492                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1493                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1494                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1495                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1496                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1497                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1498                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1499                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1500                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1501                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1502                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1503
1504 /* Test Case 10 */
1505 #define K10 K9
1506 #define IV10 IV9
1507 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1508                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1509                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1510                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1511                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1512                         0xab,0xad,0xda,0xd2},
1513                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1514                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1515                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1516                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1517                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1518
1519 /* Test Case 11 */
1520 #define K11 K10
1521 #define P11 P10
1522 #define A11 A10
1523 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1524                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1525                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1526                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1527                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1528                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1529
1530 /* Test Case 12 */
1531 #define K12 K11
1532 #define P12 P11
1533 #define A12 A11
1534 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1535                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1536                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1537                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1538                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1539                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1540                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1541                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1542                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1543
1544 /* Test Case 13 */
1545 static const u8 K13[32],
1546                 *P13=NULL,
1547                 *A13=NULL,
1548                 IV13[12],
1549                 *C13=NULL,
1550                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1551
1552 /* Test Case 14 */
1553 #define K14 K13
1554 #define A14 A13
1555 static const u8 P14[16],
1556                 IV14[12],
1557                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1558                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1559
1560 /* Test Case 15 */
1561 #define A15 A14
1562 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1563                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1564                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1565                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1566                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1567                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1568                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1569                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1570                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1571                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1572                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1573                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1574
1575 /* Test Case 16 */
1576 #define K16 K15
1577 #define IV16 IV15
1578 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1579                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1580                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1581                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1582                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1583                         0xab,0xad,0xda,0xd2},
1584                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1585                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1586                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1587                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1588                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1589
1590 /* Test Case 17 */
1591 #define K17 K16
1592 #define P17 P16
1593 #define A17 A16
1594 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1595                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1596                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1597                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1598                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1599                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1600
1601 /* Test Case 18 */
1602 #define K18 K17
1603 #define P18 P17
1604 #define A18 A17
1605 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1606                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1607                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1608                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1609                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1610                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1611                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1612                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1613                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1614
1615 #define TEST_CASE(n)    do {                                    \
1616         u8 out[sizeof(P##n)];                                   \
1617         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1618         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1619         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1620         memset(out,0,sizeof(out));                              \
1621         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1622         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1623         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1624             (C##n && memcmp(out,C##n,sizeof(out))))             \
1625                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1626         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1627         memset(out,0,sizeof(out));                              \
1628         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1629         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1630         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1631             (P##n && memcmp(out,P##n,sizeof(out))))             \
1632                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1633         } while(0)
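
/*
 * TEST_CASE(n) keys the cipher and then exercises both directions: it
 * encrypts P<n> and compares the output against C<n> and the 16-byte tag
 * T<n>, then decrypts C<n> and compares the result against P<n> under the
 * same tag.
 */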
1634
1635 int main(void)
1636 {
1637         GCM128_CONTEXT ctx;
1638         AES_KEY key;
1639         int ret=0;
1640
1641         TEST_CASE(1);
1642         TEST_CASE(2);
1643         TEST_CASE(3);
1644         TEST_CASE(4);
1645         TEST_CASE(5);
1646         TEST_CASE(6);
1647         TEST_CASE(7);
1648         TEST_CASE(8);
1649         TEST_CASE(9);
1650         TEST_CASE(10);
1651         TEST_CASE(11);
1652         TEST_CASE(12);
1653         TEST_CASE(13);
1654         TEST_CASE(14);
1655         TEST_CASE(15);
1656         TEST_CASE(16);
1657         TEST_CASE(17);
1658         TEST_CASE(18);
1659
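        /*
         * Rough benchmark, enabled when OPENSSL_CPUID_OBJ is defined: time
         * CRYPTO_gcm128_encrypt and raw CRYPTO_ctr128_encrypt over a 1KB
         * buffer with OPENSSL_rdtsc() and report cycles per byte; the
         * difference approximates the GHASH overhead.  If a GHASH macro is
         * available, it is also timed on its own, averaged over 100 runs.
         */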
1660 #ifdef OPENSSL_CPUID_OBJ
1661         {
1662         size_t start,gcm_t,ctr_t,OPENSSL_rdtsc();
1663         union { u64 u; u8 c[1024]; } buf;
1664         int i;
1665
1666         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1667         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1668         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1669
1670         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1671         start = OPENSSL_rdtsc();
1672         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1673         gcm_t = OPENSSL_rdtsc() - start;
1674
1675         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1676                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1677                         (block128_f)AES_encrypt);
1678         start = OPENSSL_rdtsc();
1679         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1680                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1681                         (block128_f)AES_encrypt);
1682         ctr_t = OPENSSL_rdtsc() - start;
1683
1684         printf("%.2f-%.2f=%.2f\n",
1685                         gcm_t/(double)sizeof(buf),
1686                         ctr_t/(double)sizeof(buf),
1687                         (gcm_t-ctr_t)/(double)sizeof(buf));
1688 #ifdef GHASH
1689         GHASH(&ctx,buf.c,sizeof(buf));
1690         start = OPENSSL_rdtsc();
1691         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1692         gcm_t = OPENSSL_rdtsc() - start;
1693         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1694 #endif
1695         }
1696 #endif
1697
1698         return ret;
1699 }
1700 #endif