gcm128.c: commentary and formatting updates.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
/*
 * Integer aliases used throughout this file.  U64(C) turns the bare
 * constant C into an unsigned 64-bit literal by pasting on whichever
 * suffix matches the type selected for u64.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
/* Windows builds other than MinGW: vendor __int64 types. */
typedef __int64			i64;
typedef unsigned __int64	u64;
# define U64(C)	C##UI64
#elif defined(__arch64__)
/* __arch64__ targets: plain long serves as the 64-bit type. */
typedef long			i64;
typedef unsigned long		u64;
# define U64(C)	C##UL
#else
/* Everything else: long long. */
typedef long long		i64;
typedef unsigned long long	u64;
# define U64(C)	C##ULL
#endif

typedef unsigned int	u32;
typedef unsigned char	u8;

/* 128-bit quantity kept as two 64-bit halves. */
typedef struct {
	u64 hi;
	u64 lo;
} u128;
77
/*
 * STRICT_ALIGNMENT is defined by default and then removed again for the
 * x86, x86_64 and s390/s390x targets listed below.  Code later in this
 * file tests it before accessing caller buffers one size_t at a time.
 */
#define STRICT_ALIGNMENT
#if	defined(__i386)		|| \
	defined(__i386__)	|| \
	defined(__x86_64)	|| \
	defined(__x86_64__)	|| \
	defined(_M_IX86)	|| \
	defined(_M_AMD64)	|| \
	defined(_M_X64)		|| \
	defined(__s390__)	|| \
	defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif
85
/*
 * Byte-swap helpers.  BSWAP8(x) / BSWAP4(x) yield x with its 8 / 4 bytes
 * in reverse order.  They are defined only where a compiler-specific fast
 * path is provided below; the rest of the file tests #ifdef BSWAP8 /
 * #ifdef BSWAP4 and uses byte-wise code when they are absent.
 *
 * The GNU variants are written with the __asm__ / __volatile__ spellings
 * so that they keep compiling when the compiler runs in a strict ISO mode
 * (e.g. -std=c99 or -std=c11), where the plain asm keyword is not
 * available.  Their temporaries carry a trailing underscore so that an
 * argument expression which itself mentions a variable called ret, hi or
 * lo is not captured by the macro's own locals.
 *
 * Note that the i386 BSWAP8 evaluates its argument twice.
 */
#if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  define BSWAP8(x) ({	u64 ret_=(x);				\
			__asm__ __volatile__ ("bswapq %0"	\
			: "+r"(ret_));	ret_;			})
#  define BSWAP4(x) ({	u32 ret_=(x);				\
			__asm__ __volatile__ ("bswapl %0"	\
			: "+r"(ret_));	ret_;			})
# elif defined(__i386) || defined(__i386__)
#  define BSWAP8(x) ({	u32 lo_=(u64)(x)>>32,hi_=(x);		\
			__asm__ __volatile__ ("bswapl %0; bswapl %1"	\
			: "+r"(hi_),"+r"(lo_));			\
			(u64)hi_<<32|lo_;			})
#  define BSWAP4(x) ({	u32 ret_=(x);				\
			__asm__ __volatile__ ("bswapl %0"	\
			: "+r"(ret_));	ret_;			})
# endif
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
#  define BSWAP8(x)	_byteswap_uint64((u64)(x))
#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
   /* No fast path is provided here; the byte-wise fallbacks are used. */
# endif
#endif
111
/*
 * GETU32(p) reads, and PUTU32(p,v) stores, a 32-bit value in big-endian
 * byte order at byte pointer p.  When BSWAP4 is available the access is a
 * single 32-bit load/store combined with a byte swap; otherwise the value
 * is assembled or scattered one byte at a time.
 *
 * Every expansion is wrapped in parentheses so that the macros behave as
 * a single operand when embedded in a larger expression.
 */
#ifdef BSWAP4
#define GETU32(p)	(BSWAP4(*(const u32 *)(p)))
#define PUTU32(p,v)	(*(u32 *)(p) = BSWAP4(v))
#else
#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif
119
120 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
121 #ifdef  TABLE_BITS
122 #undef  TABLE_BITS
123 #endif
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8; 8 is effectively reserved for testing
 * purposes. Under ideal conditions the "8-bit" version should be twice
 * as fast as the "4-bit" one, but the world is far from ideal. For
 * gcc-generated x86 code the "8-bit" version was observed to run only
 * ~50% faster. On x86_64 the observed improvement was ~75%, much closer
 * to optimal, but the very fact of the deviation means that references
 * to the pre-computed tables end up on the critical path, and as the
 * tables are pretty big (4KB per key + 1KB shared), execution time is
 * sensitive to cache timing. It's not actually proven, but the 4-bit
 * procedure is believed to provide adequate all-round performance...
 */
137 #define TABLE_BITS 4
138
139 #if     TABLE_BITS==8
140
141 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
142 {
143         int  i, j;
144         u128 V;
145
146         Htable[0].hi = 0;
147         Htable[0].lo = 0;
148         V.hi = H[0];
149         V.lo = H[1];
150
151         for (Htable[128]=V, i=64; i>0; i>>=1) {
152                 if (sizeof(size_t)==8) {
153                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
154                         V.lo  = (V.hi<<63)|(V.lo>>1);
155                         V.hi  = (V.hi>>1 )^T;
156                 }
157                 else {
158                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
159                         V.lo  = (V.hi<<63)|(V.lo>>1);
160                         V.hi  = (V.hi>>1 )^((u64)T<<32);
161                 }
162                 Htable[i] = V;
163         }
164
165         for (i=2; i<256; i<<=1) {
166                 u128 *Hi = Htable+i, H0 = *Hi;
167                 for (j=1; j<i; ++j) {
168                         Hi[j].hi = H0.hi^Htable[j].hi;
169                         Hi[j].lo = H0.lo^Htable[j].lo;
170                 }
171         }
172 }
173
174 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
175 {
176         u128 Z = { 0, 0};
177         const u8 *xi = (const u8 *)Xi+15;
178         size_t rem, n = *xi;
179         const union { long one; char little; } is_endian = {1};
180         static const size_t rem_8bit[256] = {
181                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
182                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
183                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
184                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
185                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
186                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
187                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
188                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
189                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
190                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
191                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
192                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
193                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
194                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
195                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
196                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
197                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
198                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
199                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
200                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
201                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
202                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
203                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
204                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
205                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
206                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
207                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
208                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
209                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
210                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
211                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
212                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
213                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
214                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
215                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
216                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
217                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
218                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
219                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
220                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
221                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
222                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
223                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
224                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
225                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
226                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
227                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
228                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
229                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
230                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
231                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
232                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
233                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
234                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
235                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
236                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
237                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
238                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
239                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
240                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
241                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
242                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
243                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
244                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
245
246         while (1) {
247                 Z.hi ^= Htable[n].hi;
248                 Z.lo ^= Htable[n].lo;
249
250                 if ((u8 *)Xi==xi)       break;
251
252                 n = *(--xi);
253
254                 rem  = (size_t)Z.lo&0xff;
255                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
256                 Z.hi = (Z.hi>>8);
257                 if (sizeof(size_t)==8)
258                         Z.hi ^= rem_8bit[rem];
259                 else
260                         Z.hi ^= (u64)rem_8bit[rem]<<32;
261         }
262
263         if (is_endian.little) {
264 #ifdef BSWAP8
265                 Xi[0] = BSWAP8(Z.hi);
266                 Xi[1] = BSWAP8(Z.lo);
267 #else
268                 u8 *p = (u8 *)Xi;
269                 u32 v;
270                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
271                 v = (u32)(Z.hi);        PUTU32(p+4,v);
272                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
273                 v = (u32)(Z.lo);        PUTU32(p+12,v);
274 #endif
275         }
276         else {
277                 Xi[0] = Z.hi;
278                 Xi[1] = Z.lo;
279         }
280 }
281 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
282
283 #elif   TABLE_BITS==4
284
285 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
286 {
287         int  i;
288         u128 V;
289
290         Htable[0].hi = 0;
291         Htable[0].lo = 0;
292         V.hi = H[0];
293         V.lo = H[1];
294
295         for (Htable[8]=V, i=4; i>0; i>>=1) {
296                 if (sizeof(size_t)==8) {
297                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
298                         V.lo  = (V.hi<<63)|(V.lo>>1);
299                         V.hi  = (V.hi>>1 )^T;
300                 }
301                 else {
302                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
303                         V.lo  = (V.hi<<63)|(V.lo>>1);
304                         V.hi  = (V.hi>>1 )^((u64)T<<32);
305                 }
306                 Htable[i] = V;
307         }
308
309 #if defined(OPENSSL_SMALL_FOOTPRINT)
310         for (i=2; i<16; i<<=1) {
311                 u128 *Hi = Htable+i;
312                 int   j;
313                 for (V=*Hi, j=1; j<i; ++j) {
314                         Hi[j].hi = V.hi^Htable[j].hi;
315                         Hi[j].lo = V.lo^Htable[j].lo;
316                 }
317         }
318 #else
319         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
320         V=Htable[4];
321         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
322         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
323         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
324         V=Htable[8];
325         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
326         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
327         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
328         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
329         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
330         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
331         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
332 #endif
333 }
334
335 #ifndef GHASH_ASM
336 static const size_t rem_4bit[16] = {
337         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
338         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
339         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
340         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
341
342 static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
343 {
344         u128 Z;
345         int cnt = 15;
346         size_t rem, nlo, nhi;
347         const union { long one; char little; } is_endian = {1};
348
349         nlo  = ((const u8 *)Xi)[15];
350         nhi  = nlo>>4;
351         nlo &= 0xf;
352
353         Z.hi = Htable[nlo].hi;
354         Z.lo = Htable[nlo].lo;
355
356         while (1) {
357                 rem  = (size_t)Z.lo&0xf;
358                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
359                 Z.hi = (Z.hi>>4);
360                 if (sizeof(size_t)==8)
361                         Z.hi ^= rem_4bit[rem];
362                 else
363                         Z.hi ^= (u64)rem_4bit[rem]<<32;
364
365                 Z.hi ^= Htable[nhi].hi;
366                 Z.lo ^= Htable[nhi].lo;
367
368                 if (--cnt<0)            break;
369
370                 nlo  = ((const u8 *)Xi)[cnt];
371                 nhi  = nlo>>4;
372                 nlo &= 0xf;
373
374                 rem  = (size_t)Z.lo&0xf;
375                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
376                 Z.hi = (Z.hi>>4);
377                 if (sizeof(size_t)==8)
378                         Z.hi ^= rem_4bit[rem];
379                 else
380                         Z.hi ^= (u64)rem_4bit[rem]<<32;
381
382                 Z.hi ^= Htable[nlo].hi;
383                 Z.lo ^= Htable[nlo].lo;
384         }
385
386         if (is_endian.little) {
387 #ifdef BSWAP8
388                 Xi[0] = BSWAP8(Z.hi);
389                 Xi[1] = BSWAP8(Z.lo);
390 #else
391                 u8 *p = (u8 *)Xi;
392                 u32 v;
393                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
394                 v = (u32)(Z.hi);        PUTU32(p+4,v);
395                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
396                 v = (u32)(Z.lo);        PUTU32(p+12,v);
397 #endif
398         }
399         else {
400                 Xi[0] = Z.hi;
401                 Xi[1] = Z.lo;
402         }
403 }
404
405 #if !defined(OPENSSL_SMALL_FOOTPRINT)
406 /*
407  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
408  * details... Compiler-generated code doesn't seem to give any
409  * performance improvement, at least not on x86[_64]. It's here
410  * mostly as reference and a placeholder for possible future
411  * non-trivial optimization[s]...
412  */
413 static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
414 {
415     u128 Z;
416     int cnt;
417     size_t rem, nlo, nhi;
418     const union { long one; char little; } is_endian = {1};
419
420     do {
421         cnt  = 15;
422         nlo  = ((const u8 *)Xi)[15];
423         nlo ^= inp[15];
424         nhi  = nlo>>4;
425         nlo &= 0xf;
426
427         Z.hi = Htable[nlo].hi;
428         Z.lo = Htable[nlo].lo;
429
430         while (1) {
431                 rem  = (size_t)Z.lo&0xf;
432                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
433                 Z.hi = (Z.hi>>4);
434                 if (sizeof(size_t)==8)
435                         Z.hi ^= rem_4bit[rem];
436                 else
437                         Z.hi ^= (u64)rem_4bit[rem]<<32;
438
439                 Z.hi ^= Htable[nhi].hi;
440                 Z.lo ^= Htable[nhi].lo;
441
442                 if (--cnt<0)            break;
443
444                 nlo  = ((const u8 *)Xi)[cnt];
445                 nlo ^= inp[cnt];
446                 nhi  = nlo>>4;
447                 nlo &= 0xf;
448
449                 rem  = (size_t)Z.lo&0xf;
450                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
451                 Z.hi = (Z.hi>>4);
452                 if (sizeof(size_t)==8)
453                         Z.hi ^= rem_4bit[rem];
454                 else
455                         Z.hi ^= (u64)rem_4bit[rem]<<32;
456
457                 Z.hi ^= Htable[nlo].hi;
458                 Z.lo ^= Htable[nlo].lo;
459         }
460
461         if (is_endian.little) {
462 #ifdef BSWAP8
463                 Xi[0] = BSWAP8(Z.hi);
464                 Xi[1] = BSWAP8(Z.lo);
465 #else
466                 u8 *p = (u8 *)Xi;
467                 u32 v;
468                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
469                 v = (u32)(Z.hi);        PUTU32(p+4,v);
470                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
471                 v = (u32)(Z.lo);        PUTU32(p+12,v);
472 #endif
473         }
474         else {
475                 Xi[0] = Z.hi;
476                 Xi[1] = Z.lo;
477         }
478     } while (inp+=16, len-=16);
479 }
480 #endif
481 #else
482 void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
483 void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
484 #endif
485
486 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
487 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
488 #define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,(ctx)->Xi.u,(ctx)->Htable)
/* GHASH_CHUNK is a "stride parameter" intended to mitigate the cache
 * thrashing effect. In other words, the idea is to hash the data while
 * it is still in L1 cache after the encryption pass... */
492 #define GHASH_CHUNK       1024
493 #endif
494
495 #else   /* TABLE_BITS */
496
497 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
498 {
499         u128 V,Z = { 0,0 };
500         long X;
501         int  i,j;
502         const long *xi = (const long *)Xi;
503         const union { long one; char little; } is_endian = {1};
504
505         V.hi = H[0];    /* H is in host byte order, no byte swapping */
506         V.lo = H[1];
507
508         for (j=0; j<16/sizeof(long); ++j) {
509                 if (is_endian.little) {
510                         if (sizeof(long)==8) {
511 #ifdef BSWAP8
512                                 X = (long)(BSWAP8(xi[j]));
513 #else
514                                 const u8 *p = (const u8 *)(xi+j);
515                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
516 #endif
517                         }
518                         else {
519                                 const u8 *p = (const u8 *)(xi+j);
520                                 X = (long)GETU32(p);
521                         }
522                 }
523                 else
524                         X = xi[j];
525
526                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
527                         u64 M = (u64)(X>>(8*sizeof(long)-1));
528                         Z.hi ^= V.hi&M;
529                         Z.lo ^= V.lo&M;
530
531                         if (sizeof(size_t)==8) {
532                                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
533                                 V.lo  = (V.hi<<63)|(V.lo>>1);
534                                 V.hi  = (V.hi>>1 )^T;
535                         }
536                         else {
537                                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
538                                 V.lo  = (V.hi<<63)|(V.lo>>1);
539                                 V.hi  = (V.hi>>1 )^((u64)T<<32);
540                         }
541                                 
542                 }
543         }
544
545         if (is_endian.little) {
546 #ifdef BSWAP8
547                 Xi[0] = BSWAP8(Z.hi);
548                 Xi[1] = BSWAP8(Z.lo);
549 #else
550                 u8 *p = (u8 *)Xi;
551                 u32 v;
552                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
553                 v = (u32)(Z.hi);        PUTU32(p+4,v);
554                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
555                 v = (u32)(Z.lo);        PUTU32(p+12,v);
556 #endif
557         }
558         else {
559                 Xi[0] = Z.hi;
560                 Xi[1] = Z.lo;
561         }
562 }
563 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
564
565 #endif
566
567 typedef struct {
568         /* Following 6 names follow names in GCM specification */
569         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
570                                                 Xi,H,
571                                                 len;
572         /* Pre-computed table used by gcm_gmult_* */
573 #if TABLE_BITS==8
574         u128 Htable[256];
575 #else
576         u128 Htable[16];
577 #endif
578         unsigned int res, ctr;
579         block128_f block;
580         void *key;
581 } GCM128_CONTEXT;
582
583 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
584 {
585         const union { long one; char little; } is_endian = {1};
586
587         memset(ctx,0,sizeof(*ctx));
588         ctx->block = block;
589         ctx->key   = key;
590
591         (*block)(ctx->H.c,ctx->H.c,key);
592
593         if (is_endian.little) {
594                 /* H is stored in host byte order */
595 #ifdef BSWAP8
596                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
597                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
598 #else
599                 u8 *p = ctx->H.c;
600                 u64 hi,lo;
601                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
602                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
603                 ctx->H.u[0] = hi;
604                 ctx->H.u[1] = lo;
605 #endif
606         }
607
608 #if     TABLE_BITS==8
609         gcm_init_8bit(ctx->Htable,ctx->H.u);
610 #elif   TABLE_BITS==4
611         gcm_init_4bit(ctx->Htable,ctx->H.u);
612 #endif
613 }
614
615 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
616 {
617         const union { long one; char little; } is_endian = {1};
618
619         ctx->Yi.u[0]  = 0;
620         ctx->Yi.u[1]  = 0;
621         ctx->Xi.u[0]  = 0;
622         ctx->Xi.u[1]  = 0;
623         ctx->len.u[0] = 0;
624         ctx->len.u[1] = 0;
625         ctx->res = 0;
626
627         if (len==12) {
628                 memcpy(ctx->Yi.c,iv,12);
629                 ctx->Yi.c[15]=1;
630                 ctx->ctr=1;
631         }
632         else {
633                 size_t i;
634                 u64 len0 = len;
635
636                 while (len>=16) {
637                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
638                         GCM_MUL(ctx,Yi);
639                         iv += 16;
640                         len -= 16;
641                 }
642                 if (len) {
643                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
644                         GCM_MUL(ctx,Yi);
645                 }
646                 len0 <<= 3;
647                 if (is_endian.little) {
648 #ifdef BSWAP8
649                         ctx->Yi.u[1]  ^= BSWAP8(len0);
650 #else
651                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
652                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
653                         ctx->Yi.c[10] ^= (u8)(len0>>40);
654                         ctx->Yi.c[11] ^= (u8)(len0>>32);
655                         ctx->Yi.c[12] ^= (u8)(len0>>24);
656                         ctx->Yi.c[13] ^= (u8)(len0>>16);
657                         ctx->Yi.c[14] ^= (u8)(len0>>8);
658                         ctx->Yi.c[15] ^= (u8)(len0);
659 #endif
660                 }
661                 else
662                         ctx->Yi.u[1]  ^= len0;
663
664                 GCM_MUL(ctx,Yi);
665
666                 if (is_endian.little)
667                         ctx->ctr = GETU32(ctx->Yi.c+12);
668                 else
669                         ctx->ctr = ctx->Yi.d[3];
670         }
671
672         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
673         ++ctx->ctr;
674         if (is_endian.little)
675                 PUTU32(ctx->Yi.c+12,ctx->ctr);
676         else
677                 ctx->Yi.d[3] = ctx->ctr;
678 }
679
680 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
681 {
682         size_t i;
683
684         ctx->len.u[0] += len;
685
686 #ifdef GHASH
687         if ((i = (len&(size_t)-16))) {
688                 GHASH(aad,i,ctx);
689                 aad += i;
690                 len -= i;
691         }
692 #else
693         while (len>=16) {
694                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
695                 GCM_MUL(ctx,Xi);
696                 aad += 16;
697                 len -= 16;
698         }
699 #endif
700         if (len) {
701                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
702                 GCM_MUL(ctx,Xi);
703         }
704 }
705
706 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
707                 const unsigned char *in, unsigned char *out,
708                 size_t len)
709 {
710         const union { long one; char little; } is_endian = {1};
711         unsigned int n, ctr;
712         size_t i;
713
714         ctx->len.u[1] += len;
715         n   = ctx->res;
716         ctr = ctx->ctr;
717
718 #if !defined(OPENSSL_SMALL_FOOTPRINT)
719         if (16%sizeof(size_t) == 0) do {        /* always true actually */
720                 if (n) {
721                         while (n && len) {
722                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
723                                 --len;
724                                 n = (n+1)%16;
725                         }
726                         if (n==0) GCM_MUL(ctx,Xi);
727                         else {
728                                 ctx->res = n;
729                                 return;
730                         }
731                 }
732 #if defined(STRICT_ALIGNMENT)
733                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
734                         break;
735 #endif
736 #if defined(GHASH) && defined(GHASH_CHUNK)
737                 while (len>=GHASH_CHUNK) {
738                     size_t j=GHASH_CHUNK;
739
740                     while (j) {
741                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
742                         ++ctr;
743                         if (is_endian.little)
744                                 PUTU32(ctx->Yi.c+12,ctr);
745                         else
746                                 ctx->Yi.d[3] = ctr;
747                         for (i=0; i<16; i+=sizeof(size_t))
748                                 *(size_t *)(out+i) =
749                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
750                         out += 16;
751                         in  += 16;
752                         j   -= 16;
753                     }
754                     GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
755                     len -= GHASH_CHUNK;
756                 }
757                 if ((i = (len&(size_t)-16))) {
758                     size_t j=i;
759
760                     while (len>=16) {
761                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
762                         ++ctr;
763                         if (is_endian.little)
764                                 PUTU32(ctx->Yi.c+12,ctr);
765                         else
766                                 ctx->Yi.d[3] = ctr;
767                         for (i=0; i<16; i+=sizeof(size_t))
768                                 *(size_t *)(out+i) =
769                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
770                         out += 16;
771                         in  += 16;
772                         len -= 16;
773                     }
774                     GHASH(out-j,j,ctx);
775                 }
776 #else
777                 while (len>=16) {
778                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
779                         ++ctr;
780                         if (is_endian.little)
781                                 PUTU32(ctx->Yi.c+12,ctr);
782                         else
783                                 ctx->Yi.d[3] = ctr;
784                         for (i=0; i<16; i+=sizeof(size_t))
785                                 *(size_t *)(ctx->Xi.c+i) ^=
786                                 *(size_t *)(out+i) =
787                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
788                         GCM_MUL(ctx,Xi);
789                         out += 16;
790                         in  += 16;
791                         len -= 16;
792                 }
793 #endif
794                 if (len) {
795                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
796                         ++ctr;
797                         if (is_endian.little)
798                                 PUTU32(ctx->Yi.c+12,ctr);
799                         else
800                                 ctx->Yi.d[3] = ctr;
801                         while (len--) {
802                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
803                                 ++n;
804                         }
805                 }
806
807                 ctx->res = n;
808                 ctx->ctr = ctr;
809                 return;
810         } while(0);
811 #endif
812         for (i=0;i<len;++i) {
813                 if (n==0) {
814                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
815                         ++ctr;
816                         if (is_endian.little)
817                                 PUTU32(ctx->Yi.c+12,ctr);
818                         else
819                                 ctx->Yi.d[3] = ctr;
820                 }
821                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
822                 n = (n+1)%16;
823                 if (n==0)
824                         GCM_MUL(ctx,Xi);
825         }
826
827         ctx->res = n;
828         ctx->ctr = ctr;
829 }
830
831 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
832                 const unsigned char *in, unsigned char *out,
833                 size_t len)
834 {
835         const union { long one; char little; } is_endian = {1};
836         unsigned int n, ctr;
837         size_t i;
838
839         ctx->len.u[1] += len;
840         n   = ctx->res;
841         ctr = ctx->ctr;
842
843 #if !defined(OPENSSL_SMALL_FOOTPRINT)
844         if (16%sizeof(size_t) == 0) do {        /* always true actually */
845                 if (n) {
846                         while (n && len) {
847                                 u8 c = *(in++);
848                                 *(out++) = c^ctx->EKi.c[n];
849                                 ctx->Xi.c[n] ^= c;
850                                 --len;
851                                 n = (n+1)%16;
852                         }
853                         if (n==0) GCM_MUL (ctx,Xi);
854                         else {
855                                 ctx->res = n;
856                                 return;
857                         }
858                 }
859 #if defined(STRICT_ALIGNMENT)
860                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
861                         break;
862 #endif
863 #if defined(GHASH) && defined(GHASH_CHUNK)
864                 while (len>=GHASH_CHUNK) {
865                     size_t j=GHASH_CHUNK;
866
867                     GHASH(in,GHASH_CHUNK,ctx);
868                     while (j) {
869                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
870                         ++ctr;
871                         if (is_endian.little)
872                                 PUTU32(ctx->Yi.c+12,ctr);
873                         else
874                                 ctx->Yi.d[3] = ctr;
875                         for (i=0; i<16; i+=sizeof(size_t))
876                                 *(size_t *)(out+i) =
877                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
878                         out += 16;
879                         in  += 16;
880                         j   -= 16;
881                     }
882                     len -= GHASH_CHUNK;
883                 }
884                 if ((i = (len&(size_t)-16))) {
885                     GHASH(in,i,ctx);
886                     while (len>=16) {
887                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
888                         ++ctr;
889                         if (is_endian.little)
890                                 PUTU32(ctx->Yi.c+12,ctr);
891                         else
892                                 ctx->Yi.d[3] = ctr;
893                         for (i=0; i<16; i+=sizeof(size_t))
894                                 *(size_t *)(out+i) =
895                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
896                         out += 16;
897                         in  += 16;
898                         len -= 16;
899                     }
900                 }
901 #else
902                 while (len>=16) {
903                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
904                         ++ctr;
905                         if (is_endian.little)
906                                 PUTU32(ctx->Yi.c+12,ctr);
907                         else
908                                 ctx->Yi.d[3] = ctr;
909                         for (i=0; i<16; i+=sizeof(size_t)) {
910                                 size_t c = *(size_t *)(in+i);
911                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
912                                 *(size_t *)(ctx->Xi.c+i) ^= c;
913                         }
914                         GCM_MUL(ctx,Xi);
915                         out += 16;
916                         in  += 16;
917                         len -= 16;
918                 }
919 #endif
920                 if (len) {
921                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
922                         ++ctr;
923                         if (is_endian.little)
924                                 PUTU32(ctx->Yi.c+12,ctr);
925                         else
926                                 ctx->Yi.d[3] = ctr;
927                         while (len--) {
928                                 u8 c = in[n];
929                                 ctx->Xi.c[n] ^= c;
930                                 out[n] = c^ctx->EKi.c[n];
931                                 ++n;
932                         }
933                 }
934
935                 ctx->res = n;
936                 ctx->ctr = ctr;
937                 return;
938         } while(0);
939 #endif
940         for (i=0;i<len;++i) {
941                 u8 c;
942                 if (n==0) {
943                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
944                         ++ctr;
945                         if (is_endian.little)
946                                 PUTU32(ctx->Yi.c+12,ctr);
947                         else
948                                 ctx->Yi.d[3] = ctr;
949                 }
950                 c = in[i];
951                 out[i] ^= ctx->EKi.c[n];
952                 ctx->Xi.c[n] ^= c;
953                 n = (n+1)%16;
954                 if (n==0)
955                         GCM_MUL(ctx,Xi);
956         }
957
958         ctx->res = n;
959         ctx->ctr = ctr;
960 }
961
962 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
963 {
964         const union { long one; char little; } is_endian = {1};
965         u64 alen = ctx->len.u[0]<<3;
966         u64 clen = ctx->len.u[1]<<3;
967
968         if (ctx->res)
969                 GCM_MUL(ctx,Xi);
970
971         if (is_endian.little) {
972 #ifdef BSWAP8
973                 alen = BSWAP8(alen);
974                 clen = BSWAP8(clen);
975 #else
976                 u8 *p = ctx->len.c;
977
978                 ctx->len.u[0] = alen;
979                 ctx->len.u[1] = clen;
980
981                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
982                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
983 #endif
984         }
985
986         ctx->Xi.u[0] ^= alen;
987         ctx->Xi.u[1] ^= clen;
988         GCM_MUL(ctx,Xi);
989
990         ctx->Xi.u[0] ^= ctx->EK0.u[0];
991         ctx->Xi.u[1] ^= ctx->EK0.u[1];
992 }
993
994 #if defined(SELFTEST)
995 #include <stdio.h>
996 #include <openssl/aes.h>
997
998 /* Test Case 1 */
999 static const u8 K1[16],
1000                 *P1=NULL,
1001                 *A1=NULL,
1002                 IV1[12],
1003                 *C1=NULL,
1004                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1005
1006 /* Test Case 2 */
1007 #define K2 K1
1008 #define A2 A1
1009 #define IV2 IV1
1010 static const u8 P2[16],
1011                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1012                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1013
1014 /* Test Case 3 */
1015 #define A3 A2
1016 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1017                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1018                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1019                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1020                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1021                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1022                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1023                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1024                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1025                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1026                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4,};
1027
1028 /* Test Case 4 */
1029 #define K4 K3
1030 #define IV4 IV3
1031 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1032                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1033                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1034                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1035                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1036                         0xab,0xad,0xda,0xd2},
1037                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1038                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1039                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1040                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1041                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1042
1043 /* Test Case 5 */
1044 #define K5 K4
1045 #define P5 P4
1046 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1047                         0xab,0xad,0xda,0xd2},
1048                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1049                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1050                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1051                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1052                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1053                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1054
1055 /* Test Case 6 */
1056 #define K6 K5
1057 #define P6 P5
1058 #define A6 A5
1059 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1060                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1061                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1062                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1063                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1064                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1065                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1066                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1067                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1068
1069 /* Test Case 7 */
1070 static const u8 K7[24],
1071                 *P7=NULL,
1072                 *A7=NULL,
1073                 IV7[12],
1074                 *C7=NULL,
1075                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1076
1077 /* Test Case 8 */
1078 #define K8 K7
1079 #define IV8 IV7
1080 #define A8 A7
1081 static const u8 P8[16],
1082                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1083                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1084
1085 /* Test Case 9 */
1086 #define A9 A8
1087 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1088                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1089                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1090                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1091                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1092                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1093                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1094                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1095                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1096                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1097                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1098                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1099
1100 /* Test Case 10 */
1101 #define K10 K9
1102 #define IV10 IV9
1103 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1104                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1105                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1106                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1107                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1108                         0xab,0xad,0xda,0xd2},
1109                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1110                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1111                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1112                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1113                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1114
1115 /* Test Case 11 */
1116 #define K11 K10
1117 #define P11 P10
1118 #define A11 A10
1119 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1120                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1121                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1122                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1123                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1124                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1125
1126 /* Test Case 12 */
1127 #define K12 K11
1128 #define P12 P11
1129 #define A12 A11
1130 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1131                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1132                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1133                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1134                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1135                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1136                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1137                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1138                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1139
1140 /* Test Case 13 */
1141 static const u8 K13[32],
1142                 *P13=NULL,
1143                 *A13=NULL,
1144                 IV13[12],
1145                 *C13=NULL,
1146                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1147
1148 /* Test Case 14 */
1149 #define K14 K13
1150 #define A14 A13
1151 static const u8 P14[16],
1152                 IV14[12],
1153                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1154                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1155
1156 /* Test Case 15 */
1157 #define A15 A14
1158 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1159                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1160                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1161                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1162                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1163                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1164                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1165                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1166                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1167                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1168                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1169                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1170
1171 /* Test Case 16 */
1172 #define K16 K15
1173 #define IV16 IV15
1174 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1175                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1176                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1177                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1178                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1179                         0xab,0xad,0xda,0xd2},
1180                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1181                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1182                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1183                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1184                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1185
1186 /* Test Case 17 */
1187 #define K17 K16
1188 #define P17 P16
1189 #define A17 A16
1190 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1191                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1192                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1193                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1194                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1195                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1196
1197 /* Test Case 18 */
1198 #define K18 K17
1199 #define P18 P17
1200 #define A18 A17
1201 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1202                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1203                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1204                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1205                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1206                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1207                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1208                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1209                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1210
/*
 * TEST_CASE(n): run known-answer vector n in both directions.
 *
 * Relies on three objects in the invoking scope: `ctx` (GCM128_CONTEXT),
 * `key` (AES_KEY) and `ret` (int, incremented once per failed direction).
 *
 * Encrypt pass: key/IV/AAD are set up, P##n is encrypted into a local
 * buffer, and after CRYPTO_gcm128_finish() ctx.Xi is compared with T##n
 * and the buffer with C##n.  Decrypt pass: the IV and AAD are re-applied
 * on the same context, C##n is decrypted, and the result is compared with
 * T##n and P##n.
 *
 * For vectors whose P##n/C##n/A##n is a NULL pointer the corresponding
 * call and comparison are skipped; sizeof(P##n) is then the size of a
 * pointer, so the local buffer is never zero-length.
 */
#define TEST_CASE(n)    do {                                            \
        u8 out[sizeof(P##n)];                                           \
                                                                        \
        AES_set_encrypt_key(K##n, sizeof(K##n)*8, &key);                \
        CRYPTO_gcm128_init(&ctx, &key, (block128_f)AES_encrypt);        \
                                                                        \
        CRYPTO_gcm128_setiv(&ctx, IV##n, sizeof(IV##n));                \
        if (A##n)                                                       \
                CRYPTO_gcm128_aad(&ctx, A##n, sizeof(A##n));            \
        if (P##n)                                                       \
                CRYPTO_gcm128_encrypt(&ctx, P##n, out, sizeof(out));    \
        CRYPTO_gcm128_finish(&ctx);                                     \
        if (memcmp(ctx.Xi.c, T##n, 16) != 0 ||                          \
            (C##n && memcmp(out, C##n, sizeof(out)) != 0)) {            \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n", n);                \
        }                                                               \
                                                                        \
        CRYPTO_gcm128_setiv(&ctx, IV##n, sizeof(IV##n));                \
        if (A##n)                                                       \
                CRYPTO_gcm128_aad(&ctx, A##n, sizeof(A##n));            \
        if (C##n)                                                       \
                CRYPTO_gcm128_decrypt(&ctx, C##n, out, sizeof(out));    \
        CRYPTO_gcm128_finish(&ctx);                                     \
        if (memcmp(ctx.Xi.c, T##n, 16) != 0 ||                          \
            (P##n && memcmp(out, P##n, sizeof(out)) != 0)) {            \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n", n);                \
        }                                                               \
        } while(0)
1228
1229 int main()
1230 {
1231         GCM128_CONTEXT ctx;
1232         AES_KEY key;
1233         int ret=0;
1234
1235         TEST_CASE(1);
1236         TEST_CASE(2);
1237         TEST_CASE(3);
1238         TEST_CASE(4);
1239         TEST_CASE(5);
1240         TEST_CASE(6);
1241         TEST_CASE(7);
1242         TEST_CASE(8);
1243         TEST_CASE(9);
1244         TEST_CASE(10);
1245         TEST_CASE(11);
1246         TEST_CASE(12);
1247         TEST_CASE(13);
1248         TEST_CASE(14);
1249         TEST_CASE(15);
1250         TEST_CASE(16);
1251         TEST_CASE(17);
1252         TEST_CASE(18);
1253
1254 #ifdef OPENSSL_CPUID_OBJ
1255         {
1256         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1257         union { u64 u; u8 c[1024]; } buf;
1258
1259         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1260         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1261         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1262
1263         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1264         start = OPENSSL_rdtsc();
1265         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1266         gcm_t = OPENSSL_rdtsc() - start;
1267
1268         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1269                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1270                         (block128_f)AES_encrypt);
1271         start = OPENSSL_rdtsc();
1272         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1273                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1274                         (block128_f)AES_encrypt);
1275         ctr_t = OPENSSL_rdtsc() - start;
1276
1277         printf("%.2f-%.2f=%.2f\n",
1278                         gcm_t/(double)sizeof(buf),
1279                         ctr_t/(double)sizeof(buf),
1280                         (gcm_t-ctr_t)/(double)sizeof(buf));
1281 #ifdef GHASH
1282         GHASH(buf.c,sizeof(buf),&ctx);
1283         start = OPENSSL_rdtsc();
1284         GHASH(buf.c,sizeof(buf),&ctx);
1285         gcm_t = OPENSSL_rdtsc() - start;
1286         printf("%.2f\n",gcm_t/(double)sizeof(buf));
1287 #endif
1288         }
1289 #endif
1290
1291         return ret;
1292 }
1293 #endif