gcm128.c and assembler modules: change argument order for gcm_ghash_4bit.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
/*
 * Integer aliases used throughout this file.  U64() appends the literal
 * suffix matching whichever type was chosen for u64.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
/* Windows toolchains other than MinGW: use the __int64 types. */
# define U64(C) C##UI64
typedef unsigned __int64 u64;
typedef __int64 i64;
#elif defined(__arch64__)
/* __arch64__ targets: plain long is used as the 64-bit type. */
# define U64(C) C##UL
typedef unsigned long u64;
typedef long i64;
#else
# define U64(C) C##ULL
typedef unsigned long long u64;
typedef long long i64;
#endif

typedef unsigned char u8;
typedef unsigned int u32;

/* 128-bit quantity kept as two 64-bit halves, "hi" first. */
typedef struct {
	u64 hi;
	u64 lo;
} u128;
77
/*
 * STRICT_ALIGNMENT is defined on every target except the ones listed
 * below (x86, x86_64, s390, s390x).  When it is defined, the bulk code
 * paths check pointer alignment before accessing data as size_t words.
 */
#undef STRICT_ALIGNMENT
#if !(defined(__i386)   || defined(__i386__)   || \
      defined(__x86_64) || defined(__x86_64__) || \
      defined(_M_IX86)  || defined(_M_AMD64)   || defined(_M_X64) || \
      defined(__s390__) || defined(__s390x__))
# define STRICT_ALIGNMENT
#endif
85
/*
 * BSWAP4(x)/BSWAP8(x): reverse the byte order of a 32-/64-bit value.
 * They are defined only where a cheap primitive is available (GCC on
 * x86/x86_64, MSVC >= 1300); code elsewhere tests #ifdef BSWAP4/BSWAP8
 * and falls back to byte-wise access when they are absent.
 *
 * The GCC variants use the alternate keywords __asm__/__volatile__ and
 * mark the statement expressions with __extension__ so that they also
 * compile in strict ISO modes (-std=c99/-std=c11, -pedantic), where the
 * plain "asm" keyword is not recognised.  The temporaries carry a
 * trailing underscore to make a clash with identifiers used in the
 * macro argument unlikely.
 */
#if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  define BSWAP8(x) __extension__ ({	u64 bswap_v_=(x);		\
			__asm__ __volatile__ ("bswapq %0"		\
			: "+r"(bswap_v_));	bswap_v_;	})
#  define BSWAP4(x) __extension__ ({	u32 bswap_v_=(x);		\
			__asm__ __volatile__ ("bswapl %0"		\
			: "+r"(bswap_v_));	bswap_v_;	})
# elif defined(__i386) || defined(__i386__)
/* 32-bit x86: swap each half, then exchange the halves. */
#  define BSWAP8(x) __extension__ ({					\
			u32 bswap_lo_=(u32)((u64)(x)>>32),		\
			    bswap_hi_=(u32)(x);				\
			__asm__ __volatile__ ("bswapl %0; bswapl %1"	\
			: "+r"(bswap_hi_),"+r"(bswap_lo_));		\
			(u64)bswap_hi_<<32|bswap_lo_;		})
#  define BSWAP4(x) __extension__ ({	u32 bswap_v_=(x);		\
			__asm__ __volatile__ ("bswapl %0"		\
			: "+r"(bswap_v_));	bswap_v_;	})
# endif
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
#  define BSWAP8(x)	_byteswap_uint64((u64)(x))
#  define BSWAP4(x)	_byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
/* older MSVC on x86: nothing defined here, the portable fallbacks are used */
# endif
#endif
111
/*
 * GETU32(p): read a big-endian 32-bit value from the 4 bytes at p.
 * PUTU32(p,v): store v at p as 4 big-endian bytes.
 *
 * With BSWAP4 available the access is a single (possibly unaligned)
 * 32-bit load/store plus a byte swap; otherwise it is done byte by byte.
 * Both flavours of PUTU32 expand to one fully parenthesized expression,
 * so the macro behaves the same inside larger expressions and in
 * unbraced if/else arms.
 */
#ifdef BSWAP4
#define GETU32(p)	BSWAP4(*(const u32 *)(p))
#define PUTU32(p,v)	(*(u32 *)(p) = BSWAP4(v))
#else
#define GETU32(p)	((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v)	((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif
119
/* Place a 16-bit constant in the top 16 bits of a size_t. */
#define PACK(s)		((size_t)(s) << (sizeof(size_t)*8 - 16))

#if defined(TABLE_BITS)
# undef TABLE_BITS
#endif
/*
 * TABLE_BITS selects the GHASH multiplication routine: 8, 4 or 1.
 * Although 8 is accepted, it should never be used in production; it is
 * effectively reserved for testing.  Under ideal conditions the "8-bit"
 * version would be twice as fast as the "4-bit" one, but the world is
 * far from ideal: gcc-generated x86 code was observed to run only ~50%
 * faster, and x86_64 ~75% faster.  The shortfall shows that references
 * to the pre-computed tables end up on the critical path, and because
 * the tables are fairly large (4KB per key plus 1KB shared), execution
 * time becomes sensitive to cache timing.  It has not actually been
 * proven, but the 4-bit procedure is believed to provide adequate
 * all-round performance...
 */
#define TABLE_BITS 4
138
139 #if     TABLE_BITS==8
140
141 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
142 {
143         int  i, j;
144         u128 V;
145
146         Htable[0].hi = 0;
147         Htable[0].lo = 0;
148         V.hi = H[0];
149         V.lo = H[1];
150
151         for (Htable[128]=V, i=64; i>0; i>>=1) {
152                 if (sizeof(size_t)==8) {
153                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
154                         V.lo  = (V.hi<<63)|(V.lo>>1);
155                         V.hi  = (V.hi>>1 )^T;
156                 }
157                 else {
158                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
159                         V.lo  = (V.hi<<63)|(V.lo>>1);
160                         V.hi  = (V.hi>>1 )^((u64)T<<32);
161                 }
162                 Htable[i] = V;
163         }
164
165         for (i=2; i<256; i<<=1) {
166                 u128 *Hi = Htable+i, H0 = *Hi;
167                 for (j=1; j<i; ++j) {
168                         Hi[j].hi = H0.hi^Htable[j].hi;
169                         Hi[j].lo = H0.lo^Htable[j].lo;
170                 }
171         }
172 }
173
174 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
175 {
176         u128 Z = { 0, 0};
177         const u8 *xi = (const u8 *)Xi+15;
178         size_t rem, n = *xi;
179         const union { long one; char little; } is_endian = {1};
180         static const size_t rem_8bit[256] = {
181                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
182                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
183                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
184                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
185                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
186                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
187                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
188                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
189                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
190                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
191                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
192                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
193                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
194                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
195                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
196                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
197                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
198                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
199                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
200                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
201                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
202                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
203                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
204                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
205                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
206                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
207                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
208                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
209                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
210                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
211                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
212                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
213                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
214                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
215                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
216                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
217                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
218                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
219                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
220                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
221                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
222                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
223                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
224                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
225                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
226                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
227                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
228                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
229                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
230                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
231                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
232                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
233                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
234                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
235                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
236                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
237                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
238                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
239                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
240                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
241                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
242                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
243                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
244                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
245
246         while (1) {
247                 Z.hi ^= Htable[n].hi;
248                 Z.lo ^= Htable[n].lo;
249
250                 if ((u8 *)Xi==xi)       break;
251
252                 n = *(--xi);
253
254                 rem  = (size_t)Z.lo&0xff;
255                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
256                 Z.hi = (Z.hi>>8);
257                 if (sizeof(size_t)==8)
258                         Z.hi ^= rem_8bit[rem];
259                 else
260                         Z.hi ^= (u64)rem_8bit[rem]<<32;
261         }
262
263         if (is_endian.little) {
264 #ifdef BSWAP8
265                 Xi[0] = BSWAP8(Z.hi);
266                 Xi[1] = BSWAP8(Z.lo);
267 #else
268                 u8 *p = (u8 *)Xi;
269                 u32 v;
270                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
271                 v = (u32)(Z.hi);        PUTU32(p+4,v);
272                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
273                 v = (u32)(Z.lo);        PUTU32(p+12,v);
274 #endif
275         }
276         else {
277                 Xi[0] = Z.hi;
278                 Xi[1] = Z.lo;
279         }
280 }
/*
 * GCM_MUL(ctx,Xi): multiply the context member named by the second
 * argument (Xi or Yi) by the hash key in place, 8-bit table variant.
 * ctx is parenthesized so that any pointer expression can be passed.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit((ctx)->Xi.u,(ctx)->Htable)
282
283 #elif   TABLE_BITS==4
284
285 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
286 {
287         int  i;
288         u128 V;
289
290         Htable[0].hi = 0;
291         Htable[0].lo = 0;
292         V.hi = H[0];
293         V.lo = H[1];
294
295         for (Htable[8]=V, i=4; i>0; i>>=1) {
296                 if (sizeof(size_t)==8) {
297                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
298                         V.lo  = (V.hi<<63)|(V.lo>>1);
299                         V.hi  = (V.hi>>1 )^T;
300                 }
301                 else {
302                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
303                         V.lo  = (V.hi<<63)|(V.lo>>1);
304                         V.hi  = (V.hi>>1 )^((u64)T<<32);
305                 }
306                 Htable[i] = V;
307         }
308
309 #if defined(OPENSSL_SMALL_FOOTPRINT)
310         for (i=2; i<16; i<<=1) {
311                 u128 *Hi = Htable+i;
312                 int   j;
313                 for (V=*Hi, j=1; j<i; ++j) {
314                         Hi[j].hi = V.hi^Htable[j].hi;
315                         Hi[j].lo = V.lo^Htable[j].lo;
316                 }
317         }
318 #else
319         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
320         V=Htable[4];
321         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
322         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
323         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
324         V=Htable[8];
325         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
326         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
327         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
328         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
329         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
330         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
331         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
332 #endif
333 }
334
335 #ifndef GHASH_ASM
336 static const size_t rem_4bit[16] = {
337         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
338         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
339         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
340         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
341
342 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
343 {
344         u128 Z;
345         int cnt = 15;
346         size_t rem, nlo, nhi;
347         const union { long one; char little; } is_endian = {1};
348
349         nlo  = ((const u8 *)Xi)[15];
350         nhi  = nlo>>4;
351         nlo &= 0xf;
352
353         Z.hi = Htable[nlo].hi;
354         Z.lo = Htable[nlo].lo;
355
356         while (1) {
357                 rem  = (size_t)Z.lo&0xf;
358                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
359                 Z.hi = (Z.hi>>4);
360                 if (sizeof(size_t)==8)
361                         Z.hi ^= rem_4bit[rem];
362                 else
363                         Z.hi ^= (u64)rem_4bit[rem]<<32;
364
365                 Z.hi ^= Htable[nhi].hi;
366                 Z.lo ^= Htable[nhi].lo;
367
368                 if (--cnt<0)            break;
369
370                 nlo  = ((const u8 *)Xi)[cnt];
371                 nhi  = nlo>>4;
372                 nlo &= 0xf;
373
374                 rem  = (size_t)Z.lo&0xf;
375                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
376                 Z.hi = (Z.hi>>4);
377                 if (sizeof(size_t)==8)
378                         Z.hi ^= rem_4bit[rem];
379                 else
380                         Z.hi ^= (u64)rem_4bit[rem]<<32;
381
382                 Z.hi ^= Htable[nlo].hi;
383                 Z.lo ^= Htable[nlo].lo;
384         }
385
386         if (is_endian.little) {
387 #ifdef BSWAP8
388                 Xi[0] = BSWAP8(Z.hi);
389                 Xi[1] = BSWAP8(Z.lo);
390 #else
391                 u8 *p = (u8 *)Xi;
392                 u32 v;
393                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
394                 v = (u32)(Z.hi);        PUTU32(p+4,v);
395                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
396                 v = (u32)(Z.lo);        PUTU32(p+12,v);
397 #endif
398         }
399         else {
400                 Xi[0] = Z.hi;
401                 Xi[1] = Z.lo;
402         }
403 }
404
405 #if !defined(OPENSSL_SMALL_FOOTPRINT)
406 /*
407  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
408  * details... Compiler-generated code doesn't seem to give any
409  * performance improvement, at least not on x86[_64]. It's here
410  * mostly as reference and a placeholder for possible future
411  * non-trivial optimization[s]...
412  */
413 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
414                                 const u8 *inp,size_t len)
415 {
416     u128 Z;
417     int cnt;
418     size_t rem, nlo, nhi;
419     const union { long one; char little; } is_endian = {1};
420
421     do {
422         cnt  = 15;
423         nlo  = ((const u8 *)Xi)[15];
424         nlo ^= inp[15];
425         nhi  = nlo>>4;
426         nlo &= 0xf;
427
428         Z.hi = Htable[nlo].hi;
429         Z.lo = Htable[nlo].lo;
430
431         while (1) {
432                 rem  = (size_t)Z.lo&0xf;
433                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
434                 Z.hi = (Z.hi>>4);
435                 if (sizeof(size_t)==8)
436                         Z.hi ^= rem_4bit[rem];
437                 else
438                         Z.hi ^= (u64)rem_4bit[rem]<<32;
439
440                 Z.hi ^= Htable[nhi].hi;
441                 Z.lo ^= Htable[nhi].lo;
442
443                 if (--cnt<0)            break;
444
445                 nlo  = ((const u8 *)Xi)[cnt];
446                 nlo ^= inp[cnt];
447                 nhi  = nlo>>4;
448                 nlo &= 0xf;
449
450                 rem  = (size_t)Z.lo&0xf;
451                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
452                 Z.hi = (Z.hi>>4);
453                 if (sizeof(size_t)==8)
454                         Z.hi ^= rem_4bit[rem];
455                 else
456                         Z.hi ^= (u64)rem_4bit[rem]<<32;
457
458                 Z.hi ^= Htable[nlo].hi;
459                 Z.lo ^= Htable[nlo].lo;
460         }
461
462         if (is_endian.little) {
463 #ifdef BSWAP8
464                 Xi[0] = BSWAP8(Z.hi);
465                 Xi[1] = BSWAP8(Z.lo);
466 #else
467                 u8 *p = (u8 *)Xi;
468                 u32 v;
469                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
470                 v = (u32)(Z.hi);        PUTU32(p+4,v);
471                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
472                 v = (u32)(Z.lo);        PUTU32(p+12,v);
473 #endif
474         }
475         else {
476                 Xi[0] = Z.hi;
477                 Xi[1] = Z.lo;
478         }
479     } while (inp+=16, len-=16);
480 }
481 #endif
482 #else
483 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
484 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
485 #endif
486
/*
 * GCM_MUL(ctx,Xi): multiply the context member named by the second
 * argument (Xi or Yi) by the hash key in place, 4-bit table variant.
 * Macro arguments are parenthesized so that any expression can be passed.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* GHASH(in,len,ctx): fold len bytes at in into ctx->Xi via gcm_ghash_4bit. */
#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,(in),(len))
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it
 * is still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       1024
#endif
495
496 #else   /* TABLE_BITS */
497
498 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
499 {
500         u128 V,Z = { 0,0 };
501         long X;
502         int  i,j;
503         const long *xi = (const long *)Xi;
504         const union { long one; char little; } is_endian = {1};
505
506         V.hi = H[0];    /* H is in host byte order, no byte swapping */
507         V.lo = H[1];
508
509         for (j=0; j<16/sizeof(long); ++j) {
510                 if (is_endian.little) {
511                         if (sizeof(long)==8) {
512 #ifdef BSWAP8
513                                 X = (long)(BSWAP8(xi[j]));
514 #else
515                                 const u8 *p = (const u8 *)(xi+j);
516                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
517 #endif
518                         }
519                         else {
520                                 const u8 *p = (const u8 *)(xi+j);
521                                 X = (long)GETU32(p);
522                         }
523                 }
524                 else
525                         X = xi[j];
526
527                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
528                         u64 M = (u64)(X>>(8*sizeof(long)-1));
529                         Z.hi ^= V.hi&M;
530                         Z.lo ^= V.lo&M;
531
532                         if (sizeof(size_t)==8) {
533                                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
534                                 V.lo  = (V.hi<<63)|(V.lo>>1);
535                                 V.hi  = (V.hi>>1 )^T;
536                         }
537                         else {
538                                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
539                                 V.lo  = (V.hi<<63)|(V.lo>>1);
540                                 V.hi  = (V.hi>>1 )^((u64)T<<32);
541                         }
542                                 
543                 }
544         }
545
546         if (is_endian.little) {
547 #ifdef BSWAP8
548                 Xi[0] = BSWAP8(Z.hi);
549                 Xi[1] = BSWAP8(Z.lo);
550 #else
551                 u8 *p = (u8 *)Xi;
552                 u32 v;
553                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
554                 v = (u32)(Z.hi);        PUTU32(p+4,v);
555                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
556                 v = (u32)(Z.lo);        PUTU32(p+12,v);
557 #endif
558         }
559         else {
560                 Xi[0] = Z.hi;
561                 Xi[1] = Z.lo;
562         }
563 }
/*
 * GCM_MUL(ctx,Xi): multiply the context member named by the second
 * argument (Xi or Yi) by the hash key in place, table-less 1-bit variant.
 * ctx is parenthesized so that any pointer expression can be passed.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit((ctx)->Xi.u,(ctx)->H.u)
565
566 #endif
567
568 typedef struct {
569         /* Following 6 names follow names in GCM specification */
570         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
571                                                 Xi,H,
572                                                 len;
573         /* Pre-computed table used by gcm_gmult_* */
574 #if TABLE_BITS==8
575         u128 Htable[256];
576 #else
577         u128 Htable[16];
578 #endif
579         unsigned int res, ctr;
580         block128_f block;
581         void *key;
582 } GCM128_CONTEXT;
583
584 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
585 {
586         const union { long one; char little; } is_endian = {1};
587
588         memset(ctx,0,sizeof(*ctx));
589         ctx->block = block;
590         ctx->key   = key;
591
592         (*block)(ctx->H.c,ctx->H.c,key);
593
594         if (is_endian.little) {
595                 /* H is stored in host byte order */
596 #ifdef BSWAP8
597                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
598                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
599 #else
600                 u8 *p = ctx->H.c;
601                 u64 hi,lo;
602                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
603                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
604                 ctx->H.u[0] = hi;
605                 ctx->H.u[1] = lo;
606 #endif
607         }
608
609 #if     TABLE_BITS==8
610         gcm_init_8bit(ctx->Htable,ctx->H.u);
611 #elif   TABLE_BITS==4
612         gcm_init_4bit(ctx->Htable,ctx->H.u);
613 #endif
614 }
615
616 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
617 {
618         const union { long one; char little; } is_endian = {1};
619
620         ctx->Yi.u[0]  = 0;
621         ctx->Yi.u[1]  = 0;
622         ctx->Xi.u[0]  = 0;
623         ctx->Xi.u[1]  = 0;
624         ctx->len.u[0] = 0;
625         ctx->len.u[1] = 0;
626         ctx->res = 0;
627
628         if (len==12) {
629                 memcpy(ctx->Yi.c,iv,12);
630                 ctx->Yi.c[15]=1;
631                 ctx->ctr=1;
632         }
633         else {
634                 size_t i;
635                 u64 len0 = len;
636
637                 while (len>=16) {
638                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
639                         GCM_MUL(ctx,Yi);
640                         iv += 16;
641                         len -= 16;
642                 }
643                 if (len) {
644                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
645                         GCM_MUL(ctx,Yi);
646                 }
647                 len0 <<= 3;
648                 if (is_endian.little) {
649 #ifdef BSWAP8
650                         ctx->Yi.u[1]  ^= BSWAP8(len0);
651 #else
652                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
653                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
654                         ctx->Yi.c[10] ^= (u8)(len0>>40);
655                         ctx->Yi.c[11] ^= (u8)(len0>>32);
656                         ctx->Yi.c[12] ^= (u8)(len0>>24);
657                         ctx->Yi.c[13] ^= (u8)(len0>>16);
658                         ctx->Yi.c[14] ^= (u8)(len0>>8);
659                         ctx->Yi.c[15] ^= (u8)(len0);
660 #endif
661                 }
662                 else
663                         ctx->Yi.u[1]  ^= len0;
664
665                 GCM_MUL(ctx,Yi);
666
667                 if (is_endian.little)
668                         ctx->ctr = GETU32(ctx->Yi.c+12);
669                 else
670                         ctx->ctr = ctx->Yi.d[3];
671         }
672
673         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
674         ++ctx->ctr;
675         if (is_endian.little)
676                 PUTU32(ctx->Yi.c+12,ctx->ctr);
677         else
678                 ctx->Yi.d[3] = ctx->ctr;
679 }
680
681 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
682 {
683         size_t i;
684
685         ctx->len.u[0] += len;
686
687 #ifdef GHASH
688         if ((i = (len&(size_t)-16))) {
689                 GHASH(aad,i,ctx);
690                 aad += i;
691                 len -= i;
692         }
693 #else
694         while (len>=16) {
695                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
696                 GCM_MUL(ctx,Xi);
697                 aad += 16;
698                 len -= 16;
699         }
700 #endif
701         if (len) {
702                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
703                 GCM_MUL(ctx,Xi);
704         }
705 }
706
707 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
708                 const unsigned char *in, unsigned char *out,
709                 size_t len)
710 {
711         const union { long one; char little; } is_endian = {1};
712         unsigned int n, ctr;
713         size_t i;
714
715         ctx->len.u[1] += len;
716         n   = ctx->res;
717         ctr = ctx->ctr;
718
719 #if !defined(OPENSSL_SMALL_FOOTPRINT)
720         if (16%sizeof(size_t) == 0) do {        /* always true actually */
721                 if (n) {
722                         while (n && len) {
723                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
724                                 --len;
725                                 n = (n+1)%16;
726                         }
727                         if (n==0) GCM_MUL(ctx,Xi);
728                         else {
729                                 ctx->res = n;
730                                 return;
731                         }
732                 }
733 #if defined(STRICT_ALIGNMENT)
734                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
735                         break;
736 #endif
737 #if defined(GHASH) && defined(GHASH_CHUNK)
738                 while (len>=GHASH_CHUNK) {
739                     size_t j=GHASH_CHUNK;
740
741                     while (j) {
742                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
743                         ++ctr;
744                         if (is_endian.little)
745                                 PUTU32(ctx->Yi.c+12,ctr);
746                         else
747                                 ctx->Yi.d[3] = ctr;
748                         for (i=0; i<16; i+=sizeof(size_t))
749                                 *(size_t *)(out+i) =
750                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
751                         out += 16;
752                         in  += 16;
753                         j   -= 16;
754                     }
755                     GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
756                     len -= GHASH_CHUNK;
757                 }
758                 if ((i = (len&(size_t)-16))) {
759                     size_t j=i;
760
761                     while (len>=16) {
762                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
763                         ++ctr;
764                         if (is_endian.little)
765                                 PUTU32(ctx->Yi.c+12,ctr);
766                         else
767                                 ctx->Yi.d[3] = ctr;
768                         for (i=0; i<16; i+=sizeof(size_t))
769                                 *(size_t *)(out+i) =
770                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
771                         out += 16;
772                         in  += 16;
773                         len -= 16;
774                     }
775                     GHASH(out-j,j,ctx);
776                 }
777 #else
778                 while (len>=16) {
779                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
780                         ++ctr;
781                         if (is_endian.little)
782                                 PUTU32(ctx->Yi.c+12,ctr);
783                         else
784                                 ctx->Yi.d[3] = ctr;
785                         for (i=0; i<16; i+=sizeof(size_t))
786                                 *(size_t *)(ctx->Xi.c+i) ^=
787                                 *(size_t *)(out+i) =
788                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
789                         GCM_MUL(ctx,Xi);
790                         out += 16;
791                         in  += 16;
792                         len -= 16;
793                 }
794 #endif
795                 if (len) {
796                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
797                         ++ctr;
798                         if (is_endian.little)
799                                 PUTU32(ctx->Yi.c+12,ctr);
800                         else
801                                 ctx->Yi.d[3] = ctr;
802                         while (len--) {
803                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
804                                 ++n;
805                         }
806                 }
807
808                 ctx->res = n;
809                 ctx->ctr = ctr;
810                 return;
811         } while(0);
812 #endif
813         for (i=0;i<len;++i) {
814                 if (n==0) {
815                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
816                         ++ctr;
817                         if (is_endian.little)
818                                 PUTU32(ctx->Yi.c+12,ctr);
819                         else
820                                 ctx->Yi.d[3] = ctr;
821                 }
822                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
823                 n = (n+1)%16;
824                 if (n==0)
825                         GCM_MUL(ctx,Xi);
826         }
827
828         ctx->res = n;
829         ctx->ctr = ctr;
830 }
831
832 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
833                 const unsigned char *in, unsigned char *out,
834                 size_t len)
835 {
836         const union { long one; char little; } is_endian = {1};
837         unsigned int n, ctr;
838         size_t i;
839
840         ctx->len.u[1] += len;
841         n   = ctx->res;
842         ctr = ctx->ctr;
843
844 #if !defined(OPENSSL_SMALL_FOOTPRINT)
845         if (16%sizeof(size_t) == 0) do {        /* always true actually */
846                 if (n) {
847                         while (n && len) {
848                                 u8 c = *(in++);
849                                 *(out++) = c^ctx->EKi.c[n];
850                                 ctx->Xi.c[n] ^= c;
851                                 --len;
852                                 n = (n+1)%16;
853                         }
854                         if (n==0) GCM_MUL (ctx,Xi);
855                         else {
856                                 ctx->res = n;
857                                 return;
858                         }
859                 }
860 #if defined(STRICT_ALIGNMENT)
861                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
862                         break;
863 #endif
864 #if defined(GHASH) && defined(GHASH_CHUNK)
865                 while (len>=GHASH_CHUNK) {
866                     size_t j=GHASH_CHUNK;
867
868                     GHASH(in,GHASH_CHUNK,ctx);
869                     while (j) {
870                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
871                         ++ctr;
872                         if (is_endian.little)
873                                 PUTU32(ctx->Yi.c+12,ctr);
874                         else
875                                 ctx->Yi.d[3] = ctr;
876                         for (i=0; i<16; i+=sizeof(size_t))
877                                 *(size_t *)(out+i) =
878                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
879                         out += 16;
880                         in  += 16;
881                         j   -= 16;
882                     }
883                     len -= GHASH_CHUNK;
884                 }
885                 if ((i = (len&(size_t)-16))) {
886                     GHASH(in,i,ctx);
887                     while (len>=16) {
888                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
889                         ++ctr;
890                         if (is_endian.little)
891                                 PUTU32(ctx->Yi.c+12,ctr);
892                         else
893                                 ctx->Yi.d[3] = ctr;
894                         for (i=0; i<16; i+=sizeof(size_t))
895                                 *(size_t *)(out+i) =
896                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
897                         out += 16;
898                         in  += 16;
899                         len -= 16;
900                     }
901                 }
902 #else
903                 while (len>=16) {
904                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
905                         ++ctr;
906                         if (is_endian.little)
907                                 PUTU32(ctx->Yi.c+12,ctr);
908                         else
909                                 ctx->Yi.d[3] = ctr;
910                         for (i=0; i<16; i+=sizeof(size_t)) {
911                                 size_t c = *(size_t *)(in+i);
912                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
913                                 *(size_t *)(ctx->Xi.c+i) ^= c;
914                         }
915                         GCM_MUL(ctx,Xi);
916                         out += 16;
917                         in  += 16;
918                         len -= 16;
919                 }
920 #endif
921                 if (len) {
922                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
923                         ++ctr;
924                         if (is_endian.little)
925                                 PUTU32(ctx->Yi.c+12,ctr);
926                         else
927                                 ctx->Yi.d[3] = ctr;
928                         while (len--) {
929                                 u8 c = in[n];
930                                 ctx->Xi.c[n] ^= c;
931                                 out[n] = c^ctx->EKi.c[n];
932                                 ++n;
933                         }
934                 }
935
936                 ctx->res = n;
937                 ctx->ctr = ctr;
938                 return;
939         } while(0);
940 #endif
941         for (i=0;i<len;++i) {
942                 u8 c;
943                 if (n==0) {
944                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
945                         ++ctr;
946                         if (is_endian.little)
947                                 PUTU32(ctx->Yi.c+12,ctr);
948                         else
949                                 ctx->Yi.d[3] = ctr;
950                 }
951                 c = in[i];
952                 out[i] ^= ctx->EKi.c[n];
953                 ctx->Xi.c[n] ^= c;
954                 n = (n+1)%16;
955                 if (n==0)
956                         GCM_MUL(ctx,Xi);
957         }
958
959         ctx->res = n;
960         ctx->ctr = ctr;
961 }
962
963 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
964 {
965         const union { long one; char little; } is_endian = {1};
966         u64 alen = ctx->len.u[0]<<3;
967         u64 clen = ctx->len.u[1]<<3;
968
969         if (ctx->res)
970                 GCM_MUL(ctx,Xi);
971
972         if (is_endian.little) {
973 #ifdef BSWAP8
974                 alen = BSWAP8(alen);
975                 clen = BSWAP8(clen);
976 #else
977                 u8 *p = ctx->len.c;
978
979                 ctx->len.u[0] = alen;
980                 ctx->len.u[1] = clen;
981
982                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
983                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
984 #endif
985         }
986
987         ctx->Xi.u[0] ^= alen;
988         ctx->Xi.u[1] ^= clen;
989         GCM_MUL(ctx,Xi);
990
991         ctx->Xi.u[0] ^= ctx->EK0.u[0];
992         ctx->Xi.u[1] ^= ctx->EK0.u[1];
993 }
994
995 #if defined(SELFTEST)
996 #include <stdio.h>
997 #include <openssl/aes.h>
998
999 /* Test Case 1 */
1000 static const u8 K1[16],
1001                 *P1=NULL,
1002                 *A1=NULL,
1003                 IV1[12],
1004                 *C1=NULL,
1005                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1006
1007 /* Test Case 2 */
1008 #define K2 K1
1009 #define A2 A1
1010 #define IV2 IV1
1011 static const u8 P2[16],
1012                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1013                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1014
1015 /* Test Case 3 */
1016 #define A3 A2
1017 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1018                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1019                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1020                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1021                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1022                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1023                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1024                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1025                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1026                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1027                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4,};
1028
1029 /* Test Case 4 */
1030 #define K4 K3
1031 #define IV4 IV3
1032 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1033                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1034                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1035                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1036                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1037                         0xab,0xad,0xda,0xd2},
1038                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1039                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1040                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1041                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1042                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1043
1044 /* Test Case 5 */
1045 #define K5 K4
1046 #define P5 P4
1047 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1048                         0xab,0xad,0xda,0xd2},
1049                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1050                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1051                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1052                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1053                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1054                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1055
1056 /* Test Case 6 */
1057 #define K6 K5
1058 #define P6 P5
1059 #define A6 A5
1060 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1061                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1062                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1063                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1064                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1065                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1066                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1067                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1068                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1069
1070 /* Test Case 7 */
1071 static const u8 K7[24],
1072                 *P7=NULL,
1073                 *A7=NULL,
1074                 IV7[12],
1075                 *C7=NULL,
1076                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1077
1078 /* Test Case 8 */
1079 #define K8 K7
1080 #define IV8 IV7
1081 #define A8 A7
1082 static const u8 P8[16],
1083                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1084                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1085
1086 /* Test Case 9 */
1087 #define A9 A8
1088 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1089                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1090                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1091                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1092                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1093                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1094                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1095                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1096                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1097                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1098                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1099                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1100
1101 /* Test Case 10 */
1102 #define K10 K9
1103 #define IV10 IV9
1104 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1105                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1106                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1107                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1108                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1109                         0xab,0xad,0xda,0xd2},
1110                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1111                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1112                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1113                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1114                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1115
1116 /* Test Case 11 */
1117 #define K11 K10
1118 #define P11 P10
1119 #define A11 A10
1120 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1121                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1122                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1123                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1124                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1125                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1126
1127 /* Test Case 12 */
1128 #define K12 K11
1129 #define P12 P11
1130 #define A12 A11
1131 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1132                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1133                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1134                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1135                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1136                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1137                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1138                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1139                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1140
1141 /* Test Case 13 */
1142 static const u8 K13[32],
1143                 *P13=NULL,
1144                 *A13=NULL,
1145                 IV13[12],
1146                 *C13=NULL,
1147                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1148
1149 /* Test Case 14 */
1150 #define K14 K13
1151 #define A14 A13
1152 static const u8 P14[16],
1153                 IV14[12],
1154                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1155                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1156
1157 /* Test Case 15 */
1158 #define A15 A14
1159 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1160                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1161                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1162                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1163                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1164                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1165                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1166                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1167                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1168                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1169                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1170                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1171
1172 /* Test Case 16 */
1173 #define K16 K15
1174 #define IV16 IV15
1175 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1176                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1177                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1178                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1179                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1180                         0xab,0xad,0xda,0xd2},
1181                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1182                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1183                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1184                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1185                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1186
1187 /* Test Case 17 */
1188 #define K17 K16
1189 #define P17 P16
1190 #define A17 A16
1191 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1192                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1193                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1194                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1195                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1196                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1197
1198 /* Test Case 18 */
1199 #define K18 K17
1200 #define P18 P17
1201 #define A18 A17
1202 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1203                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1204                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1205                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1206                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1207                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1208                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1209                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1210                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1211
/*
 * TEST_CASE(n) - run self-test vector n through both directions.
 *
 * Expects three variables in the enclosing scope: GCM128_CONTEXT ctx,
 * AES_KEY key and int ret.  For vector n it
 *   1. keys AES with Kn (sizeof(Kn)*8 bits) and initialises ctx;
 *   2. sets IVn, feeds An if non-NULL, encrypts Pn if non-NULL, finishes,
 *      then compares ctx.Xi.c with Tn and the output with Cn;
 *   3. sets IVn again and repeats with CRYPTO_gcm128_decrypt on Cn,
 *      comparing the tag with Tn and the output with Pn.
 * Every mismatch increments ret and prints a message.
 *
 * The scratch buffer is cleared between the two passes.  Otherwise it would
 * still hold the ciphertext produced in step 2, and a decrypt routine that
 * merely XORs key stream into its output buffer would pass undetected.
 *
 * When Pn is a NULL pointer sizeof(Pn) is the size of a pointer; the buffer
 * is then declared but never compared, because Cn/Pn are NULL as well.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        CRYPTO_gcm128_finish(&ctx);                             \
        if (memcmp(ctx.Xi.c,T##n,16) || (C##n && memcmp(out,C##n,sizeof(out)))) \
                ret++, printf ("encrypt test#%d failed.\n",n);\
        memset(out,0,sizeof(out));                              \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        CRYPTO_gcm128_finish(&ctx);                             \
        if (memcmp(ctx.Xi.c,T##n,16) || (P##n && memcmp(out,P##n,sizeof(out)))) \
                ret++, printf ("decrypt test#%d failed.\n",n);\
        } while(0)
1229
1230 int main()
1231 {
1232         GCM128_CONTEXT ctx;
1233         AES_KEY key;
1234         int ret=0;
1235
1236         TEST_CASE(1);
1237         TEST_CASE(2);
1238         TEST_CASE(3);
1239         TEST_CASE(4);
1240         TEST_CASE(5);
1241         TEST_CASE(6);
1242         TEST_CASE(7);
1243         TEST_CASE(8);
1244         TEST_CASE(9);
1245         TEST_CASE(10);
1246         TEST_CASE(11);
1247         TEST_CASE(12);
1248         TEST_CASE(13);
1249         TEST_CASE(14);
1250         TEST_CASE(15);
1251         TEST_CASE(16);
1252         TEST_CASE(17);
1253         TEST_CASE(18);
1254
1255 #ifdef OPENSSL_CPUID_OBJ
1256         {
1257         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1258         union { u64 u; u8 c[1024]; } buf;
1259
1260         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1261         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1262         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1263
1264         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1265         start = OPENSSL_rdtsc();
1266         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1267         gcm_t = OPENSSL_rdtsc() - start;
1268
1269         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1270                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1271                         (block128_f)AES_encrypt);
1272         start = OPENSSL_rdtsc();
1273         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1274                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1275                         (block128_f)AES_encrypt);
1276         ctr_t = OPENSSL_rdtsc() - start;
1277
1278         printf("%.2f-%.2f=%.2f\n",
1279                         gcm_t/(double)sizeof(buf),
1280                         ctr_t/(double)sizeof(buf),
1281                         (gcm_t-ctr_t)/(double)sizeof(buf));
1282 #ifdef GHASH
1283         GHASH(buf.c,sizeof(buf),&ctx);
1284         start = OPENSSL_rdtsc();
1285         GHASH(buf.c,sizeof(buf),&ctx);
1286         gcm_t = OPENSSL_rdtsc() - start;
1287         printf("%.2f\n",gcm_t/(double)sizeof(buf));
1288 #endif
1289         }
1290 #endif
1291
1292         return ret;
1293 }
1294 #endif