gcm128.c: add option for streamed GHASH, simple benchmark, minor naming
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
/*
 * Integer aliases used throughout this file.  The 64-bit types and the
 * U64() macro, which appends the matching suffix to an integer literal,
 * are selected per compiler/ABI.
 */
#if (defined(_WIN32) || defined(_WIN64)) && !defined(__MINGW32__)
/* Windows compilers other than MinGW: use the __int64 extension */
typedef __int64 i64;
typedef unsigned __int64 u64;
#define U64(C) C##UI64
#elif defined(__arch64__)
/* __arch64__ targets: plain long is used as the 64-bit type */
typedef long i64;
typedef unsigned long u64;
#define U64(C) C##UL
#else
typedef long long i64;
typedef unsigned long long u64;
#define U64(C) C##ULL
#endif

/* NOTE(review): u32 assumes a 32-bit unsigned int; nothing here checks it */
typedef unsigned int u32;
typedef unsigned char u8;
/* 128-bit quantity kept as two 64-bit halves */
typedef struct { u64 hi,lo; } u128;
77
/*
 * STRICT_ALIGNMENT is defined by default and removed again for the
 * x86, x86_64 and s390[x] targets listed below.  Code later in this
 * file tests it before accessing caller buffers through size_t
 * pointers.
 */
#define STRICT_ALIGNMENT
#if defined(__i386)     || defined(__i386__)    || \
    defined(__x86_64)   || defined(__x86_64__)  || \
    defined(_M_IX86)    || defined(_M_AMD64)    || defined(_M_X64) || \
    defined(__s390__)   || defined(__s390x__)
# undef STRICT_ALIGNMENT
#endif
85
/*
 * BSWAP8(x)/BSWAP4(x): reverse the byte order of a 64-/32-bit value.
 * They are only defined for GCC on x86/x86_64 (inline bswap
 * instructions) and for MSVC >= 1300 (byte-swap intrinsics).  On every
 * other compiler/target both macros stay undefined and the rest of the
 * file tests for that with #ifdef.
 */
#if defined(__GNUC__) && __GNUC__>=2
# if defined(__x86_64) || defined(__x86_64__)
#  define BSWAP8(x) ({  u64 ret=(x);                    \
                        asm volatile ("bswapq %0"       \
                        : "+r"(ret));   ret;            })
#  define BSWAP4(x) ({  u32 ret=(x);                    \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret));   ret;            })
# elif defined(__i386) || defined(__i386__)
/* 32-bit x86: swap each half separately, then exchange the halves */
#  define BSWAP8(x) ({  u32 lo=(u64)(x)>>32,hi=(x);     \
                        asm volatile ("bswapl %0; bswapl %1"    \
                        : "+r"(hi),"+r"(lo));           \
                        (u64)hi<<32|lo;                 })
#  define BSWAP4(x) ({  u32 ret=(x);                    \
                        asm volatile ("bswapl %0"       \
                        : "+r"(ret));   ret;            })
# endif
#elif defined(_MSC_VER)
# if _MSC_VER>=1300
#  pragma intrinsic(_byteswap_uint64,_byteswap_ulong)
#  define BSWAP8(x)     _byteswap_uint64((u64)(x))
#  define BSWAP4(x)     _byteswap_ulong((u32)(x))
# elif defined(_M_IX86)
/* empty branch: nothing is defined for older MSVC on x86 */
# endif
#endif
111
/*
 * GETU32(p)/PUTU32(p,v): load/store a 32-bit value at p in big-endian
 * byte order.
 *
 * With BSWAP4 available the access is a single u32 load/store through a
 * cast pointer followed/preceded by a byte swap.
 * NOTE(review): that form dereferences p as u32*, so it relies on the
 * target tolerating unaligned u32 access - confirm for every target on
 * which BSWAP4 gets defined.
 *
 * The portable fallback works byte by byte and evaluates p four times,
 * so arguments with side effects must be avoided.
 */
#ifdef BSWAP4
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#else
#define GETU32(p)       ((u32)(p)[0]<<24|(u32)(p)[1]<<16|(u32)(p)[2]<<8|(u32)(p)[3])
#define PUTU32(p,v)     ((p)[0]=(u8)((v)>>24),(p)[1]=(u8)((v)>>16),(p)[2]=(u8)((v)>>8),(p)[3]=(u8)(v))
#endif

/* PACK(s): place the 16-bit constant s in the top 16 bits of a size_t */
#define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
121
122 #if 0
123 /*
124  * Under ideal conditions 8-bit version should be twice as fast as
125  * 4-bit one. But world is far from ideal. For gcc-generated x86 code,
126  * 8-bit was observed to run "only" ~50% faster. On x86_64 observed
127  * improvement was ~75%, much closer to optimal, but the fact of
128  * deviation means that references to pre-computed tables end up on
129  * critical path and as tables are pretty big, 4KB per key+1KB shared,
130  * execution time is sensitive to cache trashing. It's not actually
131  * proven, but 4-bit procedure is believed to provide adequate
132  * all-round performance...
133  */  
134 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
135 {
136         int  i, j;
137         u128 V;
138
139         Htable[0].hi = 0;
140         Htable[0].lo = 0;
141         V.hi = H[0];
142         V.lo = H[1];
143
144         for (Htable[128]=V, i=64; i>0; i>>=1) {
145                 if (sizeof(size_t)==8) {
146                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
147                         V.lo  = (V.hi<<63)|(V.lo>>1);
148                         V.hi  = (V.hi>>1 )^T;
149                 }
150                 else {
151                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
152                         V.lo  = (V.hi<<63)|(V.lo>>1);
153                         V.hi  = (V.hi>>1) ^((u64)T<<32);
154                 }
155                 Htable[i] = V;
156         }
157
158         for (i=2; i<256; i<<=1) {
159                 u128 *Hi = Htable+i, H0 = *Hi;
160                 for (j=1; j<i; ++j) {
161                         Hi[j].hi = H0.hi^Htable[j].hi;
162                         Hi[j].lo = H0.lo^Htable[j].lo;
163                 }
164         }
165 }
166
/*
 * Multiply the 16-byte value at Xi by the hash key, in place, using the
 * 256-entry table built by gcm_init_8bit.
 *
 * Xi is read as 16 bytes from the last byte (offset 15) down to the
 * first.  For every byte the matching table entry is XORed into the
 * accumulator Z; between bytes Z is shifted right by 8 bits and the
 * byte that fell off is folded back into the top of Z.hi through
 * rem_8bit[].  The result is written back to Xi in big-endian byte
 * order.
 */
static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;
        size_t rem, n = *xi;
        /* run-time endianness probe: .little is 1 on little-endian hosts */
        const union { long one; char little; } is_endian = {1};
        /*
         * Reduction constants indexed by the byte shifted out of Z.lo;
         * PACK() keeps each 16-bit value in the top bits of a size_t.
         */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                /* the first byte of Xi has just been consumed: done */
                if ((u8 *)Xi==xi)       break;

                n = *(--xi);

                /* shift Z right by one byte and reduce what fell off */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        /* 32-bit size_t: move the constant up to the top of the u64 */
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* store Z back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
274 #endif
275
276 #define _4BIT 1 /* change to 0 to switch to 1-bit multiplication */
277
278 #if _4BIT
279 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
280 {
281         int  i;
282         u128 V;
283
284         Htable[0].hi = 0;
285         Htable[0].lo = 0;
286         V.hi = H[0];
287         V.lo = H[1];
288
289         for (Htable[8]=V, i=4; i>0; i>>=1) {
290                 if (sizeof(size_t)==8) {
291                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
292                         V.lo  = (V.hi<<63)|(V.lo>>1);
293                         V.hi  = (V.hi>>1 )^T;
294                 }
295                 else {
296                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
297                         V.lo  = (V.hi<<63)|(V.lo>>1);
298                         V.hi  = (V.hi>>1 )^((u64)T<<32);
299                 }
300                 Htable[i] = V;
301         }
302
303 #if defined(OPENSSL_SMALL_FOOTPRINT)
304         for (i=2; i<16; i<<=1) {
305                 u128 *Hi = Htable+i;
306                 int   j;
307                 for (V=*Hi, j=1; j<i; ++j) {
308                         Hi[j].hi = V.hi^Htable[j].hi;
309                         Hi[j].lo = V.lo^Htable[j].lo;
310                 }
311         }
312 #else
313         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
314         V=Htable[4];
315         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
316         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
317         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
318         V=Htable[8];
319         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
320         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
321         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
322         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
323         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
324         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
325         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
326 #endif
327 }
328
329 #ifndef GMULT_ASM
/*
 * Reduction constants for the 4-bit multiplication routines, indexed by
 * the four bits shifted out of the low end of the accumulator.  PACK()
 * keeps each 16-bit constant in the top 16 bits of a size_t; users
 * shift it up by a further 32 bits when size_t is only 32 bits wide.
 */
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
335
336 static void gcm_gmult_4bit(u64 Xi[2], u128 Htable[16])
337 {
338         u128 Z;
339         int cnt = 15;
340         size_t rem, nlo, nhi;
341         const union { long one; char little; } is_endian = {1};
342
343         nlo  = ((const u8 *)Xi)[15];
344         nhi  = nlo>>4;
345         nlo &= 0xf;
346
347         Z.hi = Htable[nlo].hi;
348         Z.lo = Htable[nlo].lo;
349
350         while (1) {
351                 rem  = (size_t)Z.lo&0xf;
352                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
353                 Z.hi = (Z.hi>>4);
354                 if (sizeof(size_t)==8)
355                         Z.hi ^= rem_4bit[rem];
356                 else
357                         Z.hi ^= (u64)rem_4bit[rem]<<32;
358
359                 Z.hi ^= Htable[nhi].hi;
360                 Z.lo ^= Htable[nhi].lo;
361
362                 if (--cnt<0)            break;
363
364                 nlo  = ((const u8 *)Xi)[cnt];
365                 nhi  = nlo>>4;
366                 nlo &= 0xf;
367
368                 rem  = (size_t)Z.lo&0xf;
369                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
370                 Z.hi = (Z.hi>>4);
371                 if (sizeof(size_t)==8)
372                         Z.hi ^= rem_4bit[rem];
373                 else
374                         Z.hi ^= (u64)rem_4bit[rem]<<32;
375
376                 Z.hi ^= Htable[nlo].hi;
377                 Z.lo ^= Htable[nlo].lo;
378         }
379
380         if (is_endian.little) {
381 #ifdef BSWAP8
382                 Xi[0] = BSWAP8(Z.hi);
383                 Xi[1] = BSWAP8(Z.lo);
384 #else
385                 u8 *p = (u8 *)Xi;
386                 u32 v;
387                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
388                 v = (u32)(Z.hi);        PUTU32(p+4,v);
389                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
390                 v = (u32)(Z.lo);        PUTU32(p+12,v);
391 #endif
392         }
393         else {
394                 Xi[0] = Z.hi;
395                 Xi[1] = Z.lo;
396         }
397 }
398
399 #if !defined(OPENSSL_SMALL_FOOTPRINT)
400 /*
401  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
402  * details... It doesn't give any performance improvement, at least
403  * not on x86[_64]. It's here mostly as a placeholder for possible
404  * future non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2], u128 Htable[16])
407 {
408     u128 Z;
409     int cnt;
410     size_t rem, nlo, nhi;
411     const union { long one; char little; } is_endian = {1};
412
413     do {
414         cnt  = 15;
415         nlo  = ((const u8 *)Xi)[15];
416         nlo ^= inp[15];
417         nhi  = nlo>>4;
418         nlo &= 0xf;
419
420         Z.hi = Htable[nlo].hi;
421         Z.lo = Htable[nlo].lo;
422
423         while (1) {
424                 rem  = (size_t)Z.lo&0xf;
425                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
426                 Z.hi = (Z.hi>>4);
427                 if (sizeof(size_t)==8)
428                         Z.hi ^= rem_4bit[rem];
429                 else
430                         Z.hi ^= (u64)rem_4bit[rem]<<32;
431
432                 Z.hi ^= Htable[nhi].hi;
433                 Z.lo ^= Htable[nhi].lo;
434
435                 if (--cnt<0)            break;
436
437                 nlo  = ((const u8 *)Xi)[cnt];
438                 nlo ^= inp[cnt];
439                 nhi  = nlo>>4;
440                 nlo &= 0xf;
441
442                 rem  = (size_t)Z.lo&0xf;
443                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
444                 Z.hi = (Z.hi>>4);
445                 if (sizeof(size_t)==8)
446                         Z.hi ^= rem_4bit[rem];
447                 else
448                         Z.hi ^= (u64)rem_4bit[rem]<<32;
449
450                 Z.hi ^= Htable[nlo].hi;
451                 Z.lo ^= Htable[nlo].lo;
452         }
453
454         if (is_endian.little) {
455 #ifdef BSWAP8
456                 Xi[0] = BSWAP8(Z.hi);
457                 Xi[1] = BSWAP8(Z.lo);
458 #else
459                 u8 *p = (u8 *)Xi;
460                 u32 v;
461                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
462                 v = (u32)(Z.hi);        PUTU32(p+4,v);
463                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
464                 v = (u32)(Z.lo);        PUTU32(p+12,v);
465 #endif
466         }
467         else {
468                 Xi[0] = Z.hi;
469                 Xi[1] = Z.lo;
470         }
471     } while (inp+=16, len-=16);
472 }
473 #endif
474 #else
/*
 * GMULT_ASM build: the C bodies above are compiled out and both
 * routines are only declared here, with the same signatures as the C
 * versions.  Presumably an assembler module supplies them - confirm
 * against the build files.
 */
void gcm_gmult_4bit(u64 Xi[2],u128 Htable[16]);
void gcm_ghash_4bit(const u8 *inp,size_t len,u64 Xi[2],u128 Htable[16]);
477 #endif
478
/*
 * GCM_MUL(ctx,Xi): multiply the named 16-byte member of ctx by the hash
 * key, in place.  The second argument is pasted in as a member name, so
 * GCM_MUL(ctx,Yi) operates on ctx->Yi.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
/*
 * GHASH(in,len,ctx): fold len bytes (a multiple of 16) into ctx->Xi.
 * gcm_ghash_4bit is only available in the GMULT_ASM build or when
 * OPENSSL_SMALL_FOOTPRINT is not set, so GHASH is defined under exactly
 * that condition.  Callers test "#ifdef GHASH" and otherwise fall back
 * to per-block GCM_MUL.
 */
#if defined(GMULT_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
# define GHASH(in,len,ctx) gcm_ghash_4bit(in,len,ctx->Xi.u,ctx->Htable)
#endif
/* number of bytes en-/decrypted between two GHASH calls on the bulk path */
#define GHASH_CHUNK       1024
482
483 #else   /* !_4BIT */
484
485 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
486 {
487         u128 V,Z = { 0,0 };
488         long X;
489         int  i,j;
490         const long *xi = (const long *)Xi;
491         const union { long one; char little; } is_endian = {1};
492
493         V.hi = H[0];    /* H is in host byte order, no byte swapping */
494         V.lo = H[1];
495
496         for (j=0; j<16/sizeof(long); ++j) {
497                 if (is_endian.little) {
498                         if (sizeof(long)==8) {
499 #ifdef BSWAP8
500                                 X = (long)(BSWAP8(xi[j]));
501 #else
502                                 const u8 *p = (const u8 *)(xi+j);
503                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
504 #endif
505                         }
506                         else {
507                                 const u8 *p = (const u8 *)(xi+j);
508                                 X = (long)GETU32(p);
509                         }
510                 }
511                 else
512                         X = xi[j];
513
514                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
515                         u64 M = (u64)(X>>(8*sizeof(long)-1));
516                         Z.hi ^= V.hi&M;
517                         Z.lo ^= V.lo&M;
518
519                         if (sizeof(size_t)==8) {
520                                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
521                                 V.lo  = (V.hi<<63)|(V.lo>>1);
522                                 V.hi  = (V.hi>>1 )^T;
523                         }
524                         else {
525                                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
526                                 V.lo  = (V.hi<<63)|(V.lo>>1);
527                                 V.hi  = (V.hi>>1 )^((u64)T<<32);
528                         }
529                                 
530                 }
531         }
532
533         if (is_endian.little) {
534 #ifdef BSWAP8
535                 Xi[0] = BSWAP8(Z.hi);
536                 Xi[1] = BSWAP8(Z.lo);
537 #else
538                 u8 *p = (u8 *)Xi;
539                 u32 v;
540                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
541                 v = (u32)(Z.hi);        PUTU32(p+4,v);
542                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
543                 v = (u32)(Z.lo);        PUTU32(p+12,v);
544 #endif
545         }
546         else {
547                 Xi[0] = Z.hi;
548                 Xi[1] = Z.lo;
549         }
550 }
551 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
552 #endif
553
/*
 * Per-key / per-message GCM state.
 */
typedef struct {
        /* Following 6 names follow names in GCM specification */
        /*
         * As used in this file:
         *   Yi   counter block handed to the block cipher
         *   EKi  block cipher output for Yi (current keystream block)
         *   EK0  block cipher output for the initial counter block
         *   Xi   running hash value
         *   H    hash key: block cipher output for the all-zero block,
         *        kept in host byte order
         *   len  byte counters: u[0] for AAD, u[1] for message data
         */
        union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
                                                Xi,H,
                                                len;
        /* Pre-computed table used by gcm_gmult_4bit */
        u128 Htable[16];
        /*
         * res: offset of the next unused byte in EKi (0 = none left);
         * ctr: host-order copy of the 32-bit counter in Yi.c[12..15]
         */
        unsigned int res, ctr;
        /* block cipher callback and the opaque key it is given */
        block128_f block;
        void *key;
} GCM128_CONTEXT;
565
566 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
567 {
568         const union { long one; char little; } is_endian = {1};
569
570         memset(ctx,0,sizeof(*ctx));
571         ctx->block = block;
572         ctx->key   = key;
573
574         (*block)(ctx->H.c,ctx->H.c,key);
575
576         if (is_endian.little) {
577                 /* H is stored in host byte order */
578 #ifdef BSWAP8
579                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
580                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
581 #else
582                 u8 *p = ctx->H.c;
583                 u64 hi,lo;
584                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
585                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
586                 ctx->H.u[0] = hi;
587                 ctx->H.u[1] = lo;
588 #endif
589         }
590
591         gcm_init_4bit(ctx->Htable,ctx->H.u);
592 }
593
594 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
595 {
596         const union { long one; char little; } is_endian = {1};
597
598         ctx->Yi.u[0]  = 0;
599         ctx->Yi.u[1]  = 0;
600         ctx->Xi.u[0]  = 0;
601         ctx->Xi.u[1]  = 0;
602         ctx->len.u[0] = 0;
603         ctx->len.u[1] = 0;
604         ctx->res = 0;
605
606         if (len==12) {
607                 memcpy(ctx->Yi.c,iv,12);
608                 ctx->Yi.c[15]=1;
609                 ctx->ctr=1;
610         }
611         else {
612                 size_t i;
613                 u64 len0 = len;
614
615                 while (len>=16) {
616                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
617                         GCM_MUL(ctx,Yi);
618                         iv += 16;
619                         len -= 16;
620                 }
621                 if (len) {
622                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
623                         GCM_MUL(ctx,Yi);
624                 }
625                 len0 <<= 3;
626                 if (is_endian.little) {
627 #ifdef BSWAP8
628                         ctx->Yi.u[1]  ^= BSWAP8(len0);
629 #else
630                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
631                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
632                         ctx->Yi.c[10] ^= (u8)(len0>>40);
633                         ctx->Yi.c[11] ^= (u8)(len0>>32);
634                         ctx->Yi.c[12] ^= (u8)(len0>>24);
635                         ctx->Yi.c[13] ^= (u8)(len0>>16);
636                         ctx->Yi.c[14] ^= (u8)(len0>>8);
637                         ctx->Yi.c[15] ^= (u8)(len0);
638 #endif
639                 }
640                 else
641                         ctx->Yi.u[1]  ^= len0;
642
643                 GCM_MUL(ctx,Yi);
644
645                 if (is_endian.little)
646                         ctx->ctr = GETU32(ctx->Yi.c+12);
647                 else
648                         ctx->ctr = ctx->Yi.d[3];
649         }
650
651         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
652         ++ctx->ctr;
653         if (is_endian.little)
654                 PUTU32(ctx->Yi.c+12,ctx->ctr);
655         else
656                 ctx->Yi.d[3] = ctx->ctr;
657 }
658
659 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
660 {
661         size_t i;
662
663         ctx->len.u[0] += len;
664
665 #ifdef GHASH
666         if ((i = (len&(size_t)-16))) {
667                 GHASH(aad,i,ctx);
668                 aad += i;
669                 len -= i;
670         }
671 #else
672         while (len>=16) {
673                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
674                 GCM_MUL(ctx,Xi);
675                 aad += 16;
676                 len -= 16;
677         }
678 #endif
679
680         if (len) {
681                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
682                 GCM_MUL(ctx,Xi);
683         }
684 }
685
/*
 * Encrypt len bytes from in to out and fold the produced ciphertext
 * into the running hash ctx->Xi.  May be called repeatedly on
 * consecutive pieces of one message.
 *
 *   ctx  context on which CRYPTO_gcm128_setiv has been called
 *   in   plaintext
 *   out  ciphertext buffer, at least len bytes
 *   len  number of bytes to process; added to the message byte counter
 *
 * Each keystream block is the block cipher output for the counter block
 * ctx->Yi, whose last four bytes hold a big-endian 32-bit counter that
 * is bumped after every block.  ctx->res remembers how far into the
 * current keystream block (ctx->EKi) the previous call got, so a call
 * may stop and resume in the middle of a block.
 */
void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                const unsigned char *in, unsigned char *out,
                size_t len)
{
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;

        ctx->len.u[1] += len;
        n   = ctx->res;
        ctr = ctx->ctr;

#if !defined(OPENSSL_SMALL_FOOTPRINT)
        /* fast path: works on size_t words; "break" drops to the byte loop below */
        if (16%sizeof(size_t) == 0) do {        /* always true actually */
                if (n) {
                        /* finish the keystream block left over from the previous call */
                        while (n && len) {
                                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                                --len;
                                n = (n+1)%16;
                        }
                        if (n==0) GCM_MUL(ctx,Xi);
                        else {
                                /* input ran out mid-block; ctr is unchanged, only res moves */
                                ctx->res = n;
                                return;
                        }
                }
#if defined(STRICT_ALIGNMENT)
                /* word accesses below need both buffers size_t-aligned */
                if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
                        break;
#endif
#ifdef GHASH
                /* bulk: encrypt GHASH_CHUNK bytes, then hash them in one call */
                while (len>=GHASH_CHUNK) {
                    size_t j=GHASH_CHUNK;

                    while (j) {
                        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16; i+=sizeof(size_t))
                                *(size_t *)(out+i) =
                                *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                        out += 16;
                        in  += 16;
                        j   -= 16;
                    }
                    /* out now points past the chunk: hash the ciphertext just written */
                    GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
                    len -= GHASH_CHUNK;
                }
                /* remaining whole blocks (fewer than GHASH_CHUNK bytes) */
                if ((i = (len&(size_t)-16))) {
                    /* j keeps the byte count; i is reused as the word index below */
                    size_t j=i;

                    while (len>=16) {
                        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16; i+=sizeof(size_t))
                                *(size_t *)(out+i) =
                                *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                        out += 16;
                        in  += 16;
                        len -= 16;
                    }
                    GHASH(out-j,j,ctx);
                }
#else
                /* no streamed GHASH: hash every block right after encrypting it */
                while (len>=16) {
                        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        for (i=0; i<16; i+=sizeof(size_t))
                                *(size_t *)(ctx->Xi.c+i) ^=
                                *(size_t *)(out+i) =
                                *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                        GCM_MUL(ctx,Xi);
                        out += 16;
                        in  += 16;
                        len -= 16;
                }
#endif
                if (len) {
                        /*
                         * Trailing partial block (n is 0 here): generate
                         * one more keystream block and use its first len
                         * bytes.  Xi is not multiplied yet; the next
                         * call's leading "if (n)" section does that once
                         * the block is complete.
                         */
                        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                        while (len--) {
                                ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                                ++n;
                        }
                }

                ctx->res = n;
                ctx->ctr = ctr;
                return;
        } while(0);
#endif
        /* byte-at-a-time path: small-footprint builds and unaligned buffers */
        for (i=0;i<len;++i) {
                if (n==0) {
                        /* start of a block: produce fresh keystream */
                        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                        ++ctr;
                        if (is_endian.little)
                                PUTU32(ctx->Yi.c+12,ctr);
                        else
                                ctx->Yi.d[3] = ctr;
                }
                ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
                n = (n+1)%16;
                if (n==0)
                        GCM_MUL(ctx,Xi);
        }

        ctx->res = n;
        ctx->ctr = ctr;
}
810
811 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
812                 const unsigned char *in, unsigned char *out,
813                 size_t len)
814 {
815         const union { long one; char little; } is_endian = {1};
816         unsigned int n, ctr;
817         size_t i;
818
819         ctx->len.u[1] += len;
820         n   = ctx->res;
821         ctr = ctx->ctr;
822
823 #if !defined(OPENSSL_SMALL_FOOTPRINT)
824         if (16%sizeof(size_t) == 0) do {        /* always true actually */
825                 if (n) {
826                         while (n && len) {
827                                 u8 c = *(in++);
828                                 *(out++) = c^ctx->EKi.c[n];
829                                 ctx->Xi.c[n] ^= c;
830                                 --len;
831                                 n = (n+1)%16;
832                         }
833                         if (n==0) GCM_MUL (ctx,Xi);
834                         else {
835                                 ctx->res = n;
836                                 return;
837                         }
838                 }
839 #if defined(STRICT_ALIGNMENT)
840                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
841                         break;
842 #endif
843 #ifdef GHASH
844                 while (len>=GHASH_CHUNK) {
845                     size_t j=GHASH_CHUNK;
846
847                     GHASH(in,GHASH_CHUNK,ctx);
848                     while (j) {
849                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
850                         ++ctr;
851                         if (is_endian.little)
852                                 PUTU32(ctx->Yi.c+12,ctr);
853                         else
854                                 ctx->Yi.d[3] = ctr;
855                         for (i=0; i<16; i+=sizeof(size_t))
856                                 *(size_t *)(out+i) =
857                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
858                         out += 16;
859                         in  += 16;
860                         j   -= 16;
861                     }
862                     len -= GHASH_CHUNK;
863                 }
864                 if ((i = (len&(size_t)-16))) {
865                     GHASH(in,i,ctx);
866                     while (len>=16) {
867                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
868                         ++ctr;
869                         if (is_endian.little)
870                                 PUTU32(ctx->Yi.c+12,ctr);
871                         else
872                                 ctx->Yi.d[3] = ctr;
873                         for (i=0; i<16; i+=sizeof(size_t))
874                                 *(size_t *)(out+i) =
875                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
876                         out += 16;
877                         in  += 16;
878                         len -= 16;
879                     }
880                 }
881 #else
882                 while (len>=16) {
883                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
884                         ++ctr;
885                         if (is_endian.little)
886                                 PUTU32(ctx->Yi.c+12,ctr);
887                         else
888                                 ctx->Yi.d[3] = ctr;
889                         for (i=0; i<16; i+=sizeof(size_t)) {
890                                 size_t c = *(size_t *)(in+i);
891                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
892                                 *(size_t *)(ctx->Xi.c+i) ^= c;
893                         }
894                         GCM_MUL(ctx,Xi);
895                         out += 16;
896                         in  += 16;
897                         len -= 16;
898                 }
899 #endif
900                 if (len) {
901                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
902                         ++ctr;
903                         if (is_endian.little)
904                                 PUTU32(ctx->Yi.c+12,ctr);
905                         else
906                                 ctx->Yi.d[3] = ctr;
907                         while (len--) {
908                                 u8 c = in[n];
909                                 ctx->Xi.c[n] ^= c;
910                                 out[n] = c^ctx->EKi.c[n];
911                                 ++n;
912                         }
913                 }
914
915                 ctx->res = n;
916                 ctx->ctr = ctr;
917                 return;
918         } while(0);
919 #endif
920         for (i=0;i<len;++i) {
921                 u8 c;
922                 if (n==0) {
923                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
924                         ++ctr;
925                         if (is_endian.little)
926                                 PUTU32(ctx->Yi.c+12,ctr);
927                         else
928                                 ctx->Yi.d[3] = ctr;
929                 }
930                 c = in[i];
931                 out[i] ^= ctx->EKi.c[n];
932                 ctx->Xi.c[n] ^= c;
933                 n = (n+1)%16;
934                 if (n==0)
935                         GCM_MUL(ctx,Xi);
936         }
937
938         ctx->res = n;
939         ctx->ctr = ctr;
940 }
941
942 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
943 {
944         const union { long one; char little; } is_endian = {1};
945         u64 alen = ctx->len.u[0]<<3;
946         u64 clen = ctx->len.u[1]<<3;
947
948         if (ctx->res)
949                 GCM_MUL(ctx,Xi);
950
951         if (is_endian.little) {
952 #ifdef BSWAP8
953                 alen = BSWAP8(alen);
954                 clen = BSWAP8(clen);
955 #else
956                 u8 *p = ctx->len.c;
957
958                 ctx->len.u[0] = alen;
959                 ctx->len.u[1] = clen;
960
961                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
962                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
963 #endif
964         }
965
966         ctx->Xi.u[0] ^= alen;
967         ctx->Xi.u[1] ^= clen;
968         GCM_MUL(ctx,Xi);
969
970         ctx->Xi.u[0] ^= ctx->EK0.u[0];
971         ctx->Xi.u[1] ^= ctx->EK0.u[1];
972 }
973
974 #if defined(SELFTEST)
975 #include <stdio.h>
976 #include <openssl/aes.h>
977
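/*
 * Test vectors.  For test case N, TEST_CASE(N) uses KN as the AES key,
 * IVN as the IV, AN as the additional authenticated data, PN as the
 * plaintext, CN as the expected ciphertext and TN as the expected 16-byte
 * tag.  A NULL pointer (AN, PN or CN) means that input is absent and the
 * corresponding call/comparison is skipped; static arrays declared without
 * an initialiser are all-zero.  A #define such as "K2 K1" reuses the data
 * of an earlier case.
 */
/* Test Case 1 */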
979 static const u8 K1[16],
980                 *P1=NULL,
981                 *A1=NULL,
982                 IV1[12],
983                 *C1=NULL,
984                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
985 /* Test Case 2 */
986 #define K2 K1
987 #define A2 A1
988 #define IV2 IV1
989 static const u8 P2[16],
990                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
991                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
992
993 /* Test Case 3 */
994 #define A3 A2
995 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
996                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
997                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
998                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
999                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1000                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1001                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1002                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1003                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1004                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1005                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4,};
1006
1007 /* Test Case 4 */
1008 #define K4 K3
1009 #define IV4 IV3
1010 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1011                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1012                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1013                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1014                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1015                         0xab,0xad,0xda,0xd2},
1016                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1017                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1018                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1019                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1020                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1021
1022 /* Test Case 5 */
1023 #define K5 K4
1024 #define P5 P4
1025 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1026                         0xab,0xad,0xda,0xd2},
1027                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1028                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1029                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1030                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1031                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1032                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1033 /* Test Case 6 */
1034 #define K6 K5
1035 #define P6 P5
1036 #define A6 A5
1037 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1038                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1039                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1040                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1041                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1042                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1043                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1044                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1045                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1046
1047 /* Test Case 7 */
1048 static const u8 K7[24],
1049                 *P7=NULL,
1050                 *A7=NULL,
1051                 IV7[12],
1052                 *C7=NULL,
1053                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1054
1055 /* Test Case 8 */
1056 #define K8 K7
1057 #define IV8 IV7
1058 #define A8 A7
1059 static const u8 P8[16],
1060                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1061                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1062
1063 /* Test Case 9 */
1064 #define A9 A8
1065 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1066                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1067                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1068                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1069                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1070                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1071                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1072                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1073                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1074                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1075                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1076                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1077
1078 /* Test Case 10 */
1079 #define K10 K9
1080 #define IV10 IV9
1081 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1082                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1083                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1084                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1085                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1086                         0xab,0xad,0xda,0xd2},
1087                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1088                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1089                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1090                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1091                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1092
1093 /* Test Case 11 */
1094 #define K11 K10
1095 #define P11 P10
1096 #define A11 A10
1097 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1098                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1099                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1100                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1101                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1102                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1103
1104 /* Test Case 12 */
1105 #define K12 K11
1106 #define P12 P11
1107 #define A12 A11
1108 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1109                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1110                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1111                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1112                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1113                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1114                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1115                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1116                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1117
1118 /* Test Case 13 */
1119 static const u8 K13[32],
1120                 *P13=NULL,
1121                 *A13=NULL,
1122                 IV13[12],
1123                 *C13=NULL,
1124                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1125
1126 /* Test Case 14 */
1127 #define K14 K13
1128 #define A14 A13
1129 static const u8 P14[16],
1130                 IV14[12],
1131                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1132                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1133
1134 /* Test Case 15 */
1135 #define A15 A14
1136 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1137                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1138                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1139                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1140                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1141                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1142                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1143                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1144                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1145                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1146                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1147                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1148
1149 /* Test Case 16 */
1150 #define K16 K15
1151 #define IV16 IV15
1152 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1153                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1154                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1155                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1156                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1157                         0xab,0xad,0xda,0xd2},
1158                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1159                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1160                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1161                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1162                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1163
1164 /* Test Case 17 */
1165 #define K17 K16
1166 #define P17 P16
1167 #define A17 A16
1168 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1169                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1170                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1171                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1172                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1173                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1174
1175 /* Test Case 18 */
1176 #define K18 K17
1177 #define P18 P17
1178 #define A18 A17
1179 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1180                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1181                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1182                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1183                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1184                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1185                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1186                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1187                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1188
/*
 * TEST_CASE(n) - run test vector number n through an encrypt pass and a
 * decrypt pass.  Expects |ctx|, |key| and |ret| to be in scope at the point
 * of use.  Each pass sets the IV, feeds An (if non-NULL) as AAD, processes
 * Pn resp. Cn (if non-NULL) into a local buffer, calls finish and compares
 * ctx.Xi against Tn and the buffer against the expected text.  Every
 * mismatch increments |ret| and prints a message.
 */
#define TEST_CASE(n)    do {                                            \
        u8 out[sizeof(P##n)];                                           \
                                                                        \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
                                                                        \
        /* encrypt pass */                                              \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n != NULL) {                                             \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (P##n != NULL) {                                             \
                CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));       \
        }                                                               \
        CRYPTO_gcm128_finish(&ctx);                                     \
        if (memcmp(ctx.Xi.c,T##n,16) != 0 ||                            \
            (C##n != NULL && memcmp(out,C##n,sizeof(out)) != 0)) {      \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n",n);                 \
        }                                                               \
                                                                        \
        /* decrypt pass */                                              \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n != NULL) {                                             \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (C##n != NULL) {                                             \
                CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));       \
        }                                                               \
        CRYPTO_gcm128_finish(&ctx);                                     \
        if (memcmp(ctx.Xi.c,T##n,16) != 0 ||                            \
            (P##n != NULL && memcmp(out,P##n,sizeof(out)) != 0)) {      \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n",n);                 \
        }                                                               \
        } while(0)
1206
1207 int main()
1208 {
1209         GCM128_CONTEXT ctx;
1210         AES_KEY key;
1211         int ret=0;
1212
1213         TEST_CASE(1);
1214         TEST_CASE(2);
1215         TEST_CASE(3);
1216         TEST_CASE(4);
1217         TEST_CASE(5);
1218         TEST_CASE(6);
1219         TEST_CASE(7);
1220         TEST_CASE(8);
1221         TEST_CASE(9);
1222         TEST_CASE(10);
1223         TEST_CASE(11);
1224         TEST_CASE(12);
1225         TEST_CASE(13);
1226         TEST_CASE(14);
1227         TEST_CASE(15);
1228         TEST_CASE(16);
1229         TEST_CASE(17);
1230         TEST_CASE(18);
1231
1232         {
1233         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1234         union { u64 u; u8 c[1024]; } buf;
1235         int i;
1236
1237         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1238         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1239         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1240
1241         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1242         start = OPENSSL_rdtsc();
1243         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1244         gcm_t = OPENSSL_rdtsc() - start;
1245
1246         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1247                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1248                         (block128_f)AES_encrypt);
1249         start = OPENSSL_rdtsc();
1250         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1251                 &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1252                 (block128_f)AES_encrypt);
1253         ctr_t = OPENSSL_rdtsc() - start;
1254
1255         printf("%.2f-%.2f=%.2f\n",
1256                         gcm_t/(double)sizeof(buf),
1257                         ctr_t/(double)sizeof(buf),
1258                         (gcm_t-ctr_t)/(double)sizeof(buf));
1259         }
1260
1261         return ret;
1262 }
1263 #endif