ghash-ia64.pl: 50% performance improvement of gcm_ghash_4bit.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include <openssl/crypto.h>
51 #include "modes_lcl.h"
52 #include <string.h>
53
54 #ifndef MODES_DEBUG
55 # ifndef NDEBUG
56 #  define NDEBUG
57 # endif
58 #endif
59 #include <assert.h>
60
61 typedef struct { u64 hi,lo; } u128;
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
71 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V)   do { \
73         if (sizeof(size_t)==8) { \
74                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75                 V.lo  = (V.hi<<63)|(V.lo>>1); \
76                 V.hi  = (V.hi>>1 )^T; \
77         } \
78         else { \
79                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80                 V.lo  = (V.hi<<63)|(V.lo>>1); \
81                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
82         } \
83 } while(0)
84
85 #ifdef  TABLE_BITS
86 #undef  TABLE_BITS
87 #endif
88 /*
89  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
90  * never be set to 8. 8 is effectively reserved for testing purposes.
91  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
92  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
93  * whole spectrum of possible table driven implementations. Why? In
94  * non-"Shoup's" case memory access pattern is segmented in such manner,
95  * that it's trivial to see that cache timing information can reveal
96  * fair portion of intermediate hash value. Given that ciphertext is
97  * always available to attacker, it's possible for him to attempt to
98  * deduce secret parameter H and if successful, tamper with messages
99  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
100  * not as trivial, but there is no reason to believe that it's resistant
101  * to cache-timing attack. And the thing about "8-bit" implementation is
102  * that it consumes 16 (sixteen) times more memory, 4KB per individual
103  * key + 1KB shared. Well, on pros side it should be twice as fast as
104  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
105  * was observed to run ~75% faster, closer to 100% for commercial
106  * compilers... Yet "4-bit" procedure is preferred, because it's
107  * believed to provide better security-performance balance and adequate
108  * all-round performance. "All-round" refers to things like:
109  *
110  * - shorter setup time effectively improves overall timing for
111  *   handling short messages;
112  * - larger table allocation can become unbearable because of VM
113  *   subsystem penalties (for example, on Windows a large enough free()
114  *   results in VM working set trimming, meaning that a subsequent
115  *   malloc() would immediately incur working set expansion);
116  * - larger table has larger cache footprint, which can affect
117  *   performance of other code paths (not necessarily even from same
118  *   thread in Hyper-Threading world);
119  */
120 #define TABLE_BITS 4
121
122 #if     TABLE_BITS==8
123
124 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
125 {
126         int  i, j;
127         u128 V;
128
129         Htable[0].hi = 0;
130         Htable[0].lo = 0;
131         V.hi = H[0];
132         V.lo = H[1];
133
134         for (Htable[128]=V, i=64; i>0; i>>=1) {
135                 REDUCE1BIT(V);
136                 Htable[i] = V;
137         }
138
139         for (i=2; i<256; i<<=1) {
140                 u128 *Hi = Htable+i, H0 = *Hi;
141                 for (j=1; j<i; ++j) {
142                         Hi[j].hi = H0.hi^Htable[j].hi;
143                         Hi[j].lo = H0.lo^Htable[j].lo;
144                 }
145         }
146 }
147
/*
 * Table-driven multiply that consumes Xi one byte at a time.
 *
 * Xi      16-byte value, read as bytes from offset 15 down to offset 0
 *         and overwritten with the result.
 * Htable  256-entry table prepared by gcm_init_8bit.
 *
 * For every byte the matching Htable entry is XORed into the
 * accumulator Z; between bytes Z is shifted right by 8 bits and the
 * byte that fell off the low end is folded back into the top 16 bits
 * through rem_8bit. The result is stored so that Xi holds it in
 * big-endian byte order regardless of host endianness.
 */
148 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
149 {
150         u128 Z = { 0, 0};
151         const u8 *xi = (const u8 *)Xi+15;
152         size_t rem, n = *xi;
153         const union { long one; char little; } is_endian = {1};
/* Correction terms, indexed by the byte shifted out of Z.lo; PACK()
 * places each 16-bit value in the top bits of a size_t. */
154         static const size_t rem_8bit[256] = {
155                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
156                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
157                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
158                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
159                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
160                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
161                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
162                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
163                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
164                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
165                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
166                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
167                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
168                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
169                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
170                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
171                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
172                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
173                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
174                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
175                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
176                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
177                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
178                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
179                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
180                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
181                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
182                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
183                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
184                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
185                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
186                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
187                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
188                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
189                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
190                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
191                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
192                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
193                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
194                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
195                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
196                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
197                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
198                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
199                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
200                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
201                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
202                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
203                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
204                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
205                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
206                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
207                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
208                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
209                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
210                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
211                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
212                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
213                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
214                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
215                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
216                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
217                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
218                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
219
220         while (1) {
221                 Z.hi ^= Htable[n].hi;
222                 Z.lo ^= Htable[n].lo;
223
/* xi has walked back to the first byte of Xi: all 16 bytes consumed */
224                 if ((u8 *)Xi==xi)       break;
225
226                 n = *(--xi);
227
/* shift Z right by one byte, remembering the byte that drops out */
228                 rem  = (size_t)Z.lo&0xff;
229                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
230                 Z.hi = (Z.hi>>8);
/* PACK() left the term in the top bits of a size_t; with a 32-bit
 * size_t it has to be moved up into the top of the 64-bit Z.hi */
231                 if (sizeof(size_t)==8)
232                         Z.hi ^= rem_8bit[rem];
233                 else
234                         Z.hi ^= (u64)rem_8bit[rem]<<32;
235         }
236
/* write Z back into Xi, byte-swapping on little-endian hosts */
237         if (is_endian.little) {
238 #ifdef BSWAP8
239                 Xi[0] = BSWAP8(Z.hi);
240                 Xi[1] = BSWAP8(Z.lo);
241 #else
242                 u8 *p = (u8 *)Xi;
243                 u32 v;
244                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
245                 v = (u32)(Z.hi);        PUTU32(p+4,v);
246                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
247                 v = (u32)(Z.lo);        PUTU32(p+12,v);
248 #endif
249         }
250         else {
251                 Xi[0] = Z.hi;
252                 Xi[1] = Z.lo;
253         }
254 }
255 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
256
257 #elif   TABLE_BITS==4
258
259 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
260 {
261         u128 V;
262 #if defined(OPENSSL_SMALL_FOOTPRINT)
263         int  i;
264 #endif
265
266         Htable[0].hi = 0;
267         Htable[0].lo = 0;
268         V.hi = H[0];
269         V.lo = H[1];
270
271 #if defined(OPENSSL_SMALL_FOOTPRINT)
272         for (Htable[8]=V, i=4; i>0; i>>=1) {
273                 REDUCE1BIT(V);
274                 Htable[i] = V;
275         }
276
277         for (i=2; i<16; i<<=1) {
278                 u128 *Hi = Htable+i;
279                 int   j;
280                 for (V=*Hi, j=1; j<i; ++j) {
281                         Hi[j].hi = V.hi^Htable[j].hi;
282                         Hi[j].lo = V.lo^Htable[j].lo;
283                 }
284         }
285 #else
286         Htable[8] = V;
287         REDUCE1BIT(V);
288         Htable[4] = V;
289         REDUCE1BIT(V);
290         Htable[2] = V;
291         REDUCE1BIT(V);
292         Htable[1] = V;
293         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
294         V=Htable[4];
295         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
296         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
297         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
298         V=Htable[8];
299         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
300         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
301         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
302         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
303         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
304         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
305         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
306 #endif
307 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
308         /*
309          * ARM assembler expects specific dword order in Htable.
310          */
311         {
312         int j;
313         const union { long one; char little; } is_endian = {1};
314
315         if (is_endian.little)
316                 for (j=0;j<16;++j) {
317                         V = Htable[j];
318                         Htable[j].hi = V.lo;
319                         Htable[j].lo = V.hi;
320                 }
321         else
322                 for (j=0;j<16;++j) {
323                         V = Htable[j];
324                         Htable[j].hi = V.lo<<32|V.lo>>32;
325                         Htable[j].lo = V.hi<<32|V.hi>>32;
326                 }
327         }
328 #endif
329 }
330
331 #ifndef GHASH_ASM
332 static const size_t rem_4bit[16] = {
333         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
334         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
335         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
336         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
337
338 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
339 {
340         u128 Z;
341         int cnt = 15;
342         size_t rem, nlo, nhi;
343         const union { long one; char little; } is_endian = {1};
344
345         nlo  = ((const u8 *)Xi)[15];
346         nhi  = nlo>>4;
347         nlo &= 0xf;
348
349         Z.hi = Htable[nlo].hi;
350         Z.lo = Htable[nlo].lo;
351
352         while (1) {
353                 rem  = (size_t)Z.lo&0xf;
354                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
355                 Z.hi = (Z.hi>>4);
356                 if (sizeof(size_t)==8)
357                         Z.hi ^= rem_4bit[rem];
358                 else
359                         Z.hi ^= (u64)rem_4bit[rem]<<32;
360
361                 Z.hi ^= Htable[nhi].hi;
362                 Z.lo ^= Htable[nhi].lo;
363
364                 if (--cnt<0)            break;
365
366                 nlo  = ((const u8 *)Xi)[cnt];
367                 nhi  = nlo>>4;
368                 nlo &= 0xf;
369
370                 rem  = (size_t)Z.lo&0xf;
371                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
372                 Z.hi = (Z.hi>>4);
373                 if (sizeof(size_t)==8)
374                         Z.hi ^= rem_4bit[rem];
375                 else
376                         Z.hi ^= (u64)rem_4bit[rem]<<32;
377
378                 Z.hi ^= Htable[nlo].hi;
379                 Z.lo ^= Htable[nlo].lo;
380         }
381
382         if (is_endian.little) {
383 #ifdef BSWAP8
384                 Xi[0] = BSWAP8(Z.hi);
385                 Xi[1] = BSWAP8(Z.lo);
386 #else
387                 u8 *p = (u8 *)Xi;
388                 u32 v;
389                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
390                 v = (u32)(Z.hi);        PUTU32(p+4,v);
391                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
392                 v = (u32)(Z.lo);        PUTU32(p+12,v);
393 #endif
394         }
395         else {
396                 Xi[0] = Z.hi;
397                 Xi[1] = Z.lo;
398         }
399 }
400
401 #if !defined(OPENSSL_SMALL_FOOTPRINT)
402 /*
403  * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
404  * details... Compiler-generated code doesn't seem to give any
405  * performance improvement, at least not on x86[_64]. It's here
406  * mostly as reference and a placeholder for possible future
407  * non-trivial optimization[s]...
408  */
/*
 * Xi      16-byte running value; updated in place after every block.
 * Htable  16-entry table prepared by gcm_init_4bit.
 * inp     input data, consumed 16 bytes per iteration.
 * len     number of input bytes. The loop is a do/while that subtracts
 *         16 until len reaches zero, so len must be a non-zero multiple
 *         of 16.
 *
 * Each iteration XORs one input block into Xi byte by byte and runs the
 * same nibble-wise table multiply as gcm_gmult_4bit on the result.
 */
409 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
410                                 const u8 *inp,size_t len)
411 {
412     u128 Z;
413     int cnt;
414     size_t rem, nlo, nhi;
415     const union { long one; char little; } is_endian = {1};
416
/* Active variant. Note that the "do {" opened in either preprocessor
 * branch is closed by the single "} while" after the common tail. */
417 #if 1
418     do {
419         cnt  = 15;
420         nlo  = ((const u8 *)Xi)[15];
421         nlo ^= inp[15];
422         nhi  = nlo>>4;
423         nlo &= 0xf;
424
425         Z.hi = Htable[nlo].hi;
426         Z.lo = Htable[nlo].lo;
427
428         while (1) {
/* shift Z right by 4 bits and fold the dropped nibble via rem_4bit */
429                 rem  = (size_t)Z.lo&0xf;
430                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
431                 Z.hi = (Z.hi>>4);
432                 if (sizeof(size_t)==8)
433                         Z.hi ^= rem_4bit[rem];
434                 else
435                         Z.hi ^= (u64)rem_4bit[rem]<<32;
436
437                 Z.hi ^= Htable[nhi].hi;
438                 Z.lo ^= Htable[nhi].lo;
439
/* high nibble of byte 0 was just handled: block is done */
440                 if (--cnt<0)            break;
441
442                 nlo  = ((const u8 *)Xi)[cnt];
443                 nlo ^= inp[cnt];
444                 nhi  = nlo>>4;
445                 nlo &= 0xf;
446
447                 rem  = (size_t)Z.lo&0xf;
448                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
449                 Z.hi = (Z.hi>>4);
450                 if (sizeof(size_t)==8)
451                         Z.hi ^= rem_4bit[rem];
452                 else
453                         Z.hi ^= (u64)rem_4bit[rem]<<32;
454
455                 Z.hi ^= Htable[nlo].hi;
456                 Z.lo ^= Htable[nlo].lo;
457         }
/* Alternative variant, compiled out by the "#if 1" above. */
458 #else
459     /*
460      * Extra 256+16 bytes per-key plus 512 bytes shared tables
461      * [should] give ~50% improvement... One could have PACK()-ed
462      * the rem_8bit even here, but the priority is to minimize
463      * cache footprint...
464      */ 
465     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
466     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
467     static const unsigned short rem_8bit[256] = {
468         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
500     /*
501      * This pre-processing phase slows down procedure by approximately
502      * same time as it makes each loop spin faster. In other words
503      * single block performance is approximately same as straightforward
504      * "4-bit" implementation, and then it goes only faster...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
/* bytes 15..1: one whole byte (both nibbles) per iteration */
515         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516                 nlo  = ((const u8 *)Xi)[cnt];
517                 nlo ^= inp[cnt];
518                 nhi  = nlo>>4;
519                 nlo &= 0xf;
520
521                 Z.hi ^= Htable[nlo].hi;
522                 Z.lo ^= Htable[nlo].lo;
523
524                 rem = (size_t)Z.lo&0xff;
525
526                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
527                 Z.hi = (Z.hi>>8);
528
529                 Z.hi ^= Hshr4[nhi].hi;
530                 Z.lo ^= Hshr4[nhi].lo;
531                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532         }
533
/* byte 0 is handled separately, with a final 4-bit shift */
534         nlo  = ((const u8 *)Xi)[0];
535         nlo ^= inp[0];
536         nhi  = nlo>>4;
537         nlo &= 0xf;
538
539         Z.hi ^= Htable[nlo].hi;
540         Z.lo ^= Htable[nlo].lo;
541
542         rem = (size_t)Z.lo&0xf;
543
544         Z.lo = (Z.hi<<60)|(Z.lo>>4);
545         Z.hi = (Z.hi>>4);
546
547         Z.hi ^= Htable[nhi].hi;
548         Z.lo ^= Htable[nhi].lo;
549         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
550 #endif
551
/* common tail: write Z back into Xi, byte-swapping on little-endian
 * hosts so that Xi holds the value in big-endian byte order */
552         if (is_endian.little) {
553 #ifdef BSWAP8
554                 Xi[0] = BSWAP8(Z.hi);
555                 Xi[1] = BSWAP8(Z.lo);
556 #else
557                 u8 *p = (u8 *)Xi;
558                 u32 v;
559                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
560                 v = (u32)(Z.hi);        PUTU32(p+4,v);
561                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
562                 v = (u32)(Z.lo);        PUTU32(p+12,v);
563 #endif
564         }
565         else {
566                 Xi[0] = Z.hi;
567                 Xi[1] = Z.lo;
568         }
/* advance to the next 16-byte block; stops once len reaches zero */
569     } while (inp+=16, len-=16);
570 }
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
581  * thrashing effect. In other words, the idea is to hash data while
582  * it's still in L1 cache after the encryption pass... */
583 #define GHASH_CHUNK       1024
584 #endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 struct gcm128_context {
649         /* Following 6 names follow names in GCM specification */
650         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
651                                                 Xi,H,len;
652         /* Pre-computed table used by gcm_gmult_* */
653 #if TABLE_BITS==8
654         u128 Htable[256];
655 #else
656         u128 Htable[16];
657         void (*gmult)(u64 Xi[2],const u128 Htable[16]);
658         void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
659 #endif
660         unsigned int res, pad;
661         block128_f block;
662         void *key;
663 };
664
/*
 * x86 / x86_64 builds with assembler support: declare the assembler
 * entry points and re-route GCM_MUL/GHASH through the per-context
 * function pointers (ctx->gmult / ctx->ghash), which
 * CRYPTO_gcm128_init fills in according to OPENSSL_ia32cap_P.
 */
665 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
666         (defined(__i386)        || defined(__i386__)    || \
667          defined(__x86_64)      || defined(__x86_64__)  || \
668          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
669 # define GHASH_ASM_IAX
670 extern unsigned int OPENSSL_ia32cap_P[2];
671
/* "clmul" variants, declared for both x86 and x86_64 */
672 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
673 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675
/* 32-bit x86 only: "mmx" and plain "x86" 4-bit variants */
676 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
677 #  define GHASH_ASM_X86
678 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
679 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
680
681 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
682 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
683 # endif
684
/* replace the direct-call definitions of GCM_MUL/GHASH made earlier */
685 # undef  GCM_MUL
686 # define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
687 # undef  GHASH
688 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
689 #endif
690
691 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
692 {
693         const union { long one; char little; } is_endian = {1};
694
695         memset(ctx,0,sizeof(*ctx));
696         ctx->block = block;
697         ctx->key   = key;
698
699         (*block)(ctx->H.c,ctx->H.c,key);
700
701         if (is_endian.little) {
702                 /* H is stored in host byte order */
703 #ifdef BSWAP8
704                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
705                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
706 #else
707                 u8 *p = ctx->H.c;
708                 u64 hi,lo;
709                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
710                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
711                 ctx->H.u[0] = hi;
712                 ctx->H.u[1] = lo;
713 #endif
714         }
715
716 #if     TABLE_BITS==8
717         gcm_init_8bit(ctx->Htable,ctx->H.u);
718 #elif   TABLE_BITS==4
719 # if    defined(GHASH_ASM_IAX)                  /* both x86 and x86_64 */
720         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
721                 gcm_init_clmul(ctx->Htable,ctx->H.u);
722                 ctx->gmult = gcm_gmult_clmul;
723                 ctx->ghash = gcm_ghash_clmul;
724                 return;
725         }
726         gcm_init_4bit(ctx->Htable,ctx->H.u);
727 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
728         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
729                 ctx->gmult = gcm_gmult_4bit_mmx;
730                 ctx->ghash = gcm_ghash_4bit_mmx;
731         } else {
732                 ctx->gmult = gcm_gmult_4bit_x86;
733                 ctx->ghash = gcm_ghash_4bit_x86;
734         }
735 #  else
736         ctx->gmult = gcm_gmult_4bit;
737         ctx->ghash = gcm_ghash_4bit;
738 #  endif
739 # else
740         gcm_init_4bit(ctx->Htable,ctx->H.u);
741 # endif
742 #endif
743 }
744
745 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
746 {
747         const union { long one; char little; } is_endian = {1};
748         unsigned int ctr;
749
750         ctx->Yi.u[0]  = 0;
751         ctx->Yi.u[1]  = 0;
752         ctx->Xi.u[0]  = 0;
753         ctx->Xi.u[1]  = 0;
754         ctx->len.u[0] = 0;
755         ctx->len.u[1] = 0;
756         ctx->res = 0;
757
758         if (len==12) {
759                 memcpy(ctx->Yi.c,iv,12);
760                 ctx->Yi.c[15]=1;
761                 ctr=1;
762         }
763         else {
764                 size_t i;
765                 u64 len0 = len;
766
767                 while (len>=16) {
768                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
769                         GCM_MUL(ctx,Yi);
770                         iv += 16;
771                         len -= 16;
772                 }
773                 if (len) {
774                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
775                         GCM_MUL(ctx,Yi);
776                 }
777                 len0 <<= 3;
778                 if (is_endian.little) {
779 #ifdef BSWAP8
780                         ctx->Yi.u[1]  ^= BSWAP8(len0);
781 #else
782                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
783                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
784                         ctx->Yi.c[10] ^= (u8)(len0>>40);
785                         ctx->Yi.c[11] ^= (u8)(len0>>32);
786                         ctx->Yi.c[12] ^= (u8)(len0>>24);
787                         ctx->Yi.c[13] ^= (u8)(len0>>16);
788                         ctx->Yi.c[14] ^= (u8)(len0>>8);
789                         ctx->Yi.c[15] ^= (u8)(len0);
790 #endif
791                 }
792                 else
793                         ctx->Yi.u[1]  ^= len0;
794
795                 GCM_MUL(ctx,Yi);
796
797                 if (is_endian.little)
798                         ctr = GETU32(ctx->Yi.c+12);
799                 else
800                         ctr = ctx->Yi.d[3];
801         }
802
803         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
804         ++ctr;
805         if (is_endian.little)
806                 PUTU32(ctx->Yi.c+12,ctr);
807         else
808                 ctx->Yi.d[3] = ctr;
809 }
810
811 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
812 {
813         size_t i;
814
815         ctx->len.u[0] += len;
816
817 #ifdef GHASH
818         if ((i = (len&(size_t)-16))) {
819                 GHASH(ctx,aad,i);
820                 aad += i;
821                 len -= i;
822         }
823 #else
824         while (len>=16) {
825                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
826                 GCM_MUL(ctx,Xi);
827                 aad += 16;
828                 len -= 16;
829         }
830 #endif
831         if (len) {
832                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
833                 GCM_MUL(ctx,Xi);
834         }
835 }
836
837 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
838                 const unsigned char *in, unsigned char *out,
839                 size_t len)
840 {
841         const union { long one; char little; } is_endian = {1};
842         unsigned int n, ctr;
843         size_t i;
844
845         ctx->len.u[1] += len;
846         n   = ctx->res;
847         if (is_endian.little)
848                 ctr = GETU32(ctx->Yi.c+12);
849         else
850                 ctr = ctx->Yi.d[3];
851
852 #if !defined(OPENSSL_SMALL_FOOTPRINT)
853         if (16%sizeof(size_t) == 0) do {        /* always true actually */
854                 if (n) {
855                         while (n && len) {
856                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
857                                 --len;
858                                 n = (n+1)%16;
859                         }
860                         if (n==0) GCM_MUL(ctx,Xi);
861                         else {
862                                 ctx->res = n;
863                                 return;
864                         }
865                 }
866 #if defined(STRICT_ALIGNMENT)
867                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
868                         break;
869 #endif
870 #if defined(GHASH) && defined(GHASH_CHUNK)
871                 while (len>=GHASH_CHUNK) {
872                     size_t j=GHASH_CHUNK;
873
874                     while (j) {
875                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
876                         ++ctr;
877                         if (is_endian.little)
878                                 PUTU32(ctx->Yi.c+12,ctr);
879                         else
880                                 ctx->Yi.d[3] = ctr;
881                         for (i=0; i<16; i+=sizeof(size_t))
882                                 *(size_t *)(out+i) =
883                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
884                         out += 16;
885                         in  += 16;
886                         j   -= 16;
887                     }
888                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
889                     len -= GHASH_CHUNK;
890                 }
891                 if ((i = (len&(size_t)-16))) {
892                     size_t j=i;
893
894                     while (len>=16) {
895                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
896                         ++ctr;
897                         if (is_endian.little)
898                                 PUTU32(ctx->Yi.c+12,ctr);
899                         else
900                                 ctx->Yi.d[3] = ctr;
901                         for (i=0; i<16; i+=sizeof(size_t))
902                                 *(size_t *)(out+i) =
903                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
904                         out += 16;
905                         in  += 16;
906                         len -= 16;
907                     }
908                     GHASH(ctx,out-j,j);
909                 }
910 #else
911                 while (len>=16) {
912                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
913                         ++ctr;
914                         if (is_endian.little)
915                                 PUTU32(ctx->Yi.c+12,ctr);
916                         else
917                                 ctx->Yi.d[3] = ctr;
918                         for (i=0; i<16; i+=sizeof(size_t))
919                                 *(size_t *)(ctx->Xi.c+i) ^=
920                                 *(size_t *)(out+i) =
921                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
922                         GCM_MUL(ctx,Xi);
923                         out += 16;
924                         in  += 16;
925                         len -= 16;
926                 }
927 #endif
928                 if (len) {
929                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
930                         ++ctr;
931                         if (is_endian.little)
932                                 PUTU32(ctx->Yi.c+12,ctr);
933                         else
934                                 ctx->Yi.d[3] = ctr;
935                         while (len--) {
936                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
937                                 ++n;
938                         }
939                 }
940
941                 ctx->res = n;
942                 return;
943         } while(0);
944 #endif
945         for (i=0;i<len;++i) {
946                 if (n==0) {
947                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
948                         ++ctr;
949                         if (is_endian.little)
950                                 PUTU32(ctx->Yi.c+12,ctr);
951                         else
952                                 ctx->Yi.d[3] = ctr;
953                 }
954                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
955                 n = (n+1)%16;
956                 if (n==0)
957                         GCM_MUL(ctx,Xi);
958         }
959
960         ctx->res = n;
961 }
962
963 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
964                 const unsigned char *in, unsigned char *out,
965                 size_t len)
966 {
967         const union { long one; char little; } is_endian = {1};
968         unsigned int n, ctr;
969         size_t i;
970
971         ctx->len.u[1] += len;
972         n   = ctx->res;
973         if (is_endian.little)
974                 ctr = GETU32(ctx->Yi.c+12);
975         else
976                 ctr = ctx->Yi.d[3];
977
978 #if !defined(OPENSSL_SMALL_FOOTPRINT)
979         if (16%sizeof(size_t) == 0) do {        /* always true actually */
980                 if (n) {
981                         while (n && len) {
982                                 u8 c = *(in++);
983                                 *(out++) = c^ctx->EKi.c[n];
984                                 ctx->Xi.c[n] ^= c;
985                                 --len;
986                                 n = (n+1)%16;
987                         }
988                         if (n==0) GCM_MUL (ctx,Xi);
989                         else {
990                                 ctx->res = n;
991                                 return;
992                         }
993                 }
994 #if defined(STRICT_ALIGNMENT)
995                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
996                         break;
997 #endif
998 #if defined(GHASH) && defined(GHASH_CHUNK)
999                 while (len>=GHASH_CHUNK) {
1000                     size_t j=GHASH_CHUNK;
1001
1002                     GHASH(ctx,in,GHASH_CHUNK);
1003                     while (j) {
1004                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1005                         ++ctr;
1006                         if (is_endian.little)
1007                                 PUTU32(ctx->Yi.c+12,ctr);
1008                         else
1009                                 ctx->Yi.d[3] = ctr;
1010                         for (i=0; i<16; i+=sizeof(size_t))
1011                                 *(size_t *)(out+i) =
1012                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1013                         out += 16;
1014                         in  += 16;
1015                         j   -= 16;
1016                     }
1017                     len -= GHASH_CHUNK;
1018                 }
1019                 if ((i = (len&(size_t)-16))) {
1020                     GHASH(ctx,in,i);
1021                     while (len>=16) {
1022                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1023                         ++ctr;
1024                         if (is_endian.little)
1025                                 PUTU32(ctx->Yi.c+12,ctr);
1026                         else
1027                                 ctx->Yi.d[3] = ctr;
1028                         for (i=0; i<16; i+=sizeof(size_t))
1029                                 *(size_t *)(out+i) =
1030                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1031                         out += 16;
1032                         in  += 16;
1033                         len -= 16;
1034                     }
1035                 }
1036 #else
1037                 while (len>=16) {
1038                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1039                         ++ctr;
1040                         if (is_endian.little)
1041                                 PUTU32(ctx->Yi.c+12,ctr);
1042                         else
1043                                 ctx->Yi.d[3] = ctr;
1044                         for (i=0; i<16; i+=sizeof(size_t)) {
1045                                 size_t c = *(size_t *)(in+i);
1046                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1047                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1048                         }
1049                         GCM_MUL(ctx,Xi);
1050                         out += 16;
1051                         in  += 16;
1052                         len -= 16;
1053                 }
1054 #endif
1055                 if (len) {
1056                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1057                         ++ctr;
1058                         if (is_endian.little)
1059                                 PUTU32(ctx->Yi.c+12,ctr);
1060                         else
1061                                 ctx->Yi.d[3] = ctr;
1062                         while (len--) {
1063                                 u8 c = in[n];
1064                                 ctx->Xi.c[n] ^= c;
1065                                 out[n] = c^ctx->EKi.c[n];
1066                                 ++n;
1067                         }
1068                 }
1069
1070                 ctx->res = n;
1071                 return;
1072         } while(0);
1073 #endif
1074         for (i=0;i<len;++i) {
1075                 u8 c;
1076                 if (n==0) {
1077                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1078                         ++ctr;
1079                         if (is_endian.little)
1080                                 PUTU32(ctx->Yi.c+12,ctr);
1081                         else
1082                                 ctx->Yi.d[3] = ctr;
1083                 }
1084                 c = in[i];
1085                 out[i] ^= ctx->EKi.c[n];
1086                 ctx->Xi.c[n] ^= c;
1087                 n = (n+1)%16;
1088                 if (n==0)
1089                         GCM_MUL(ctx,Xi);
1090         }
1091
1092         ctx->res = n;
1093 }
1094
1095 void CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1096                 const unsigned char *in, unsigned char *out,
1097                 size_t len, ctr128_f stream)
1098 {
1099         const union { long one; char little; } is_endian = {1};
1100         unsigned int n, ctr;
1101         size_t i;
1102
1103         ctx->len.u[1] += len;
1104         n   = ctx->res;
1105         if (is_endian.little)
1106                 ctr = GETU32(ctx->Yi.c+12);
1107         else
1108                 ctr = ctx->Yi.d[3];
1109
1110         if (n) {
1111                 while (n && len) {
1112                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1113                         --len;
1114                         n = (n+1)%16;
1115                 }
1116                 if (n==0) GCM_MUL(ctx,Xi);
1117                 else {
1118                         ctx->res = n;
1119                         return;
1120                 }
1121         }
1122 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1123         while (len>=GHASH_CHUNK) {
1124                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1125                 ctr += GHASH_CHUNK/16;
1126                 if (is_endian.little)
1127                         PUTU32(ctx->Yi.c+12,ctr);
1128                 else
1129                         ctx->Yi.d[3] = ctr;
1130                 GHASH(ctx,out,GHASH_CHUNK);
1131                 out += GHASH_CHUNK;
1132                 in  += GHASH_CHUNK;
1133                 len -= GHASH_CHUNK;
1134         }
1135 #endif
1136         if ((i = (len&(size_t)-16))) {
1137                 size_t j=i/16;
1138
1139                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1140                 ctr += j;
1141                 if (is_endian.little)
1142                         PUTU32(ctx->Yi.c+12,ctr);
1143                 else
1144                         ctx->Yi.d[3] = ctr;
1145                 in  += i;
1146                 len -= i;
1147 #if defined(GHASH)
1148                 GHASH(ctx,out,i);
1149                 out += i;
1150 #else
1151                 while (j--) {
1152                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1153                         GCM_MUL(ctx,Xi);
1154                         out += 16;
1155                 }
1156 #endif
1157         }
1158         if (len) {
1159                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1160                 ++ctr;
1161                 if (is_endian.little)
1162                         PUTU32(ctx->Yi.c+12,ctr);
1163                 else
1164                         ctx->Yi.d[3] = ctr;
1165                 while (len--) {
1166                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1167                         ++n;
1168                 }
1169         }
1170
1171         ctx->res = n;
1172 }
1173
1174 void CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1175                 const unsigned char *in, unsigned char *out,
1176                 size_t len,ctr128_f stream)
1177 {
1178         const union { long one; char little; } is_endian = {1};
1179         unsigned int n, ctr;
1180         size_t i;
1181
1182         ctx->len.u[1] += len;
1183         n   = ctx->res;
1184         if (is_endian.little)
1185                 ctr = GETU32(ctx->Yi.c+12);
1186         else
1187                 ctr = ctx->Yi.d[3];
1188
1189         if (n) {
1190                 while (n && len) {
1191                         u8 c = *(in++);
1192                         *(out++) = c^ctx->EKi.c[n];
1193                         ctx->Xi.c[n] ^= c;
1194                         --len;
1195                         n = (n+1)%16;
1196                 }
1197                 if (n==0) GCM_MUL (ctx,Xi);
1198                 else {
1199                         ctx->res = n;
1200                         return;
1201                 }
1202         }
1203 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1204         while (len>=GHASH_CHUNK) {
1205                 GHASH(ctx,in,GHASH_CHUNK);
1206                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1207                 ctr += GHASH_CHUNK/16;
1208                 if (is_endian.little)
1209                         PUTU32(ctx->Yi.c+12,ctr);
1210                 else
1211                         ctx->Yi.d[3] = ctr;
1212                 out += GHASH_CHUNK;
1213                 in  += GHASH_CHUNK;
1214                 len -= GHASH_CHUNK;
1215         }
1216 #endif
1217         if ((i = (len&(size_t)-16))) {
1218                 size_t j=i/16;
1219
1220 #if defined(GHASH)
1221                 GHASH(ctx,in,i);
1222 #else
1223                 while (j--) {
1224                         size_t k;
1225                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1226                         GCM_MUL(ctx,Xi);
1227                         in += 16;
1228                 }
1229                 j   = i/16;
1230                 in -= i;
1231 #endif
1232                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1233                 ctr += j;
1234                 if (is_endian.little)
1235                         PUTU32(ctx->Yi.c+12,ctr);
1236                 else
1237                         ctx->Yi.d[3] = ctr;
1238                 out += i;
1239                 in  += i;
1240                 len -= i;
1241         }
1242         if (len) {
1243                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1244                 ++ctr;
1245                 if (is_endian.little)
1246                         PUTU32(ctx->Yi.c+12,ctr);
1247                 else
1248                         ctx->Yi.d[3] = ctr;
1249                 while (len--) {
1250                         u8 c = in[n];
1251                         ctx->Xi.c[n] ^= c;
1252                         out[n] = c^ctx->EKi.c[n];
1253                         ++n;
1254                 }
1255         }
1256
1257         ctx->res = n;
1258 }
1259
1260 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1261                         size_t len)
1262 {
1263         const union { long one; char little; } is_endian = {1};
1264         u64 alen = ctx->len.u[0]<<3;
1265         u64 clen = ctx->len.u[1]<<3;
1266
1267         if (ctx->res)
1268                 GCM_MUL(ctx,Xi);
1269
1270         if (is_endian.little) {
1271 #ifdef BSWAP8
1272                 alen = BSWAP8(alen);
1273                 clen = BSWAP8(clen);
1274 #else
1275                 u8 *p = ctx->len.c;
1276
1277                 ctx->len.u[0] = alen;
1278                 ctx->len.u[1] = clen;
1279
1280                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1281                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1282 #endif
1283         }
1284
1285         ctx->Xi.u[0] ^= alen;
1286         ctx->Xi.u[1] ^= clen;
1287         GCM_MUL(ctx,Xi);
1288
1289         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1290         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1291
1292         if (tag && len<=sizeof(ctx->Xi))
1293                 return memcmp(ctx->Xi.c,tag,len);
1294         else
1295                 return -1;
1296 }
1297
1298 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1299 {
1300         GCM128_CONTEXT *ret;
1301
1302         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1303                 CRYPTO_gcm128_init(ret,key,block);
1304
1305         return ret;
1306 }
1307
1308 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1309 {
1310         if (ctx) {
1311                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1312                 OPENSSL_free(ctx);
1313         }
1314 }
1315
1316 #if defined(SELFTEST)
1317 #include <stdio.h>
1318 #include <openssl/aes.h>
1319
1320 /* Test Case 1 */
1321 static const u8 K1[16],
1322                 *P1=NULL,
1323                 *A1=NULL,
1324                 IV1[12],
1325                 *C1=NULL,
1326                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1327
1328 /* Test Case 2 */
1329 #define K2 K1
1330 #define A2 A1
1331 #define IV2 IV1
1332 static const u8 P2[16],
1333                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1334                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1335
1336 /* Test Case 3 */
1337 #define A3 A2
1338 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1339                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1340                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1341                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1342                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1343                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1344                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1345                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1346                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1347                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1348                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1349
1350 /* Test Case 4 */
1351 #define K4 K3
1352 #define IV4 IV3
1353 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1354                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1355                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1356                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1357                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1358                         0xab,0xad,0xda,0xd2},
1359                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1360                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1361                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1362                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1363                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1364
1365 /* Test Case 5 */
1366 #define K5 K4
1367 #define P5 P4
1368 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1369                         0xab,0xad,0xda,0xd2},
1370                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1371                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1372                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1373                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1374                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1375                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1376
1377 /* Test Case 6 */
1378 #define K6 K5
1379 #define P6 P5
1380 #define A6 A5
1381 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1382                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1383                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1384                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1385                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1386                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1387                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1388                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1389                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1390
1391 /* Test Case 7 */
1392 static const u8 K7[24],
1393                 *P7=NULL,
1394                 *A7=NULL,
1395                 IV7[12],
1396                 *C7=NULL,
1397                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1398
1399 /* Test Case 8 */
1400 #define K8 K7
1401 #define IV8 IV7
1402 #define A8 A7
1403 static const u8 P8[16],
1404                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1405                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1406
1407 /* Test Case 9 */
1408 #define A9 A8
1409 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1410                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1411                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1412                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1413                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1414                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1415                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1416                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1417                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1418                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1419                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1420                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1421
1422 /* Test Case 10 */
1423 #define K10 K9
1424 #define IV10 IV9
1425 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1426                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1427                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1428                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1429                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1430                         0xab,0xad,0xda,0xd2},
1431                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1432                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1433                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1434                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1435                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1436
1437 /* Test Case 11 */
1438 #define K11 K10
1439 #define P11 P10
1440 #define A11 A10
1441 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1442                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1443                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1444                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1445                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1446                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1447
1448 /* Test Case 12 */
1449 #define K12 K11
1450 #define P12 P11
1451 #define A12 A11
1452 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1453                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1454                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1455                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1456                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1457                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1458                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1459                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1460                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1461
1462 /* Test Case 13 */
1463 static const u8 K13[32],
1464                 *P13=NULL,
1465                 *A13=NULL,
1466                 IV13[12],
1467                 *C13=NULL,
1468                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1469
1470 /* Test Case 14 */
1471 #define K14 K13
1472 #define A14 A13
1473 static const u8 P14[16],
1474                 IV14[12],
1475                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1476                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1477
1478 /* Test Case 15 */
1479 #define A15 A14
1480 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1481                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1482                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1483                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1484                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1485                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1486                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1487                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1488                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1489                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1490                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1491                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1492
1493 /* Test Case 16 */
1494 #define K16 K15
1495 #define IV16 IV15
1496 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1497                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1498                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1499                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1500                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1501                         0xab,0xad,0xda,0xd2},
1502                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1503                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1504                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1505                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1506                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1507
1508 /* Test Case 17 */
1509 #define K17 K16
1510 #define P17 P16
1511 #define A17 A16
1512 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1513                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1514                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1515                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1516                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1517                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1518
1519 /* Test Case 18 */
1520 #define K18 K17
1521 #define P18 P17
1522 #define A18 A17
1523 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1524                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1525                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1526                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1527                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1528                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1529                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1530                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1531                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1532
/*
 * TEST_CASE(n): run test vector n (K<n>, IV<n>, A<n>, P<n>, C<n>, T<n>)
 * through an encrypt pass and then a decrypt pass.
 *
 * Expects `ctx` (GCM128_CONTEXT), `key` (AES_KEY) and `ret` (int) to be in
 * scope at the expansion site.  The AAD, encrypt and decrypt steps are
 * skipped when the corresponding A/P/C symbol evaluates to false (e.g. a
 * NULL pointer).  Each pass bumps `ret` and prints a message when
 * CRYPTO_gcm128_finish() returns non-zero for the 16-byte tag T<n>, or when
 * the produced bytes differ from the expected ones.
 */
#define TEST_CASE(n)    do {                                            \
        u8 scratch[sizeof(P##n)];                                       \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (P##n) {                                                     \
                CRYPTO_gcm128_encrypt(&ctx,P##n,scratch,sizeof(scratch)); \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (C##n && memcmp(scratch,C##n,sizeof(scratch)))) {           \
                ret++;                                                  \
                printf ("encrypt test#%d failed.\n",n);                 \
        }                                                               \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
        if (A##n) {                                                     \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));              \
        }                                                               \
        if (C##n) {                                                     \
                CRYPTO_gcm128_decrypt(&ctx,C##n,scratch,sizeof(scratch)); \
        }                                                               \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
            (P##n && memcmp(scratch,P##n,sizeof(scratch)))) {           \
                ret++;                                                  \
                printf ("decrypt test#%d failed.\n",n);                 \
        }                                                               \
        } while(0)
1550
1551 int main()
1552 {
1553         GCM128_CONTEXT ctx;
1554         AES_KEY key;
1555         int ret=0;
1556
1557         TEST_CASE(1);
1558         TEST_CASE(2);
1559         TEST_CASE(3);
1560         TEST_CASE(4);
1561         TEST_CASE(5);
1562         TEST_CASE(6);
1563         TEST_CASE(7);
1564         TEST_CASE(8);
1565         TEST_CASE(9);
1566         TEST_CASE(10);
1567         TEST_CASE(11);
1568         TEST_CASE(12);
1569         TEST_CASE(13);
1570         TEST_CASE(14);
1571         TEST_CASE(15);
1572         TEST_CASE(16);
1573         TEST_CASE(17);
1574         TEST_CASE(18);
1575
1576 #ifdef OPENSSL_CPUID_OBJ
1577         {
1578         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1579         union { u64 u; u8 c[1024]; } buf;
1580         int i;
1581
1582         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1583         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1584         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1585
1586         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1587         start = OPENSSL_rdtsc();
1588         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1589         gcm_t = OPENSSL_rdtsc() - start;
1590
1591         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1592                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1593                         (block128_f)AES_encrypt);
1594         start = OPENSSL_rdtsc();
1595         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1596                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1597                         (block128_f)AES_encrypt);
1598         ctr_t = OPENSSL_rdtsc() - start;
1599
1600         printf("%.2f-%.2f=%.2f\n",
1601                         gcm_t/(double)sizeof(buf),
1602                         ctr_t/(double)sizeof(buf),
1603                         (gcm_t-ctr_t)/(double)sizeof(buf));
1604 #ifdef GHASH
1605         GHASH(&ctx,buf.c,sizeof(buf));
1606         start = OPENSSL_rdtsc();
1607         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1608         gcm_t = OPENSSL_rdtsc() - start;
1609         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1610 #endif
1611         }
1612 #endif
1613
1614         return ret;
1615 }
1616 #endif