gcm128.c: P.-M. Hager has tipped about possibility to fold reductions
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes_lcl.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
60 typedef struct { u64 hi,lo; } u128;
61
62 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
63 /* redefine, because alignment is ensured */
64 #undef  GETU32
65 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
66 #undef  PUTU32
67 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
68 #endif
69
70 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
71 #define REDUCE1BIT(V)   do { \
72         if (sizeof(size_t)==8) { \
73                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
74                 V.lo  = (V.hi<<63)|(V.lo>>1); \
75                 V.hi  = (V.hi>>1 )^T; \
76         } \
77         else { \
78                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
79                 V.lo  = (V.hi<<63)|(V.lo>>1); \
80                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
81         } \
82 } while(0)
83
84 #ifdef  TABLE_BITS
85 #undef  TABLE_BITS
86 #endif
87 /*
88  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
89  * never be set to 8. 8 is effectively reserved for testing purposes.
90  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
91  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
92  * whole spectrum of possible table driven implementations. Why? In
93  * non-"Shoup's" case memory access pattern is segmented in such manner,
94  * that it's trivial to see that cache timing information can reveal
95  * fair portion of intermediate hash value. Given that ciphertext is
96  * always available to attacker, it's possible for him to attempt to
97  * deduce secret parameter H and if successful, tamper with messages
98  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
99  * not as trivial, but there is no reason to believe that it's resistant
100  * to cache-timing attack. And the thing about "8-bit" implementation is
101  * that it consumes 16 (sixteen) times more memory, 4KB per individual
102  * key + 1KB shared. Well, on pros side it should be twice as fast as
103  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
104  * was observed to run ~75% faster, closer to 100% for commercial
105  * compilers... Yet "4-bit" procedure is preferred, because it's
106  * believed to provide better security-performance balance and adequate
107  * all-round performance. "All-round" refers to things like:
108  *
109  * - shorter setup time effectively improves overall timing for
110  *   handling short messages;
111  * - larger table allocation can become unbearable because of VM
112  *   subsystem penalties (for example on Windows large enough free
113  *   results in VM working set trimming, meaning that consequent
114  *   malloc would immediately incur working set expansion);
115  * - larger table has larger cache footprint, which can affect
116  *   performance of other code paths (not necessarily even from same
117  *   thread in Hyper-Threading world);
118  */
119 #define TABLE_BITS 4
120
121 #if     TABLE_BITS==8
122
123 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
124 {
125         int  i, j;
126         u128 V;
127
128         Htable[0].hi = 0;
129         Htable[0].lo = 0;
130         V.hi = H[0];
131         V.lo = H[1];
132
133         for (Htable[128]=V, i=64; i>0; i>>=1) {
134                 REDUCE1BIT(V);
135                 Htable[i] = V;
136         }
137
138         for (i=2; i<256; i<<=1) {
139                 u128 *Hi = Htable+i, H0 = *Hi;
140                 for (j=1; j<i; ++j) {
141                         Hi[j].hi = H0.hi^Htable[j].hi;
142                         Hi[j].lo = H0.lo^Htable[j].lo;
143                 }
144         }
145 }
146
147 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
148 {
149         u128 Z = { 0, 0};
150         const u8 *xi = (const u8 *)Xi+15;
151         size_t rem, n = *xi;
152         const union { long one; char little; } is_endian = {1};
153         static const size_t rem_8bit[256] = {
154                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
155                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
156                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
157                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
158                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
159                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
160                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
161                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
162                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
163                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
164                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
165                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
166                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
167                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
168                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
169                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
170                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
171                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
172                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
173                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
174                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
175                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
176                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
177                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
178                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
179                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
180                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
181                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
182                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
183                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
184                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
185                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
186                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
187                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
188                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
189                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
190                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
191                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
192                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
193                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
194                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
195                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
196                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
197                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
198                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
199                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
200                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
201                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
202                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
203                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
204                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
205                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
206                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
207                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
208                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
209                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
210                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
211                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
212                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
213                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
214                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
215                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
216                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
217                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
218
219         while (1) {
220                 Z.hi ^= Htable[n].hi;
221                 Z.lo ^= Htable[n].lo;
222
223                 if ((u8 *)Xi==xi)       break;
224
225                 n = *(--xi);
226
227                 rem  = (size_t)Z.lo&0xff;
228                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
229                 Z.hi = (Z.hi>>8);
230                 if (sizeof(size_t)==8)
231                         Z.hi ^= rem_8bit[rem];
232                 else
233                         Z.hi ^= (u64)rem_8bit[rem]<<32;
234         }
235
236         if (is_endian.little) {
237 #ifdef BSWAP8
238                 Xi[0] = BSWAP8(Z.hi);
239                 Xi[1] = BSWAP8(Z.lo);
240 #else
241                 u8 *p = (u8 *)Xi;
242                 u32 v;
243                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
244                 v = (u32)(Z.hi);        PUTU32(p+4,v);
245                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
246                 v = (u32)(Z.lo);        PUTU32(p+12,v);
247 #endif
248         }
249         else {
250                 Xi[0] = Z.hi;
251                 Xi[1] = Z.lo;
252         }
253 }
254 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
255
256 #elif   TABLE_BITS==4
257
258 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
259 {
260         u128 V;
261 #if defined(OPENSSL_SMALL_FOOTPRINT)
262         int  i;
263 #endif
264
265         Htable[0].hi = 0;
266         Htable[0].lo = 0;
267         V.hi = H[0];
268         V.lo = H[1];
269
270 #if defined(OPENSSL_SMALL_FOOTPRINT)
271         for (Htable[8]=V, i=4; i>0; i>>=1) {
272                 REDUCE1BIT(V);
273                 Htable[i] = V;
274         }
275
276         for (i=2; i<16; i<<=1) {
277                 u128 *Hi = Htable+i;
278                 int   j;
279                 for (V=*Hi, j=1; j<i; ++j) {
280                         Hi[j].hi = V.hi^Htable[j].hi;
281                         Hi[j].lo = V.lo^Htable[j].lo;
282                 }
283         }
284 #else
285         Htable[8] = V;
286         REDUCE1BIT(V);
287         Htable[4] = V;
288         REDUCE1BIT(V);
289         Htable[2] = V;
290         REDUCE1BIT(V);
291         Htable[1] = V;
292         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
293         V=Htable[4];
294         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
295         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
296         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
297         V=Htable[8];
298         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
299         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
300         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
301         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
302         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
303         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
304         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
305 #endif
306 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
307         /*
308          * ARM assembler expects specific dword order in Htable.
309          */
310         {
311         int j;
312         const union { long one; char little; } is_endian = {1};
313
314         if (is_endian.little)
315                 for (j=0;j<16;++j) {
316                         V = Htable[j];
317                         Htable[j].hi = V.lo;
318                         Htable[j].lo = V.hi;
319                 }
320         else
321                 for (j=0;j<16;++j) {
322                         V = Htable[j];
323                         Htable[j].hi = V.lo<<32|V.lo>>32;
324                         Htable[j].lo = V.hi<<32|V.hi>>32;
325                 }
326         }
327 #endif
328 }
329
330 #ifndef GHASH_ASM
331 static const size_t rem_4bit[16] = {
332         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
333         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
334         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
335         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
338 {
339         u128 Z;
340         int cnt = 15;
341         size_t rem, nlo, nhi;
342         const union { long one; char little; } is_endian = {1};
343
344         nlo  = ((const u8 *)Xi)[15];
345         nhi  = nlo>>4;
346         nlo &= 0xf;
347
348         Z.hi = Htable[nlo].hi;
349         Z.lo = Htable[nlo].lo;
350
351         while (1) {
352                 rem  = (size_t)Z.lo&0xf;
353                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
354                 Z.hi = (Z.hi>>4);
355                 if (sizeof(size_t)==8)
356                         Z.hi ^= rem_4bit[rem];
357                 else
358                         Z.hi ^= (u64)rem_4bit[rem]<<32;
359
360                 Z.hi ^= Htable[nhi].hi;
361                 Z.lo ^= Htable[nhi].lo;
362
363                 if (--cnt<0)            break;
364
365                 nlo  = ((const u8 *)Xi)[cnt];
366                 nhi  = nlo>>4;
367                 nlo &= 0xf;
368
369                 rem  = (size_t)Z.lo&0xf;
370                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
371                 Z.hi = (Z.hi>>4);
372                 if (sizeof(size_t)==8)
373                         Z.hi ^= rem_4bit[rem];
374                 else
375                         Z.hi ^= (u64)rem_4bit[rem]<<32;
376
377                 Z.hi ^= Htable[nlo].hi;
378                 Z.lo ^= Htable[nlo].lo;
379         }
380
381         if (is_endian.little) {
382 #ifdef BSWAP8
383                 Xi[0] = BSWAP8(Z.hi);
384                 Xi[1] = BSWAP8(Z.lo);
385 #else
386                 u8 *p = (u8 *)Xi;
387                 u32 v;
388                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
389                 v = (u32)(Z.hi);        PUTU32(p+4,v);
390                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
391                 v = (u32)(Z.lo);        PUTU32(p+12,v);
392 #endif
393         }
394         else {
395                 Xi[0] = Z.hi;
396                 Xi[1] = Z.lo;
397         }
398 }
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
408 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
409                                 const u8 *inp,size_t len)
410 {
411     u128 Z;
412     int cnt;
413     size_t rem, nlo, nhi;
414     const union { long one; char little; } is_endian = {1};
415
416 #if 1
417     do {
418         cnt  = 15;
419         nlo  = ((const u8 *)Xi)[15];
420         nlo ^= inp[15];
421         nhi  = nlo>>4;
422         nlo &= 0xf;
423
424         Z.hi = Htable[nlo].hi;
425         Z.lo = Htable[nlo].lo;
426
427         while (1) {
428                 rem  = (size_t)Z.lo&0xf;
429                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
430                 Z.hi = (Z.hi>>4);
431                 if (sizeof(size_t)==8)
432                         Z.hi ^= rem_4bit[rem];
433                 else
434                         Z.hi ^= (u64)rem_4bit[rem]<<32;
435
436                 Z.hi ^= Htable[nhi].hi;
437                 Z.lo ^= Htable[nhi].lo;
438
439                 if (--cnt<0)            break;
440
441                 nlo  = ((const u8 *)Xi)[cnt];
442                 nlo ^= inp[cnt];
443                 nhi  = nlo>>4;
444                 nlo &= 0xf;
445
446                 rem  = (size_t)Z.lo&0xf;
447                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
448                 Z.hi = (Z.hi>>4);
449                 if (sizeof(size_t)==8)
450                         Z.hi ^= rem_4bit[rem];
451                 else
452                         Z.hi ^= (u64)rem_4bit[rem]<<32;
453
454                 Z.hi ^= Htable[nlo].hi;
455                 Z.lo ^= Htable[nlo].lo;
456         }
457 #else
458     /*
459      * Extra 256+16 bytes per-key plus 512 bytes shared tables
460      * [should] give ~50% improvement... One could have PACK()-ed
461      * the rem_8bit even here, but priority is to minimize memory
462      * usage...
463      */ 
464     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
465     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
466     static const unsigned short rem_8bit[256] = {
467         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
468         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
469         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
470         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
471         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
472         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
473         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
474         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
475         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
476         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
477         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
478         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
479         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
480         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
481         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
482         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
483         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
484         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
485         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
486         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
487         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
488         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
489         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
490         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
491         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
492         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
493         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
494         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
495         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
496         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
497         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
498         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
499
500     /*
501      * This pre-processing phase slows down procedure by approximately
502      * same time as it makes each loop spin faster. In other words
503      * single block performance is approximately same as straightforward
504      * "4-bit" implementation, and then it goes only faster...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
515         nlo  = ((const u8 *)Xi)[15];
516         nlo ^= inp[15];
517         nhi  = nlo>>4;
518         nlo &= 0xf;
519
520         Z.hi = Htable[nlo].hi;
521         Z.lo = Htable[nlo].lo;
522
523         rem = (size_t)Z.lo&0xff;
524
525         Z.lo = (Z.hi<<56)|(Z.lo>>8);
526         Z.hi = (Z.hi>>8);
527
528         Z.hi ^= Hshr4[nhi].hi;
529         Z.lo ^= Hshr4[nhi].lo;
530         Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
531
532         for (cnt=14; cnt>0; --cnt) {
533                 nlo  = ((const u8 *)Xi)[cnt];
534                 nlo ^= inp[cnt];
535                 nhi  = nlo>>4;
536                 nlo &= 0xf;
537
538                 Z.hi ^= Htable[nlo].hi;
539                 Z.lo ^= Htable[nlo].lo;
540
541                 rem = (size_t)Z.lo&0xff;
542
543                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
544                 Z.hi = (Z.hi>>8);
545
546                 Z.hi ^= Hshr4[nhi].hi;
547                 Z.lo ^= Hshr4[nhi].lo;
548                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
549         }
550
551         nlo  = ((const u8 *)Xi)[0];
552         nlo ^= inp[0];
553         nhi  = nlo>>4;
554         nlo &= 0xf;
555
556         Z.hi ^= Htable[nlo].hi;
557         Z.lo ^= Htable[nlo].lo;
558
559         rem = (size_t)Z.lo&0xf;
560
561         Z.lo = (Z.hi<<60)|(Z.lo>>4);
562         Z.hi = (Z.hi>>4);
563
564         Z.hi ^= Htable[nhi].hi;
565         Z.lo ^= Htable[nhi].lo;
566         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
567 #endif
568
569         if (is_endian.little) {
570 #ifdef BSWAP8
571                 Xi[0] = BSWAP8(Z.hi);
572                 Xi[1] = BSWAP8(Z.lo);
573 #else
574                 u8 *p = (u8 *)Xi;
575                 u32 v;
576                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
577                 v = (u32)(Z.hi);        PUTU32(p+4,v);
578                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
579                 v = (u32)(Z.lo);        PUTU32(p+12,v);
580 #endif
581         }
582         else {
583                 Xi[0] = Z.hi;
584                 Xi[1] = Z.lo;
585         }
586     } while (inp+=16, len-=16);
587 }
588 #endif
589 #else
590 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
591 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
592 #endif
593
594 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
595 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
596 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
597 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
598  * trashing effect. In other words idea is to hash data while it's
599  * still in L1 cache after encryption pass... */
600 #define GHASH_CHUNK       1024
601 #endif
602
603 #else   /* TABLE_BITS */
604
605 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
606 {
607         u128 V,Z = { 0,0 };
608         long X;
609         int  i,j;
610         const long *xi = (const long *)Xi;
611         const union { long one; char little; } is_endian = {1};
612
613         V.hi = H[0];    /* H is in host byte order, no byte swapping */
614         V.lo = H[1];
615
616         for (j=0; j<16/sizeof(long); ++j) {
617                 if (is_endian.little) {
618                         if (sizeof(long)==8) {
619 #ifdef BSWAP8
620                                 X = (long)(BSWAP8(xi[j]));
621 #else
622                                 const u8 *p = (const u8 *)(xi+j);
623                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
624 #endif
625                         }
626                         else {
627                                 const u8 *p = (const u8 *)(xi+j);
628                                 X = (long)GETU32(p);
629                         }
630                 }
631                 else
632                         X = xi[j];
633
634                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
635                         u64 M = (u64)(X>>(8*sizeof(long)-1));
636                         Z.hi ^= V.hi&M;
637                         Z.lo ^= V.lo&M;
638
639                         REDUCE1BIT(V);
640                 }
641         }
642
643         if (is_endian.little) {
644 #ifdef BSWAP8
645                 Xi[0] = BSWAP8(Z.hi);
646                 Xi[1] = BSWAP8(Z.lo);
647 #else
648                 u8 *p = (u8 *)Xi;
649                 u32 v;
650                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
651                 v = (u32)(Z.hi);        PUTU32(p+4,v);
652                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
653                 v = (u32)(Z.lo);        PUTU32(p+12,v);
654 #endif
655         }
656         else {
657                 Xi[0] = Z.hi;
658                 Xi[1] = Z.lo;
659         }
660 }
661 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
662
663 #endif
664
665 struct gcm128_context {
666         /* Following 6 names follow names in GCM specification */
667         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
668                                                 Xi,H,len;
669         /* Pre-computed table used by gcm_gmult_* */
670 #if TABLE_BITS==8
671         u128 Htable[256];
672 #else
673         u128 Htable[16];
674         void (*gmult)(u64 Xi[2],const u128 Htable[16]);
675         void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 #endif
677         unsigned int res, pad;
678         block128_f block;
679         void *key;
680 };
681
682 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
683         (defined(__i386)        || defined(__i386__)    || \
684          defined(__x86_64)      || defined(__x86_64__)  || \
685          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
686 # define GHASH_ASM_IAX
687 extern unsigned int OPENSSL_ia32cap_P[2];
688
689 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
690 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
691 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
692
693 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
694 #  define GHASH_ASM_X86
695 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
696 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
697
698 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
699 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
700 # endif
701
702 # undef  GCM_MUL
703 # define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
704 # undef  GHASH
705 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
706 #endif
707
708 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
709 {
710         const union { long one; char little; } is_endian = {1};
711
712         memset(ctx,0,sizeof(*ctx));
713         ctx->block = block;
714         ctx->key   = key;
715
716         (*block)(ctx->H.c,ctx->H.c,key);
717
718         if (is_endian.little) {
719                 /* H is stored in host byte order */
720 #ifdef BSWAP8
721                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
722                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
723 #else
724                 u8 *p = ctx->H.c;
725                 u64 hi,lo;
726                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
727                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
728                 ctx->H.u[0] = hi;
729                 ctx->H.u[1] = lo;
730 #endif
731         }
732
733 #if     TABLE_BITS==8
734         gcm_init_8bit(ctx->Htable,ctx->H.u);
735 #elif   TABLE_BITS==4
736 # if    defined(GHASH_ASM_IAX)
737         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
738                 gcm_init_clmul(ctx->Htable,ctx->H.u);
739                 ctx->gmult = gcm_gmult_clmul;
740                 ctx->ghash = gcm_ghash_clmul;
741                 return;
742         }
743         gcm_init_4bit(ctx->Htable,ctx->H.u);
744 #  if   defined(GHASH_ASM_X86)
745         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
746                 ctx->gmult = gcm_gmult_4bit_mmx;
747                 ctx->ghash = gcm_ghash_4bit_mmx;
748         } else {
749                 ctx->gmult = gcm_gmult_4bit_x86;
750                 ctx->ghash = gcm_ghash_4bit_x86;
751         }
752 #  else
753         ctx->gmult = gcm_gmult_4bit;
754         ctx->ghash = gcm_ghash_4bit;
755 #  endif
756 # else
757         gcm_init_4bit(ctx->Htable,ctx->H.u);
758 # endif
759 #endif
760 }
761
762 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
763 {
764         const union { long one; char little; } is_endian = {1};
765         unsigned int ctr;
766
767         ctx->Yi.u[0]  = 0;
768         ctx->Yi.u[1]  = 0;
769         ctx->Xi.u[0]  = 0;
770         ctx->Xi.u[1]  = 0;
771         ctx->len.u[0] = 0;
772         ctx->len.u[1] = 0;
773         ctx->res = 0;
774
775         if (len==12) {
776                 memcpy(ctx->Yi.c,iv,12);
777                 ctx->Yi.c[15]=1;
778                 ctr=1;
779         }
780         else {
781                 size_t i;
782                 u64 len0 = len;
783
784                 while (len>=16) {
785                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
786                         GCM_MUL(ctx,Yi);
787                         iv += 16;
788                         len -= 16;
789                 }
790                 if (len) {
791                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
792                         GCM_MUL(ctx,Yi);
793                 }
794                 len0 <<= 3;
795                 if (is_endian.little) {
796 #ifdef BSWAP8
797                         ctx->Yi.u[1]  ^= BSWAP8(len0);
798 #else
799                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
800                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
801                         ctx->Yi.c[10] ^= (u8)(len0>>40);
802                         ctx->Yi.c[11] ^= (u8)(len0>>32);
803                         ctx->Yi.c[12] ^= (u8)(len0>>24);
804                         ctx->Yi.c[13] ^= (u8)(len0>>16);
805                         ctx->Yi.c[14] ^= (u8)(len0>>8);
806                         ctx->Yi.c[15] ^= (u8)(len0);
807 #endif
808                 }
809                 else
810                         ctx->Yi.u[1]  ^= len0;
811
812                 GCM_MUL(ctx,Yi);
813
814                 if (is_endian.little)
815                         ctr = GETU32(ctx->Yi.c+12);
816                 else
817                         ctr = ctx->Yi.d[3];
818         }
819
820         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
821         ++ctr;
822         if (is_endian.little)
823                 PUTU32(ctx->Yi.c+12,ctr);
824         else
825                 ctx->Yi.d[3] = ctr;
826 }
827
828 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
829 {
830         size_t i;
831
832         ctx->len.u[0] += len;
833
834 #ifdef GHASH
835         if ((i = (len&(size_t)-16))) {
836                 GHASH(ctx,aad,i);
837                 aad += i;
838                 len -= i;
839         }
840 #else
841         while (len>=16) {
842                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
843                 GCM_MUL(ctx,Xi);
844                 aad += 16;
845                 len -= 16;
846         }
847 #endif
848         if (len) {
849                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
850                 GCM_MUL(ctx,Xi);
851         }
852 }
853
854 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
855                 const unsigned char *in, unsigned char *out,
856                 size_t len)
857 {
858         const union { long one; char little; } is_endian = {1};
859         unsigned int n, ctr;
860         size_t i;
861
862         ctx->len.u[1] += len;
863         n   = ctx->res;
864         if (is_endian.little)
865                 ctr = GETU32(ctx->Yi.c+12);
866         else
867                 ctr = ctx->Yi.d[3];
868
869 #if !defined(OPENSSL_SMALL_FOOTPRINT)
870         if (16%sizeof(size_t) == 0) do {        /* always true actually */
871                 if (n) {
872                         while (n && len) {
873                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
874                                 --len;
875                                 n = (n+1)%16;
876                         }
877                         if (n==0) GCM_MUL(ctx,Xi);
878                         else {
879                                 ctx->res = n;
880                                 return;
881                         }
882                 }
883 #if defined(STRICT_ALIGNMENT)
884                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
885                         break;
886 #endif
887 #if defined(GHASH) && defined(GHASH_CHUNK)
888                 while (len>=GHASH_CHUNK) {
889                     size_t j=GHASH_CHUNK;
890
891                     while (j) {
892                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
893                         ++ctr;
894                         if (is_endian.little)
895                                 PUTU32(ctx->Yi.c+12,ctr);
896                         else
897                                 ctx->Yi.d[3] = ctr;
898                         for (i=0; i<16; i+=sizeof(size_t))
899                                 *(size_t *)(out+i) =
900                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
901                         out += 16;
902                         in  += 16;
903                         j   -= 16;
904                     }
905                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
906                     len -= GHASH_CHUNK;
907                 }
908                 if ((i = (len&(size_t)-16))) {
909                     size_t j=i;
910
911                     while (len>=16) {
912                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
913                         ++ctr;
914                         if (is_endian.little)
915                                 PUTU32(ctx->Yi.c+12,ctr);
916                         else
917                                 ctx->Yi.d[3] = ctr;
918                         for (i=0; i<16; i+=sizeof(size_t))
919                                 *(size_t *)(out+i) =
920                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
921                         out += 16;
922                         in  += 16;
923                         len -= 16;
924                     }
925                     GHASH(ctx,out-j,j);
926                 }
927 #else
928                 while (len>=16) {
929                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
930                         ++ctr;
931                         if (is_endian.little)
932                                 PUTU32(ctx->Yi.c+12,ctr);
933                         else
934                                 ctx->Yi.d[3] = ctr;
935                         for (i=0; i<16; i+=sizeof(size_t))
936                                 *(size_t *)(ctx->Xi.c+i) ^=
937                                 *(size_t *)(out+i) =
938                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
939                         GCM_MUL(ctx,Xi);
940                         out += 16;
941                         in  += 16;
942                         len -= 16;
943                 }
944 #endif
945                 if (len) {
946                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
947                         ++ctr;
948                         if (is_endian.little)
949                                 PUTU32(ctx->Yi.c+12,ctr);
950                         else
951                                 ctx->Yi.d[3] = ctr;
952                         while (len--) {
953                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
954                                 ++n;
955                         }
956                 }
957
958                 ctx->res = n;
959                 return;
960         } while(0);
961 #endif
962         for (i=0;i<len;++i) {
963                 if (n==0) {
964                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
965                         ++ctr;
966                         if (is_endian.little)
967                                 PUTU32(ctx->Yi.c+12,ctr);
968                         else
969                                 ctx->Yi.d[3] = ctr;
970                 }
971                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
972                 n = (n+1)%16;
973                 if (n==0)
974                         GCM_MUL(ctx,Xi);
975         }
976
977         ctx->res = n;
978 }
979
980 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
981                 const unsigned char *in, unsigned char *out,
982                 size_t len)
983 {
984         const union { long one; char little; } is_endian = {1};
985         unsigned int n, ctr;
986         size_t i;
987
988         ctx->len.u[1] += len;
989         n   = ctx->res;
990         if (is_endian.little)
991                 ctr = GETU32(ctx->Yi.c+12);
992         else
993                 ctr = ctx->Yi.d[3];
994
995 #if !defined(OPENSSL_SMALL_FOOTPRINT)
996         if (16%sizeof(size_t) == 0) do {        /* always true actually */
997                 if (n) {
998                         while (n && len) {
999                                 u8 c = *(in++);
1000                                 *(out++) = c^ctx->EKi.c[n];
1001                                 ctx->Xi.c[n] ^= c;
1002                                 --len;
1003                                 n = (n+1)%16;
1004                         }
1005                         if (n==0) GCM_MUL (ctx,Xi);
1006                         else {
1007                                 ctx->res = n;
1008                                 return;
1009                         }
1010                 }
1011 #if defined(STRICT_ALIGNMENT)
1012                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1013                         break;
1014 #endif
1015 #if defined(GHASH) && defined(GHASH_CHUNK)
1016                 while (len>=GHASH_CHUNK) {
1017                     size_t j=GHASH_CHUNK;
1018
1019                     GHASH(ctx,in,GHASH_CHUNK);
1020                     while (j) {
1021                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1022                         ++ctr;
1023                         if (is_endian.little)
1024                                 PUTU32(ctx->Yi.c+12,ctr);
1025                         else
1026                                 ctx->Yi.d[3] = ctr;
1027                         for (i=0; i<16; i+=sizeof(size_t))
1028                                 *(size_t *)(out+i) =
1029                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1030                         out += 16;
1031                         in  += 16;
1032                         j   -= 16;
1033                     }
1034                     len -= GHASH_CHUNK;
1035                 }
1036                 if ((i = (len&(size_t)-16))) {
1037                     GHASH(ctx,in,i);
1038                     while (len>=16) {
1039                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1040                         ++ctr;
1041                         if (is_endian.little)
1042                                 PUTU32(ctx->Yi.c+12,ctr);
1043                         else
1044                                 ctx->Yi.d[3] = ctr;
1045                         for (i=0; i<16; i+=sizeof(size_t))
1046                                 *(size_t *)(out+i) =
1047                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1048                         out += 16;
1049                         in  += 16;
1050                         len -= 16;
1051                     }
1052                 }
1053 #else
1054                 while (len>=16) {
1055                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1056                         ++ctr;
1057                         if (is_endian.little)
1058                                 PUTU32(ctx->Yi.c+12,ctr);
1059                         else
1060                                 ctx->Yi.d[3] = ctr;
1061                         for (i=0; i<16; i+=sizeof(size_t)) {
1062                                 size_t c = *(size_t *)(in+i);
1063                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1064                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1065                         }
1066                         GCM_MUL(ctx,Xi);
1067                         out += 16;
1068                         in  += 16;
1069                         len -= 16;
1070                 }
1071 #endif
1072                 if (len) {
1073                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1074                         ++ctr;
1075                         if (is_endian.little)
1076                                 PUTU32(ctx->Yi.c+12,ctr);
1077                         else
1078                                 ctx->Yi.d[3] = ctr;
1079                         while (len--) {
1080                                 u8 c = in[n];
1081                                 ctx->Xi.c[n] ^= c;
1082                                 out[n] = c^ctx->EKi.c[n];
1083                                 ++n;
1084                         }
1085                 }
1086
1087                 ctx->res = n;
1088                 return;
1089         } while(0);
1090 #endif
1091         for (i=0;i<len;++i) {
1092                 u8 c;
1093                 if (n==0) {
1094                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1095                         ++ctr;
1096                         if (is_endian.little)
1097                                 PUTU32(ctx->Yi.c+12,ctr);
1098                         else
1099                                 ctx->Yi.d[3] = ctr;
1100                 }
1101                 c = in[i];
1102                 out[i] ^= ctx->EKi.c[n];
1103                 ctx->Xi.c[n] ^= c;
1104                 n = (n+1)%16;
1105                 if (n==0)
1106                         GCM_MUL(ctx,Xi);
1107         }
1108
1109         ctx->res = n;
1110 }
1111
1112 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
1113 {
1114         const union { long one; char little; } is_endian = {1};
1115         u64 alen = ctx->len.u[0]<<3;
1116         u64 clen = ctx->len.u[1]<<3;
1117
1118         if (ctx->res)
1119                 GCM_MUL(ctx,Xi);
1120
1121         if (is_endian.little) {
1122 #ifdef BSWAP8
1123                 alen = BSWAP8(alen);
1124                 clen = BSWAP8(clen);
1125 #else
1126                 u8 *p = ctx->len.c;
1127
1128                 ctx->len.u[0] = alen;
1129                 ctx->len.u[1] = clen;
1130
1131                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1132                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1133 #endif
1134         }
1135
1136         ctx->Xi.u[0] ^= alen;
1137         ctx->Xi.u[1] ^= clen;
1138         GCM_MUL(ctx,Xi);
1139
1140         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1141         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1142 }
1143
1144 #if defined(SELFTEST)
1145 #include <stdio.h>
1146 #include <openssl/aes.h>
1147
1148 /* Test Case 1 */
1149 static const u8 K1[16],
1150                 *P1=NULL,
1151                 *A1=NULL,
1152                 IV1[12],
1153                 *C1=NULL,
1154                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1155
1156 /* Test Case 2 */
1157 #define K2 K1
1158 #define A2 A1
1159 #define IV2 IV1
1160 static const u8 P2[16],
1161                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1162                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1163
1164 /* Test Case 3 */
1165 #define A3 A2
1166 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1167                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1168                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1169                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1170                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1171                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1172                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1173                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1174                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1175                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1176                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1177
1178 /* Test Case 4 */
1179 #define K4 K3
1180 #define IV4 IV3
1181 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1182                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1183                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1184                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1185                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1186                         0xab,0xad,0xda,0xd2},
1187                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1188                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1189                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1190                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1191                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1192
1193 /* Test Case 5 */
1194 #define K5 K4
1195 #define P5 P4
1196 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1197                         0xab,0xad,0xda,0xd2},
1198                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1199                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1200                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1201                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1202                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1203                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1204
1205 /* Test Case 6 */
1206 #define K6 K5
1207 #define P6 P5
1208 #define A6 A5
1209 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1210                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1211                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1212                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1213                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1214                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1215                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1216                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1217                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1218
1219 /* Test Case 7 */
1220 static const u8 K7[24],
1221                 *P7=NULL,
1222                 *A7=NULL,
1223                 IV7[12],
1224                 *C7=NULL,
1225                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1226
1227 /* Test Case 8 */
1228 #define K8 K7
1229 #define IV8 IV7
1230 #define A8 A7
1231 static const u8 P8[16],
1232                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1233                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1234
1235 /* Test Case 9 */
1236 #define A9 A8
1237 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1238                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1239                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1240                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1241                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1242                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1243                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1244                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1245                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1246                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1247                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1248                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1249
1250 /* Test Case 10 */
1251 #define K10 K9
1252 #define IV10 IV9
1253 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1254                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1255                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1256                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1257                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1258                         0xab,0xad,0xda,0xd2},
1259                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1260                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1261                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1262                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1263                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1264
1265 /* Test Case 11 */
1266 #define K11 K10
1267 #define P11 P10
1268 #define A11 A10
1269 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1270                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1271                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1272                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1273                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1274                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1275
1276 /* Test Case 12 */
1277 #define K12 K11
1278 #define P12 P11
1279 #define A12 A11
1280 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1281                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1282                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1283                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1284                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1285                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1286                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1287                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1288                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1289
1290 /* Test Case 13 */
1291 static const u8 K13[32],
1292                 *P13=NULL,
1293                 *A13=NULL,
1294                 IV13[12],
1295                 *C13=NULL,
1296                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1297
1298 /* Test Case 14 */
1299 #define K14 K13
1300 #define A14 A13
1301 static const u8 P14[16],
1302                 IV14[12],
1303                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1304                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1305
1306 /* Test Case 15 */
1307 #define A15 A14
1308 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1309                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1310                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1311                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1312                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1313                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1314                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1315                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1316                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1317                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1318                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1319                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1320
1321 /* Test Case 16 */
1322 #define K16 K15
1323 #define IV16 IV15
1324 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1325                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1326                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1327                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1328                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1329                         0xab,0xad,0xda,0xd2},
1330                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1331                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1332                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1333                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1334                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1335
1336 /* Test Case 17 */
1337 #define K17 K16
1338 #define P17 P16
1339 #define A17 A16
1340 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1341                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1342                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1343                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1344                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1345                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1346
1347 /* Test Case 18 */
1348 #define K18 K17
1349 #define P18 P17
1350 #define A18 A17
1351 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1352                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1353                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1354                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1355                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1356                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1357                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1358                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1359                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1360
1361 #define TEST_CASE(n)    do {                                    \
1362         u8 out[sizeof(P##n)];                                   \
1363         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1364         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1365         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1366         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1367         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1368         CRYPTO_gcm128_finish(&ctx);                             \
1369         if (memcmp(ctx.Xi.c,T##n,16) || (C##n && memcmp(out,C##n,sizeof(out)))) \
1370                 ret++, printf ("encrypt test#%d failed.\n",n);\
1371         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1372         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1373         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1374         CRYPTO_gcm128_finish(&ctx);                             \
1375         if (memcmp(ctx.Xi.c,T##n,16) || (P##n && memcmp(out,P##n,sizeof(out)))) \
1376                 ret++, printf ("decrypt test#%d failed.\n",n);\
1377         } while(0)
1378
1379 int main()
1380 {
1381         GCM128_CONTEXT ctx;
1382         AES_KEY key;
1383         int ret=0;
1384
1385         TEST_CASE(1);
1386         TEST_CASE(2);
1387         TEST_CASE(3);
1388         TEST_CASE(4);
1389         TEST_CASE(5);
1390         TEST_CASE(6);
1391         TEST_CASE(7);
1392         TEST_CASE(8);
1393         TEST_CASE(9);
1394         TEST_CASE(10);
1395         TEST_CASE(11);
1396         TEST_CASE(12);
1397         TEST_CASE(13);
1398         TEST_CASE(14);
1399         TEST_CASE(15);
1400         TEST_CASE(16);
1401         TEST_CASE(17);
1402         TEST_CASE(18);
1403
1404 #ifdef OPENSSL_CPUID_OBJ
1405         {
1406         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1407         union { u64 u; u8 c[1024]; } buf;
1408         int i;
1409
1410         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1411         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1412         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1413
1414         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1415         start = OPENSSL_rdtsc();
1416         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1417         gcm_t = OPENSSL_rdtsc() - start;
1418
1419         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1420                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1421                         (block128_f)AES_encrypt);
1422         start = OPENSSL_rdtsc();
1423         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1424                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1425                         (block128_f)AES_encrypt);
1426         ctr_t = OPENSSL_rdtsc() - start;
1427
1428         printf("%.2f-%.2f=%.2f\n",
1429                         gcm_t/(double)sizeof(buf),
1430                         ctr_t/(double)sizeof(buf),
1431                         (gcm_t-ctr_t)/(double)sizeof(buf));
1432 #ifdef GHASH
1433         GHASH(&ctx,buf.c,sizeof(buf));
1434         start = OPENSSL_rdtsc();
1435         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1436         gcm_t = OPENSSL_rdtsc() - start;
1437         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1438 #endif
1439         }
1440 #endif
1441
1442         return ret;
1443 }
1444 #endif