gcm128.c: make it work with no-sse2.
[openssl.git] crypto/modes/gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 typedef struct { u64 hi,lo; } u128;
64
65 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
66 /* redefine, because alignment is ensured */
67 #undef  GETU32
68 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
69 #undef  PUTU32
70 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
71 #endif
72
73 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
74 #define REDUCE1BIT(V)   do { \
75         if (sizeof(size_t)==8) { \
76                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
77                 V.lo  = (V.hi<<63)|(V.lo>>1); \
78                 V.hi  = (V.hi>>1 )^T; \
79         } \
80         else { \
81                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
82                 V.lo  = (V.hi<<63)|(V.lo>>1); \
83                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
84         } \
85 } while(0)
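
/*
 * Note: REDUCE1BIT multiplies V by x in the bit-reflected GF(2^128)
 * representation used by GHASH: the 128-bit value is shifted right by one
 * bit, and if the bit shifted out (V.lo&1) was set, the result is reduced
 * by XORing 0xE1 into the top byte, which encodes the field polynomial
 * x^128 + x^7 + x^2 + x + 1.  The second branch performs the same
 * reduction with 32-bit mask arithmetic for builds where size_t is 32 bits.
 */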
86
87 #ifdef  TABLE_BITS
88 #undef  TABLE_BITS
89 #endif
90 /*
91  * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
92  * should never be set to 8: 8 is effectively reserved for testing.
93  * TABLE_BITS>1 selects the lookup-table-driven implementations referred
94  * to as "Shoup's" in the GCM specification; in other words, OpenSSL does
95  * not cover the whole spectrum of possible table-driven implementations.
96  * Why? In the non-"Shoup's" case the memory access pattern is segmented
97  * in such a manner that cache-timing information can trivially reveal a
98  * fair portion of the intermediate hash value. Given that the ciphertext
99  * is always available to an attacker, it is possible to attempt to
100  * deduce the secret parameter H and, if successful, tamper with messages
101  * [which is utterly trivial in CTR mode]. In "Shoup's" case this is not
102  * as trivial, but there is no reason to believe it is resistant to
103  * cache-timing attacks either. As for the "8-bit" implementation, it
104  * consumes 16 (sixteen) times more memory, 4KB per individual key + 1KB
105  * shared (a size sketch follows the TABLE_BITS define below). On the
106  * plus side it should be about twice as fast as the "4-bit" version;
107  * gcc-generated x86[_64] "8-bit" code was observed to run ~75% faster,
108  * closer to 100% for commercial compilers... Yet the "4-bit" procedure
109  * is preferred, because it is believed to give a better
110  * security-performance balance and adequate all-round performance.
111  * "All-round" refers to things like:
112  *
113  * - shorter setup time effectively improves overall timing for
114  *   handling short messages;
115  * - larger table allocation can become unbearable because of VM
116  *   subsystem penalties (for example on Windows a large enough free
117  *   results in VM working-set trimming, meaning that a subsequent
118  *   malloc would immediately incur working-set expansion);
119  * - a larger table has a larger cache footprint, which can affect the
120  *   performance of other code paths (not necessarily even from the same
121  *   thread in a Hyper-Threading world);
122  */
122 #define TABLE_BITS 4
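
/*
 * Size sketch for the two table-driven options (sizeof(u128)==16):
 *
 *   TABLE_BITS==4: Htable[16]  ->  16*16 =  256 bytes per key, plus the
 *                  shared rem_4bit[16] table of size_t entries
 *                  (64 or 128 bytes);
 *   TABLE_BITS==8: Htable[256] -> 256*16 = 4096 bytes per key, plus the
 *                  shared rem_8bit[256] table of size_t entries
 *                  (1KB on 32-bit, 2KB on 64-bit builds) -- the "4KB per
 *                  individual key + 1KB shared" figure quoted above.
 */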
123
124 #if     TABLE_BITS==8
125
126 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
127 {
128         int  i, j;
129         u128 V;
130
131         Htable[0].hi = 0;
132         Htable[0].lo = 0;
133         V.hi = H[0];
134         V.lo = H[1];
135
136         for (Htable[128]=V, i=64; i>0; i>>=1) {
137                 REDUCE1BIT(V);
138                 Htable[i] = V;
139         }
140
141         for (i=2; i<256; i<<=1) {
142                 u128 *Hi = Htable+i, H0 = *Hi;
143                 for (j=1; j<i; ++j) {
144                         Hi[j].hi = H0.hi^Htable[j].hi;
145                         Hi[j].lo = H0.lo^Htable[j].lo;
146                 }
147         }
148 }
149
150 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
151 {
152         u128 Z = { 0, 0};
153         const u8 *xi = (const u8 *)Xi+15;
154         size_t rem, n = *xi;
155         const union { long one; char little; } is_endian = {1};
156         static const size_t rem_8bit[256] = {
157                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
158                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
159                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
160                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
161                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
162                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
163                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
164                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
165                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
166                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
167                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
168                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
169                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
170                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
171                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
172                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
173                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
174                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
175                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
176                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
177                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
178                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
179                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
180                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
181                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
182                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
183                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
184                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
185                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
186                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
187                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
188                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
189                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
190                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
191                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
192                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
193                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
194                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
195                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
196                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
197                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
198                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
199                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
200                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
201                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
202                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
203                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
204                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
205                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
206                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
207                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
208                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
209                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
210                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
211                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
212                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
213                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
214                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
215                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
216                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
217                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
218                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
219                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
220                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
221
222         while (1) {
223                 Z.hi ^= Htable[n].hi;
224                 Z.lo ^= Htable[n].lo;
225
226                 if ((u8 *)Xi==xi)       break;
227
228                 n = *(--xi);
229
230                 rem  = (size_t)Z.lo&0xff;
231                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
232                 Z.hi = (Z.hi>>8);
233                 if (sizeof(size_t)==8)
234                         Z.hi ^= rem_8bit[rem];
235                 else
236                         Z.hi ^= (u64)rem_8bit[rem]<<32;
237         }
238
239         if (is_endian.little) {
240 #ifdef BSWAP8
241                 Xi[0] = BSWAP8(Z.hi);
242                 Xi[1] = BSWAP8(Z.lo);
243 #else
244                 u8 *p = (u8 *)Xi;
245                 u32 v;
246                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
247                 v = (u32)(Z.hi);        PUTU32(p+4,v);
248                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
249                 v = (u32)(Z.lo);        PUTU32(p+12,v);
250 #endif
251         }
252         else {
253                 Xi[0] = Z.hi;
254                 Xi[1] = Z.lo;
255         }
256 }
257 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
258
259 #elif   TABLE_BITS==4
260
261 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
262 {
263         u128 V;
264 #if defined(OPENSSL_SMALL_FOOTPRINT)
265         int  i;
266 #endif
267
268         Htable[0].hi = 0;
269         Htable[0].lo = 0;
270         V.hi = H[0];
271         V.lo = H[1];
272
273 #if defined(OPENSSL_SMALL_FOOTPRINT)
274         for (Htable[8]=V, i=4; i>0; i>>=1) {
275                 REDUCE1BIT(V);
276                 Htable[i] = V;
277         }
278
279         for (i=2; i<16; i<<=1) {
280                 u128 *Hi = Htable+i;
281                 int   j;
282                 for (V=*Hi, j=1; j<i; ++j) {
283                         Hi[j].hi = V.hi^Htable[j].hi;
284                         Hi[j].lo = V.lo^Htable[j].lo;
285                 }
286         }
287 #else
288         Htable[8] = V;
289         REDUCE1BIT(V);
290         Htable[4] = V;
291         REDUCE1BIT(V);
292         Htable[2] = V;
293         REDUCE1BIT(V);
294         Htable[1] = V;
295         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
296         V=Htable[4];
297         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
298         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
299         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
300         V=Htable[8];
301         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
302         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
303         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
304         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
305         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
306         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
307         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
308 #endif
309 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
310         /*
311          * ARM assembler expects specific dword order in Htable.
312          */
313         {
314         int j;
315         const union { long one; char little; } is_endian = {1};
316
317         if (is_endian.little)
318                 for (j=0;j<16;++j) {
319                         V = Htable[j];
320                         Htable[j].hi = V.lo;
321                         Htable[j].lo = V.hi;
322                 }
323         else
324                 for (j=0;j<16;++j) {
325                         V = Htable[j];
326                         Htable[j].hi = V.lo<<32|V.lo>>32;
327                         Htable[j].lo = V.hi<<32|V.hi>>32;
328                 }
329         }
330 #endif
331 }
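
/*
 * For reference: after gcm_init_4bit, Htable[8] = H, Htable[4] = H*x,
 * Htable[2] = H*x^2 and Htable[1] = H*x^3 (multiplication by x being the
 * REDUCE1BIT step), and every composite index is the XOR of the
 * corresponding single-bit entries; in effect Htable[n] is n*H in the
 * bit-reflected GF(2^128) representation for every 4-bit value n.
 */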
332
333 #ifndef GHASH_ASM
334 static const size_t rem_4bit[16] = {
335         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
336         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
337         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
338         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
339
340 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
341 {
342         u128 Z;
343         int cnt = 15;
344         size_t rem, nlo, nhi;
345         const union { long one; char little; } is_endian = {1};
346
347         nlo  = ((const u8 *)Xi)[15];
348         nhi  = nlo>>4;
349         nlo &= 0xf;
350
351         Z.hi = Htable[nlo].hi;
352         Z.lo = Htable[nlo].lo;
353
354         while (1) {
355                 rem  = (size_t)Z.lo&0xf;
356                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
357                 Z.hi = (Z.hi>>4);
358                 if (sizeof(size_t)==8)
359                         Z.hi ^= rem_4bit[rem];
360                 else
361                         Z.hi ^= (u64)rem_4bit[rem]<<32;
362
363                 Z.hi ^= Htable[nhi].hi;
364                 Z.lo ^= Htable[nhi].lo;
365
366                 if (--cnt<0)            break;
367
368                 nlo  = ((const u8 *)Xi)[cnt];
369                 nhi  = nlo>>4;
370                 nlo &= 0xf;
371
372                 rem  = (size_t)Z.lo&0xf;
373                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
374                 Z.hi = (Z.hi>>4);
375                 if (sizeof(size_t)==8)
376                         Z.hi ^= rem_4bit[rem];
377                 else
378                         Z.hi ^= (u64)rem_4bit[rem]<<32;
379
380                 Z.hi ^= Htable[nlo].hi;
381                 Z.lo ^= Htable[nlo].lo;
382         }
383
384         if (is_endian.little) {
385 #ifdef BSWAP8
386                 Xi[0] = BSWAP8(Z.hi);
387                 Xi[1] = BSWAP8(Z.lo);
388 #else
389                 u8 *p = (u8 *)Xi;
390                 u32 v;
391                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
392                 v = (u32)(Z.hi);        PUTU32(p+4,v);
393                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
394                 v = (u32)(Z.lo);        PUTU32(p+12,v);
395 #endif
396         }
397         else {
398                 Xi[0] = Z.hi;
399                 Xi[1] = Z.lo;
400         }
401 }
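
/*
 * For reference: gcm_gmult_4bit above consumes Xi one nibble at a time,
 * last byte first.  Each step shifts the accumulator Z right by 4 bits
 * (multiplication by x^4 in the reflected representation), folds the four
 * bits shifted out back in through the rem_4bit[] reduction table, and
 * XORs in the Htable[] entry selected by the next nibble.
 */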
402
403 #if !defined(OPENSSL_SMALL_FOOTPRINT)
404 /*
405  * Streamed gcm_ghash_4bit, see CRYPTO_gcm128_[en|de]crypt for
406  * details... Compiler-generated code doesn't seem to give any
407  * performance improvement, at least not on x86[_64]. It's here
408  * mostly as reference and a placeholder for possible future
409  * non-trivial optimization[s]...
410  */
411 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
412                                 const u8 *inp,size_t len)
413 {
414     u128 Z;
415     int cnt;
416     size_t rem, nlo, nhi;
417     const union { long one; char little; } is_endian = {1};
418
419 #if 1
420     do {
421         cnt  = 15;
422         nlo  = ((const u8 *)Xi)[15];
423         nlo ^= inp[15];
424         nhi  = nlo>>4;
425         nlo &= 0xf;
426
427         Z.hi = Htable[nlo].hi;
428         Z.lo = Htable[nlo].lo;
429
430         while (1) {
431                 rem  = (size_t)Z.lo&0xf;
432                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
433                 Z.hi = (Z.hi>>4);
434                 if (sizeof(size_t)==8)
435                         Z.hi ^= rem_4bit[rem];
436                 else
437                         Z.hi ^= (u64)rem_4bit[rem]<<32;
438
439                 Z.hi ^= Htable[nhi].hi;
440                 Z.lo ^= Htable[nhi].lo;
441
442                 if (--cnt<0)            break;
443
444                 nlo  = ((const u8 *)Xi)[cnt];
445                 nlo ^= inp[cnt];
446                 nhi  = nlo>>4;
447                 nlo &= 0xf;
448
449                 rem  = (size_t)Z.lo&0xf;
450                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
451                 Z.hi = (Z.hi>>4);
452                 if (sizeof(size_t)==8)
453                         Z.hi ^= rem_4bit[rem];
454                 else
455                         Z.hi ^= (u64)rem_4bit[rem]<<32;
456
457                 Z.hi ^= Htable[nlo].hi;
458                 Z.lo ^= Htable[nlo].lo;
459         }
460 #else
461     /*
462      * Extra 256+16 bytes per-key plus 512 bytes shared tables
463      * [should] give ~50% improvement... One could have PACK()-ed
464      * the rem_8bit even here, but the priority is to minimize
465      * cache footprint...
466      */ 
467     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
468     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
469     static const unsigned short rem_8bit[256] = {
470         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
471         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
472         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
473         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
474         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
475         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
476         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
477         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
478         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
479         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
480         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
481         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
482         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
483         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
484         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
485         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
486         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
487         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
488         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
489         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
490         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
491         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
492         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
493         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
494         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
495         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
496         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
497         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
498         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
499         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
500         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
501         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
502     /*
503      * This pre-processing phase slows the procedure down by roughly as
504      * much time as it makes each loop iteration faster. In other words,
505      * single-block performance is about the same as with the plain
506      * "4-bit" implementation, and from there it only gets faster...
507      */
508     for (cnt=0; cnt<16; ++cnt) {
509         Z.hi = Htable[cnt].hi;
510         Z.lo = Htable[cnt].lo;
511         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
512         Hshr4[cnt].hi = (Z.hi>>4);
513         Hshl4[cnt]    = (u8)(Z.lo<<4);
514     }
515
516     do {
517         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
518                 nlo  = ((const u8 *)Xi)[cnt];
519                 nlo ^= inp[cnt];
520                 nhi  = nlo>>4;
521                 nlo &= 0xf;
522
523                 Z.hi ^= Htable[nlo].hi;
524                 Z.lo ^= Htable[nlo].lo;
525
526                 rem = (size_t)Z.lo&0xff;
527
528                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
529                 Z.hi = (Z.hi>>8);
530
531                 Z.hi ^= Hshr4[nhi].hi;
532                 Z.lo ^= Hshr4[nhi].lo;
533                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
534         }
535
536         nlo  = ((const u8 *)Xi)[0];
537         nlo ^= inp[0];
538         nhi  = nlo>>4;
539         nlo &= 0xf;
540
541         Z.hi ^= Htable[nlo].hi;
542         Z.lo ^= Htable[nlo].lo;
543
544         rem = (size_t)Z.lo&0xf;
545
546         Z.lo = (Z.hi<<60)|(Z.lo>>4);
547         Z.hi = (Z.hi>>4);
548
549         Z.hi ^= Htable[nhi].hi;
550         Z.lo ^= Htable[nhi].lo;
551         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
552 #endif
553
554         if (is_endian.little) {
555 #ifdef BSWAP8
556                 Xi[0] = BSWAP8(Z.hi);
557                 Xi[1] = BSWAP8(Z.lo);
558 #else
559                 u8 *p = (u8 *)Xi;
560                 u32 v;
561                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
562                 v = (u32)(Z.hi);        PUTU32(p+4,v);
563                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
564                 v = (u32)(Z.lo);        PUTU32(p+12,v);
565 #endif
566         }
567         else {
568                 Xi[0] = Z.hi;
569                 Xi[1] = Z.lo;
570         }
571     } while (inp+=16, len-=16);
572 }
573 #endif
574 #else
575 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
576 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
577 #endif
578
579 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
580 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
581 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
582 /* GHASH_CHUNK is a "stride parameter" whose purpose is to mitigate the
583  * cache-thrashing effect. In other words, the idea is to hash data while
584  * it is still in the L1 cache after the encryption pass... */
585 #define GHASH_CHUNK       (3*1024)
586 #endif
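
/*
 * With GHASH_CHUNK at 3*1024 bytes each stride covers 192 16-byte blocks:
 * CRYPTO_gcm128_encrypt below hashes each chunk in a second pass over the
 * freshly produced ciphertext, while CRYPTO_gcm128_decrypt hashes the
 * ciphertext chunk before decrypting it, in both cases while the data is
 * presumed still resident in the L1 cache.
 */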
587
588 #else   /* TABLE_BITS */
589
590 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
591 {
592         u128 V,Z = { 0,0 };
593         long X;
594         int  i,j;
595         const long *xi = (const long *)Xi;
596         const union { long one; char little; } is_endian = {1};
597
598         V.hi = H[0];    /* H is in host byte order, no byte swapping */
599         V.lo = H[1];
600
601         for (j=0; j<16/sizeof(long); ++j) {
602                 if (is_endian.little) {
603                         if (sizeof(long)==8) {
604 #ifdef BSWAP8
605                                 X = (long)(BSWAP8(xi[j]));
606 #else
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
609 #endif
610                         }
611                         else {
612                                 const u8 *p = (const u8 *)(xi+j);
613                                 X = (long)GETU32(p);
614                         }
615                 }
616                 else
617                         X = xi[j];
618
619                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
620                         u64 M = (u64)(X>>(8*sizeof(long)-1));
621                         Z.hi ^= V.hi&M;
622                         Z.lo ^= V.lo&M;
623
624                         REDUCE1BIT(V);
625                 }
626         }
627
628         if (is_endian.little) {
629 #ifdef BSWAP8
630                 Xi[0] = BSWAP8(Z.hi);
631                 Xi[1] = BSWAP8(Z.lo);
632 #else
633                 u8 *p = (u8 *)Xi;
634                 u32 v;
635                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
636                 v = (u32)(Z.hi);        PUTU32(p+4,v);
637                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
638                 v = (u32)(Z.lo);        PUTU32(p+12,v);
639 #endif
640         }
641         else {
642                 Xi[0] = Z.hi;
643                 Xi[1] = Z.lo;
644         }
645 }
646 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
647
648 #endif
649
650 struct gcm128_context {
651         /* The following 6 names follow the naming in the GCM specification */
652         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
653                                                 Xi,H,len;
654         /* Pre-computed table used by gcm_gmult_* */
655 #if TABLE_BITS==8
656         u128 Htable[256];
657 #else
658         u128 Htable[16];
659         void (*gmult)(u64 Xi[2],const u128 Htable[16]);
660         void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 #endif
662         unsigned int mres, ares;
663         block128_f block;
664         void *key;
665 };
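
/*
 * For reference: "ares" and "mres" record how many bytes of the current
 * partial 16-byte block have already been folded into Xi for the AAD and
 * for the message respectively, so CRYPTO_gcm128_aad/encrypt/decrypt can
 * be fed arbitrarily fragmented input.
 */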
666
667 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
668         (defined(__i386)        || defined(__i386__)    || \
669          defined(__x86_64)      || defined(__x86_64__)  || \
670          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
671 # define GHASH_ASM_IAX
672 extern unsigned int OPENSSL_ia32cap_P[2];
673
674 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
675 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
676 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
677
678 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
679 #  define GHASH_ASM_X86
680 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
681 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
682
683 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
684 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
685 # endif
686
687 # undef  GCM_MUL
688 # define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
689 # undef  GHASH
690 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
691 #endif
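
/*
 * On x86/x86_64 builds with assembler support GCM_MUL and GHASH are thus
 * redirected through the gmult/ghash function pointers, letting
 * CRYPTO_gcm128_init below choose the PCLMULQDQ routines, the MMX or
 * plain-x86 ones (32-bit builds), or the generic 4-bit routines at run
 * time based on OPENSSL_ia32cap_P.  Note the OPENSSL_IA32_SSE2 guard in
 * CRYPTO_gcm128_init, which keeps the CLMUL path out of 32-bit builds
 * configured without SSE2 (the "no-sse2" case in this commit's title).
 */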
692
693 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
694 {
695         const union { long one; char little; } is_endian = {1};
696
697         memset(ctx,0,sizeof(*ctx));
698         ctx->block = block;
699         ctx->key   = key;
700
701         (*block)(ctx->H.c,ctx->H.c,key);
702
703         if (is_endian.little) {
704                 /* H is stored in host byte order */
705 #ifdef BSWAP8
706                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
707                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
708 #else
709                 u8 *p = ctx->H.c;
710                 u64 hi,lo;
711                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
712                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
713                 ctx->H.u[0] = hi;
714                 ctx->H.u[1] = lo;
715 #endif
716         }
717
718 #if     TABLE_BITS==8
719         gcm_init_8bit(ctx->Htable,ctx->H.u);
720 #elif   TABLE_BITS==4
721 # if    defined(GHASH_ASM_IAX)                  /* both x86 and x86_64 */
722 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
723         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
724                 gcm_init_clmul(ctx->Htable,ctx->H.u);
725                 ctx->gmult = gcm_gmult_clmul;
726                 ctx->ghash = gcm_ghash_clmul;
727                 return;
728         }
729 #  endif
730         gcm_init_4bit(ctx->Htable,ctx->H.u);
731 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
732         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
733                 ctx->gmult = gcm_gmult_4bit_mmx;
734                 ctx->ghash = gcm_ghash_4bit_mmx;
735         } else {
736                 ctx->gmult = gcm_gmult_4bit_x86;
737                 ctx->ghash = gcm_ghash_4bit_x86;
738         }
739 #  else
740         ctx->gmult = gcm_gmult_4bit;
741         ctx->ghash = gcm_ghash_4bit;
742 #  endif
743 # else
744         gcm_init_4bit(ctx->Htable,ctx->H.u);
745 # endif
746 #endif
747 }
748
749 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
750 {
751         const union { long one; char little; } is_endian = {1};
752         unsigned int ctr;
753
754         ctx->Yi.u[0]  = 0;
755         ctx->Yi.u[1]  = 0;
756         ctx->Xi.u[0]  = 0;
757         ctx->Xi.u[1]  = 0;
758         ctx->len.u[0] = 0;      /* AAD length */
759         ctx->len.u[1] = 0;      /* message length */
760         ctx->ares = 0;
761         ctx->mres = 0;
762
763         if (len==12) {
764                 memcpy(ctx->Yi.c,iv,12);
765                 ctx->Yi.c[15]=1;
766                 ctr=1;
767         }
768         else {
769                 size_t i;
770                 u64 len0 = len;
771
772                 while (len>=16) {
773                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
774                         GCM_MUL(ctx,Yi);
775                         iv += 16;
776                         len -= 16;
777                 }
778                 if (len) {
779                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
780                         GCM_MUL(ctx,Yi);
781                 }
782                 len0 <<= 3;
783                 if (is_endian.little) {
784 #ifdef BSWAP8
785                         ctx->Yi.u[1]  ^= BSWAP8(len0);
786 #else
787                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
788                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
789                         ctx->Yi.c[10] ^= (u8)(len0>>40);
790                         ctx->Yi.c[11] ^= (u8)(len0>>32);
791                         ctx->Yi.c[12] ^= (u8)(len0>>24);
792                         ctx->Yi.c[13] ^= (u8)(len0>>16);
793                         ctx->Yi.c[14] ^= (u8)(len0>>8);
794                         ctx->Yi.c[15] ^= (u8)(len0);
795 #endif
796                 }
797                 else
798                         ctx->Yi.u[1]  ^= len0;
799
800                 GCM_MUL(ctx,Yi);
801
802                 if (is_endian.little)
803                         ctr = GETU32(ctx->Yi.c+12);
804                 else
805                         ctr = ctx->Yi.d[3];
806         }
807
808         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
809         ++ctr;
810         if (is_endian.little)
811                 PUTU32(ctx->Yi.c+12,ctr);
812         else
813                 ctx->Yi.d[3] = ctr;
814 }
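
/*
 * The function above derives the pre-counter block Y0 as the GCM
 * specification prescribes: for a 96-bit IV, Y0 = IV || 0^31 || 1; for any
 * other length, Y0 = GHASH(IV padded with zeros, followed by a block
 * carrying the 64-bit bit-length of the IV).  It also precomputes
 * EK0 = E(K, Y0), which CRYPTO_gcm128_finish later XORs into the tag, and
 * leaves Yi holding the first counter value used for the actual payload.
 */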
815
816 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
817 {
818         size_t i;
819         unsigned int n;
820         u64 alen = ctx->len.u[0];
821
822         if (ctx->len.u[1]) return -2;
823
824         alen += len;
825         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
826                 return -1;
827         ctx->len.u[0] = alen;
828
829         n = ctx->ares;
830         if (n) {
831                 while (n && len) {
832                         ctx->Xi.c[n] ^= *(aad++);
833                         --len;
834                         n = (n+1)%16;
835                 }
836                 if (n==0) GCM_MUL(ctx,Xi);
837                 else {
838                         ctx->ares = n;
839                         return 0;
840                 }
841         }
842
843 #ifdef GHASH
844         if ((i = (len&(size_t)-16))) {
845                 GHASH(ctx,aad,i);
846                 aad += i;
847                 len -= i;
848         }
849 #else
850         while (len>=16) {
851                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
852                 GCM_MUL(ctx,Xi);
853                 aad += 16;
854                 len -= 16;
855         }
856 #endif
857         if (len) {
858                 n = (unsigned int)len;
859                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
860         }
861
862         ctx->ares = n;
863         return 0;
864 }
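
/*
 * The checks above enforce the GCM limits for associated data: roughly
 * 2^64 bits (2^61 bytes) of AAD in total, and no AAD once payload
 * processing has started (ctx->len.u[1] != 0), hence the -2 return in
 * that case.
 */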
865
866 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
867                 const unsigned char *in, unsigned char *out,
868                 size_t len)
869 {
870         const union { long one; char little; } is_endian = {1};
871         unsigned int n, ctr;
872         size_t i;
873         u64 mlen = ctx->len.u[1];
874
875 #if 0
876         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
877 #endif
878         mlen += len;
879         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
880                 return -1;
881         ctx->len.u[1] = mlen;
882
883         if (ctx->ares) {
884                 /* First call to encrypt finalizes GHASH(AAD) */
885                 GCM_MUL(ctx,Xi);
886                 ctx->ares = 0;
887         }
888
889         if (is_endian.little)
890                 ctr = GETU32(ctx->Yi.c+12);
891         else
892                 ctr = ctx->Yi.d[3];
893
894         n = ctx->mres;
895 #if !defined(OPENSSL_SMALL_FOOTPRINT)
896         if (16%sizeof(size_t) == 0) do {        /* always true actually */
897                 if (n) {
898                         while (n && len) {
899                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
900                                 --len;
901                                 n = (n+1)%16;
902                         }
903                         if (n==0) GCM_MUL(ctx,Xi);
904                         else {
905                                 ctx->mres = n;
906                                 return 0;
907                         }
908                 }
909 #if defined(STRICT_ALIGNMENT)
910                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
911                         break;
912 #endif
913 #if defined(GHASH) && defined(GHASH_CHUNK)
914                 while (len>=GHASH_CHUNK) {
915                     size_t j=GHASH_CHUNK;
916
917                     while (j) {
918                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
919                         ++ctr;
920                         if (is_endian.little)
921                                 PUTU32(ctx->Yi.c+12,ctr);
922                         else
923                                 ctx->Yi.d[3] = ctr;
924                         for (i=0; i<16; i+=sizeof(size_t))
925                                 *(size_t *)(out+i) =
926                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
927                         out += 16;
928                         in  += 16;
929                         j   -= 16;
930                     }
931                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
932                     len -= GHASH_CHUNK;
933                 }
934                 if ((i = (len&(size_t)-16))) {
935                     size_t j=i;
936
937                     while (len>=16) {
938                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
939                         ++ctr;
940                         if (is_endian.little)
941                                 PUTU32(ctx->Yi.c+12,ctr);
942                         else
943                                 ctx->Yi.d[3] = ctr;
944                         for (i=0; i<16; i+=sizeof(size_t))
945                                 *(size_t *)(out+i) =
946                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
947                         out += 16;
948                         in  += 16;
949                         len -= 16;
950                     }
951                     GHASH(ctx,out-j,j);
952                 }
953 #else
954                 while (len>=16) {
955                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
956                         ++ctr;
957                         if (is_endian.little)
958                                 PUTU32(ctx->Yi.c+12,ctr);
959                         else
960                                 ctx->Yi.d[3] = ctr;
961                         for (i=0; i<16; i+=sizeof(size_t))
962                                 *(size_t *)(ctx->Xi.c+i) ^=
963                                 *(size_t *)(out+i) =
964                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
965                         GCM_MUL(ctx,Xi);
966                         out += 16;
967                         in  += 16;
968                         len -= 16;
969                 }
970 #endif
971                 if (len) {
972                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
973                         ++ctr;
974                         if (is_endian.little)
975                                 PUTU32(ctx->Yi.c+12,ctr);
976                         else
977                                 ctx->Yi.d[3] = ctr;
978                         while (len--) {
979                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
980                                 ++n;
981                         }
982                 }
983
984                 ctx->mres = n;
985                 return 0;
986         } while(0);
987 #endif
988         for (i=0;i<len;++i) {
989                 if (n==0) {
990                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
991                         ++ctr;
992                         if (is_endian.little)
993                                 PUTU32(ctx->Yi.c+12,ctr);
994                         else
995                                 ctx->Yi.d[3] = ctr;
996                 }
997                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
998                 n = (n+1)%16;
999                 if (n==0)
1000                         GCM_MUL(ctx,Xi);
1001         }
1002
1003         ctx->mres = n;
1004         return 0;
1005 }
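
/*
 * The mlen check above (repeated in the decrypt and *_ctr32 variants
 * below) caps the total payload at 2^36 - 32 bytes, i.e. the
 * 2^39 - 256 bit per-invocation limit from the GCM specification.
 */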
1006
1007 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1008                 const unsigned char *in, unsigned char *out,
1009                 size_t len)
1010 {
1011         const union { long one; char little; } is_endian = {1};
1012         unsigned int n, ctr;
1013         size_t i;
1014         u64 mlen = ctx->len.u[1];
1015
1016         mlen += len;
1017         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1018                 return -1;
1019         ctx->len.u[1] = mlen;
1020
1021         if (ctx->ares) {
1022                 /* First call to decrypt finalizes GHASH(AAD) */
1023                 GCM_MUL(ctx,Xi);
1024                 ctx->ares = 0;
1025         }
1026
1027         if (is_endian.little)
1028                 ctr = GETU32(ctx->Yi.c+12);
1029         else
1030                 ctr = ctx->Yi.d[3];
1031
1032         n = ctx->mres;
1033 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1034         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1035                 if (n) {
1036                         while (n && len) {
1037                                 u8 c = *(in++);
1038                                 *(out++) = c^ctx->EKi.c[n];
1039                                 ctx->Xi.c[n] ^= c;
1040                                 --len;
1041                                 n = (n+1)%16;
1042                         }
1043                         if (n==0) GCM_MUL (ctx,Xi);
1044                         else {
1045                                 ctx->mres = n;
1046                                 return 0;
1047                         }
1048                 }
1049 #if defined(STRICT_ALIGNMENT)
1050                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1051                         break;
1052 #endif
1053 #if defined(GHASH) && defined(GHASH_CHUNK)
1054                 while (len>=GHASH_CHUNK) {
1055                     size_t j=GHASH_CHUNK;
1056
1057                     GHASH(ctx,in,GHASH_CHUNK);
1058                     while (j) {
1059                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1060                         ++ctr;
1061                         if (is_endian.little)
1062                                 PUTU32(ctx->Yi.c+12,ctr);
1063                         else
1064                                 ctx->Yi.d[3] = ctr;
1065                         for (i=0; i<16; i+=sizeof(size_t))
1066                                 *(size_t *)(out+i) =
1067                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1068                         out += 16;
1069                         in  += 16;
1070                         j   -= 16;
1071                     }
1072                     len -= GHASH_CHUNK;
1073                 }
1074                 if ((i = (len&(size_t)-16))) {
1075                     GHASH(ctx,in,i);
1076                     while (len>=16) {
1077                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1078                         ++ctr;
1079                         if (is_endian.little)
1080                                 PUTU32(ctx->Yi.c+12,ctr);
1081                         else
1082                                 ctx->Yi.d[3] = ctr;
1083                         for (i=0; i<16; i+=sizeof(size_t))
1084                                 *(size_t *)(out+i) =
1085                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1086                         out += 16;
1087                         in  += 16;
1088                         len -= 16;
1089                     }
1090                 }
1091 #else
1092                 while (len>=16) {
1093                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1094                         ++ctr;
1095                         if (is_endian.little)
1096                                 PUTU32(ctx->Yi.c+12,ctr);
1097                         else
1098                                 ctx->Yi.d[3] = ctr;
1099                         for (i=0; i<16; i+=sizeof(size_t)) {
1100                                 size_t c = *(size_t *)(in+i);
1101                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1102                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1103                         }
1104                         GCM_MUL(ctx,Xi);
1105                         out += 16;
1106                         in  += 16;
1107                         len -= 16;
1108                 }
1109 #endif
1110                 if (len) {
1111                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1112                         ++ctr;
1113                         if (is_endian.little)
1114                                 PUTU32(ctx->Yi.c+12,ctr);
1115                         else
1116                                 ctx->Yi.d[3] = ctr;
1117                         while (len--) {
1118                                 u8 c = in[n];
1119                                 ctx->Xi.c[n] ^= c;
1120                                 out[n] = c^ctx->EKi.c[n];
1121                                 ++n;
1122                         }
1123                 }
1124
1125                 ctx->mres = n;
1126                 return 0;
1127         } while(0);
1128 #endif
1129         for (i=0;i<len;++i) {
1130                 u8 c;
1131                 if (n==0) {
1132                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1133                         ++ctr;
1134                         if (is_endian.little)
1135                                 PUTU32(ctx->Yi.c+12,ctr);
1136                         else
1137                                 ctx->Yi.d[3] = ctr;
1138                 }
1139                 c = in[i];
1140                 out[i] = c^ctx->EKi.c[n];
1141                 ctx->Xi.c[n] ^= c;
1142                 n = (n+1)%16;
1143                 if (n==0)
1144                         GCM_MUL(ctx,Xi);
1145         }
1146
1147         ctx->mres = n;
1148         return 0;
1149 }
1150
1151 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1152                 const unsigned char *in, unsigned char *out,
1153                 size_t len, ctr128_f stream)
1154 {
1155         const union { long one; char little; } is_endian = {1};
1156         unsigned int n, ctr;
1157         size_t i;
1158         u64 mlen = ctx->len.u[1];
1159
1160         mlen += len;
1161         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1162                 return -1;
1163         ctx->len.u[1] = mlen;
1164
1165         if (ctx->ares) {
1166                 /* First call to encrypt finalizes GHASH(AAD) */
1167                 GCM_MUL(ctx,Xi);
1168                 ctx->ares = 0;
1169         }
1170
1171         if (is_endian.little)
1172                 ctr = GETU32(ctx->Yi.c+12);
1173         else
1174                 ctr = ctx->Yi.d[3];
1175
1176         n = ctx->mres;
1177         if (n) {
1178                 while (n && len) {
1179                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1180                         --len;
1181                         n = (n+1)%16;
1182                 }
1183                 if (n==0) GCM_MUL(ctx,Xi);
1184                 else {
1185                         ctx->mres = n;
1186                         return 0;
1187                 }
1188         }
1189 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1190         while (len>=GHASH_CHUNK) {
1191                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1192                 ctr += GHASH_CHUNK/16;
1193                 if (is_endian.little)
1194                         PUTU32(ctx->Yi.c+12,ctr);
1195                 else
1196                         ctx->Yi.d[3] = ctr;
1197                 GHASH(ctx,out,GHASH_CHUNK);
1198                 out += GHASH_CHUNK;
1199                 in  += GHASH_CHUNK;
1200                 len -= GHASH_CHUNK;
1201         }
1202 #endif
1203         if ((i = (len&(size_t)-16))) {
1204                 size_t j=i/16;
1205
1206                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1207                 ctr += (unsigned int)j;
1208                 if (is_endian.little)
1209                         PUTU32(ctx->Yi.c+12,ctr);
1210                 else
1211                         ctx->Yi.d[3] = ctr;
1212                 in  += i;
1213                 len -= i;
1214 #if defined(GHASH)
1215                 GHASH(ctx,out,i);
1216                 out += i;
1217 #else
1218                 while (j--) {
1219                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1220                         GCM_MUL(ctx,Xi);
1221                         out += 16;
1222                 }
1223 #endif
1224         }
1225         if (len) {
1226                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1227                 ++ctr;
1228                 if (is_endian.little)
1229                         PUTU32(ctx->Yi.c+12,ctr);
1230                 else
1231                         ctx->Yi.d[3] = ctr;
1232                 while (len--) {
1233                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1234                         ++n;
1235                 }
1236         }
1237
1238         ctx->mres = n;
1239         return 0;
1240 }
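
/*
 * The *_ctr32 entry points hand whole runs of blocks to a ctr128_f routine
 * (typically an assembler counter-mode cipher) and maintain only the low
 * 32 bits of the counter here, which is why ctr is written back into
 * Yi.c[12..15] after every bulk call.
 */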
1241
1242 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1243                 const unsigned char *in, unsigned char *out,
1244                 size_t len,ctr128_f stream)
1245 {
1246         const union { long one; char little; } is_endian = {1};
1247         unsigned int n, ctr;
1248         size_t i;
1249         u64 mlen = ctx->len.u[1];
1250
1251         mlen += len;
1252         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1253                 return -1;
1254         ctx->len.u[1] = mlen;
1255
1256         if (ctx->ares) {
1257                 /* First call to decrypt finalizes GHASH(AAD) */
1258                 GCM_MUL(ctx,Xi);
1259                 ctx->ares = 0;
1260         }
1261
1262         if (is_endian.little)
1263                 ctr = GETU32(ctx->Yi.c+12);
1264         else
1265                 ctr = ctx->Yi.d[3];
1266
1267         n = ctx->mres;
1268         if (n) {
1269                 while (n && len) {
1270                         u8 c = *(in++);
1271                         *(out++) = c^ctx->EKi.c[n];
1272                         ctx->Xi.c[n] ^= c;
1273                         --len;
1274                         n = (n+1)%16;
1275                 }
1276                 if (n==0) GCM_MUL (ctx,Xi);
1277                 else {
1278                         ctx->mres = n;
1279                         return 0;
1280                 }
1281         }
1282 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1283         while (len>=GHASH_CHUNK) {
1284                 GHASH(ctx,in,GHASH_CHUNK);
1285                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1286                 ctr += GHASH_CHUNK/16;
1287                 if (is_endian.little)
1288                         PUTU32(ctx->Yi.c+12,ctr);
1289                 else
1290                         ctx->Yi.d[3] = ctr;
1291                 out += GHASH_CHUNK;
1292                 in  += GHASH_CHUNK;
1293                 len -= GHASH_CHUNK;
1294         }
1295 #endif
1296         if ((i = (len&(size_t)-16))) {
1297                 size_t j=i/16;
1298
1299 #if defined(GHASH)
1300                 GHASH(ctx,in,i);
1301 #else
1302                 while (j--) {
1303                         size_t k;
1304                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1305                         GCM_MUL(ctx,Xi);
1306                         in += 16;
1307                 }
1308                 j   = i/16;
1309                 in -= i;
1310 #endif
1311                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1312                 ctr += (unsigned int)j;
1313                 if (is_endian.little)
1314                         PUTU32(ctx->Yi.c+12,ctr);
1315                 else
1316                         ctx->Yi.d[3] = ctr;
1317                 out += i;
1318                 in  += i;
1319                 len -= i;
1320         }
1321         if (len) {
1322                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1323                 ++ctr;
1324                 if (is_endian.little)
1325                         PUTU32(ctx->Yi.c+12,ctr);
1326                 else
1327                         ctx->Yi.d[3] = ctr;
1328                 while (len--) {
1329                         u8 c = in[n];
1330                         ctx->Xi.c[n] ^= c;
1331                         out[n] = c^ctx->EKi.c[n];
1332                         ++n;
1333                 }
1334         }
1335
1336         ctx->mres = n;
1337         return 0;
1338 }
1339
1340 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1341                         size_t len)
1342 {
1343         const union { long one; char little; } is_endian = {1};
1344         u64 alen = ctx->len.u[0]<<3;
1345         u64 clen = ctx->len.u[1]<<3;
1346
1347         if (ctx->mres)
1348                 GCM_MUL(ctx,Xi);
1349
1350         if (is_endian.little) {
1351 #ifdef BSWAP8
1352                 alen = BSWAP8(alen);
1353                 clen = BSWAP8(clen);
1354 #else
1355                 u8 *p = ctx->len.c;
1356
1357                 ctx->len.u[0] = alen;
1358                 ctx->len.u[1] = clen;
1359
1360                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1361                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1362 #endif
1363         }
1364
1365         ctx->Xi.u[0] ^= alen;
1366         ctx->Xi.u[1] ^= clen;
1367         GCM_MUL(ctx,Xi);
1368
1369         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1370         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1371
1372         if (tag && len<=sizeof(ctx->Xi))
1373                 return memcmp(ctx->Xi.c,tag,len);
1374         else
1375                 return -1;
1376 }
1377
1378 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1379 {
1380         CRYPTO_gcm128_finish(ctx, NULL, 0);
1381         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1382 }
1383
1384 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1385 {
1386         GCM128_CONTEXT *ret;
1387
1388         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1389                 CRYPTO_gcm128_init(ret,key,block);
1390
1391         return ret;
1392 }
1393
1394 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1395 {
1396         if (ctx) {
1397                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1398                 OPENSSL_free(ctx);
1399         }
1400 }
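
/*
 * A minimal usage sketch of the API above, in the spirit of the SELFTEST
 * section below; AES is assumed as the block cipher, error handling is
 * omitted, and key/iv/aad/buffers are assumed to be supplied by the caller:
 *
 *      #include <openssl/aes.h>
 *
 *      AES_KEY aes;
 *      GCM128_CONTEXT ctx;
 *
 *      AES_set_encrypt_key(key, 128, &aes);
 *      CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
 *      CRYPTO_gcm128_setiv(&ctx, iv, sizeof(iv));
 *      CRYPTO_gcm128_aad(&ctx, aad, sizeof(aad));
 *      CRYPTO_gcm128_encrypt(&ctx, plaintext, ciphertext, sizeof(plaintext));
 *      CRYPTO_gcm128_tag(&ctx, tag, 16);
 *
 * A receiver runs the same sequence with CRYPTO_gcm128_decrypt() and then
 * calls CRYPTO_gcm128_finish(&ctx, tag, 16), which returns 0 only when the
 * computed tag matches the one supplied.
 */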
1401
1402 #if defined(SELFTEST)
1403 #include <stdio.h>
1404 #include <openssl/aes.h>
1405
1406 /* Test Case 1 */
1407 static const u8 K1[16],
1408                 *P1=NULL,
1409                 *A1=NULL,
1410                 IV1[12],
1411                 *C1=NULL,
1412                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1413
1414 /* Test Case 2 */
1415 #define K2 K1
1416 #define A2 A1
1417 #define IV2 IV1
1418 static const u8 P2[16],
1419                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1420                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1421
1422 /* Test Case 3 */
1423 #define A3 A2
1424 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1425                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1426                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1427                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1428                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1429                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1430                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1431                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1432                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1433                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1434                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1435
1436 /* Test Case 4 */
1437 #define K4 K3
1438 #define IV4 IV3
1439 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1440                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1441                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1442                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1443                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1444                         0xab,0xad,0xda,0xd2},
1445                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1446                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1447                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1448                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1449                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1450
1451 /* Test Case 5 */
1452 #define K5 K4
1453 #define P5 P4
1454 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1455                         0xab,0xad,0xda,0xd2},
1456                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1457                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1458                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1459                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1460                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1461                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1462
1463 /* Test Case 6 */
1464 #define K6 K5
1465 #define P6 P5
1466 #define A6 A5
1467 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1468                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1469                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1470                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1471                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1472                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1473                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1474                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1475                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1476
1477 /* Test Case 7 */
1478 static const u8 K7[24],
1479                 *P7=NULL,
1480                 *A7=NULL,
1481                 IV7[12],
1482                 *C7=NULL,
1483                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1484
1485 /* Test Case 8 */
1486 #define K8 K7
1487 #define IV8 IV7
1488 #define A8 A7
1489 static const u8 P8[16],
1490                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1491                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1492
1493 /* Test Case 9 */
1494 #define A9 A8
1495 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1496                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1497                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1498                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1499                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1500                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1501                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1502                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1503                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1504                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1505                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1506                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1507
1508 /* Test Case 10 */
1509 #define K10 K9
1510 #define IV10 IV9
1511 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1512                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1513                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1514                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1515                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1516                         0xab,0xad,0xda,0xd2},
1517                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1518                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1519                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1520                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1521                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1522
1523 /* Test Case 11 */
1524 #define K11 K10
1525 #define P11 P10
1526 #define A11 A10
1527 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1528                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1529                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1530                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1531                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1532                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1533
1534 /* Test Case 12 */
1535 #define K12 K11
1536 #define P12 P11
1537 #define A12 A11
1538 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1539                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1540                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1541                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1542                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1543                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1544                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1545                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1546                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1547
1548 /* Test Case 13 */
1549 static const u8 K13[32],
1550                 *P13=NULL,
1551                 *A13=NULL,
1552                 IV13[12],
1553                 *C13=NULL,
1554                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1555
1556 /* Test Case 14 */
1557 #define K14 K13
1558 #define A14 A13
1559 static const u8 P14[16],
1560                 IV14[12],
1561                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1562                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1563
1564 /* Test Case 15 */
1565 #define A15 A14
1566 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1567                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1568                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1569                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1570                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1571                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1572                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1573                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1574                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1575                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1576                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1577                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1578
1579 /* Test Case 16 */
1580 #define K16 K15
1581 #define IV16 IV15
1582 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1583                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1584                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1585                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1586                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1587                         0xab,0xad,0xda,0xd2},
1588                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1589                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1590                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1591                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1592                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1593
1594 /* Test Case 17 */
1595 #define K17 K16
1596 #define P17 P16
1597 #define A17 A16
1598 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1599                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1600                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1601                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1602                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1603                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1604
1605 /* Test Case 18 */
1606 #define K18 K17
1607 #define P18 P17
1608 #define A18 A17
1609 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1610                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1611                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1612                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1613                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1614                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1615                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1616                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1617                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1618
1619 #define TEST_CASE(n)    do {                                    \
1620         u8 out[sizeof(P##n)];                                   \
1621         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1622         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1623         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1624         memset(out,0,sizeof(out));                              \
1625         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1626         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1627         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1628             (C##n && memcmp(out,C##n,sizeof(out))))             \
1629                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1630         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1631         memset(out,0,sizeof(out));                              \
1632         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1633         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1634         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1635             (P##n && memcmp(out,P##n,sizeof(out))))             \
1636                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1637         } while(0)
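/*
 * TEST_CASE(n) exercises both directions for vector set n: it encrypts
 * P<n> and checks the ciphertext and tag against C<n>/T<n>, then decrypts
 * C<n> and checks the recovered plaintext and tag, bumping |ret| and
 * printing a diagnostic on any mismatch.
 */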
1638
1639 int main()
1640 {
1641         GCM128_CONTEXT ctx;
1642         AES_KEY key;
1643         int ret=0;
1644
1645         TEST_CASE(1);
1646         TEST_CASE(2);
1647         TEST_CASE(3);
1648         TEST_CASE(4);
1649         TEST_CASE(5);
1650         TEST_CASE(6);
1651         TEST_CASE(7);
1652         TEST_CASE(8);
1653         TEST_CASE(9);
1654         TEST_CASE(10);
1655         TEST_CASE(11);
1656         TEST_CASE(12);
1657         TEST_CASE(13);
1658         TEST_CASE(14);
1659         TEST_CASE(15);
1660         TEST_CASE(16);
1661         TEST_CASE(17);
1662         TEST_CASE(18);
1663
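/*
 * When OPENSSL_CPUID_OBJ is defined, a rough cycles-per-byte benchmark
 * follows: it times CRYPTO_gcm128_encrypt() and plain CRYPTO_ctr128_encrypt()
 * over a 1KB buffer with OPENSSL_rdtsc() and prints both figures and their
 * difference (i.e. the approximate GHASH overhead), then, if a GHASH macro
 * is available, the cost of GHASH alone averaged over 100 iterations.
 */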
1664 #ifdef OPENSSL_CPUID_OBJ
1665         {
1666         size_t start,gcm_t,ctr_t,OPENSSL_rdtsc();
1667         union { u64 u; u8 c[1024]; } buf;
1668         int i;
1669
1670         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1671         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1672         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1673
1674         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1675         start = OPENSSL_rdtsc();
1676         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1677         gcm_t = OPENSSL_rdtsc() - start;
1678
1679         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1680                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1681                         (block128_f)AES_encrypt);
1682         start = OPENSSL_rdtsc();
1683         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1684                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1685                         (block128_f)AES_encrypt);
1686         ctr_t = OPENSSL_rdtsc() - start;
1687
1688         printf("%.2f-%.2f=%.2f\n",
1689                         gcm_t/(double)sizeof(buf),
1690                         ctr_t/(double)sizeof(buf),
1691                         (gcm_t-ctr_t)/(double)sizeof(buf));
1692 #ifdef GHASH
1693         GHASH(&ctx,buf.c,sizeof(buf));
1694         start = OPENSSL_rdtsc();
1695         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1696         gcm_t = OPENSSL_rdtsc() - start;
1697         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1698 #endif
1699         }
1700 #endif
1701
1702         return ret;
1703 }
1704 #endif