gcm128.c: tidy up, minor optimization, rearrange gcm128_context.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/*
 * Replace the GETU32/PUTU32 helpers with a single 32-bit access combined
 * with BSWAP4.  The original note here states that this is legitimate
 * because alignment is ensured.
 *
 * PUTU32 is wrapped in parentheses so that it always expands to one
 * self-contained expression, whatever context it is used in.
 */
#undef  GETU32
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v)     (*(u32 *)(p) = BSWAP4(v))
#endif
70
/*
 * PACK(s) moves the 16-bit constant s into the most significant 16 bits
 * of a size_t, whatever the width of size_t is.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V) shifts the 128-bit value V (members .hi and .lo) right by
 * one bit.  If the bit shifted out of V.lo was set, 0xe1 is XORed into the
 * most significant byte of V.hi.  The two branches compute the same result;
 * the second one builds the mask in a 32-bit temporary for targets where
 * size_t is not 8 bytes wide.
 *
 * V is expanded several times, so it must be an lvalue expression without
 * side effects.  Every use is parenthesized, so expressions such as *p or
 * tab[i] are accepted.  The macro declares a local named T, so the argument
 * must not itself refer to an identifier T.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         static const size_t rem_8bit[256] = {
152                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
216
217         while (1) {
218                 Z.hi ^= Htable[n].hi;
219                 Z.lo ^= Htable[n].lo;
220
221                 if ((u8 *)Xi==xi)       break;
222
223                 n = *(--xi);
224
225                 rem  = (size_t)Z.lo&0xff;
226                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
227                 Z.hi = (Z.hi>>8);
228                 if (sizeof(size_t)==8)
229                         Z.hi ^= rem_8bit[rem];
230                 else
231                         Z.hi ^= (u64)rem_8bit[rem]<<32;
232         }
233
234         if (is_endian.little) {
235 #ifdef BSWAP8
236                 Xi[0] = BSWAP8(Z.hi);
237                 Xi[1] = BSWAP8(Z.lo);
238 #else
239                 u8 *p = (u8 *)Xi;
240                 u32 v;
241                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
242                 v = (u32)(Z.hi);        PUTU32(p+4,v);
243                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
244                 v = (u32)(Z.lo);        PUTU32(p+12,v);
245 #endif
246         }
247         else {
248                 Xi[0] = Z.hi;
249                 Xi[1] = Z.lo;
250         }
251 }
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
257 {
258         u128 V;
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         int  i;
261 #endif
262
263         Htable[0].hi = 0;
264         Htable[0].lo = 0;
265         V.hi = H[0];
266         V.lo = H[1];
267
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269         for (Htable[8]=V, i=4; i>0; i>>=1) {
270                 REDUCE1BIT(V);
271                 Htable[i] = V;
272         }
273
274         for (i=2; i<16; i<<=1) {
275                 u128 *Hi = Htable+i;
276                 int   j;
277                 for (V=*Hi, j=1; j<i; ++j) {
278                         Hi[j].hi = V.hi^Htable[j].hi;
279                         Hi[j].lo = V.lo^Htable[j].lo;
280                 }
281         }
282 #else
283         Htable[8] = V;
284         REDUCE1BIT(V);
285         Htable[4] = V;
286         REDUCE1BIT(V);
287         Htable[2] = V;
288         REDUCE1BIT(V);
289         Htable[1] = V;
290         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
291         V=Htable[4];
292         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
293         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
294         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
295         V=Htable[8];
296         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
297         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
303 #endif
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
305         /*
306          * ARM assembler expects specific dword order in Htable.
307          */
308         {
309         int j;
310         const union { long one; char little; } is_endian = {1};
311
312         if (is_endian.little)
313                 for (j=0;j<16;++j) {
314                         V = Htable[j];
315                         Htable[j].hi = V.lo;
316                         Htable[j].lo = V.hi;
317                 }
318         else
319                 for (j=0;j<16;++j) {
320                         V = Htable[j];
321                         Htable[j].hi = V.lo<<32|V.lo>>32;
322                         Htable[j].lo = V.hi<<32|V.hi>>32;
323                 }
324         }
325 #endif
326 }
327
328 #ifndef GHASH_ASM
329 static const size_t rem_4bit[16] = {
330         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
336 {
337         u128 Z;
338         int cnt = 15;
339         size_t rem, nlo, nhi;
340         const union { long one; char little; } is_endian = {1};
341
342         nlo  = ((const u8 *)Xi)[15];
343         nhi  = nlo>>4;
344         nlo &= 0xf;
345
346         Z.hi = Htable[nlo].hi;
347         Z.lo = Htable[nlo].lo;
348
349         while (1) {
350                 rem  = (size_t)Z.lo&0xf;
351                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
352                 Z.hi = (Z.hi>>4);
353                 if (sizeof(size_t)==8)
354                         Z.hi ^= rem_4bit[rem];
355                 else
356                         Z.hi ^= (u64)rem_4bit[rem]<<32;
357
358                 Z.hi ^= Htable[nhi].hi;
359                 Z.lo ^= Htable[nhi].lo;
360
361                 if (--cnt<0)            break;
362
363                 nlo  = ((const u8 *)Xi)[cnt];
364                 nhi  = nlo>>4;
365                 nlo &= 0xf;
366
367                 rem  = (size_t)Z.lo&0xf;
368                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
369                 Z.hi = (Z.hi>>4);
370                 if (sizeof(size_t)==8)
371                         Z.hi ^= rem_4bit[rem];
372                 else
373                         Z.hi ^= (u64)rem_4bit[rem]<<32;
374
375                 Z.hi ^= Htable[nlo].hi;
376                 Z.lo ^= Htable[nlo].lo;
377         }
378
379         if (is_endian.little) {
380 #ifdef BSWAP8
381                 Xi[0] = BSWAP8(Z.hi);
382                 Xi[1] = BSWAP8(Z.lo);
383 #else
384                 u8 *p = (u8 *)Xi;
385                 u32 v;
386                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
387                 v = (u32)(Z.hi);        PUTU32(p+4,v);
388                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
389                 v = (u32)(Z.lo);        PUTU32(p+12,v);
390 #endif
391         }
392         else {
393                 Xi[0] = Z.hi;
394                 Xi[1] = Z.lo;
395         }
396 }
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as a reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407                                 const u8 *inp,size_t len)
408 {
409     u128 Z;
410     int cnt;
411     size_t rem, nlo, nhi;
412     const union { long one; char little; } is_endian = {1};
413
414 #if 1
415     do {
416         cnt  = 15;
417         nlo  = ((const u8 *)Xi)[15];
418         nlo ^= inp[15];
419         nhi  = nlo>>4;
420         nlo &= 0xf;
421
422         Z.hi = Htable[nlo].hi;
423         Z.lo = Htable[nlo].lo;
424
425         while (1) {
426                 rem  = (size_t)Z.lo&0xf;
427                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
428                 Z.hi = (Z.hi>>4);
429                 if (sizeof(size_t)==8)
430                         Z.hi ^= rem_4bit[rem];
431                 else
432                         Z.hi ^= (u64)rem_4bit[rem]<<32;
433
434                 Z.hi ^= Htable[nhi].hi;
435                 Z.lo ^= Htable[nhi].lo;
436
437                 if (--cnt<0)            break;
438
439                 nlo  = ((const u8 *)Xi)[cnt];
440                 nlo ^= inp[cnt];
441                 nhi  = nlo>>4;
442                 nlo &= 0xf;
443
444                 rem  = (size_t)Z.lo&0xf;
445                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
446                 Z.hi = (Z.hi>>4);
447                 if (sizeof(size_t)==8)
448                         Z.hi ^= rem_4bit[rem];
449                 else
450                         Z.hi ^= (u64)rem_4bit[rem]<<32;
451
452                 Z.hi ^= Htable[nlo].hi;
453                 Z.lo ^= Htable[nlo].lo;
454         }
455 #else
456     /*
457      * Extra 256+16 bytes per-key plus 512 bytes shared tables
458      * [should] give ~50% improvement... One could have PACK()-ed
459      * the rem_8bit even here, but the priority is to minimize
460      * cache footprint...
461      */ 
462     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
463     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
464     static const unsigned short rem_8bit[256] = {
465         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
497     /*
498      * This pre-processing phase slows down procedure by approximately
499      * same time as it makes each loop spin faster. In other words
500      * single block performance is approximately same as straightforward
501      * "4-bit" implementation, and then it goes only faster...
502      */
503     for (cnt=0; cnt<16; ++cnt) {
504         Z.hi = Htable[cnt].hi;
505         Z.lo = Htable[cnt].lo;
506         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507         Hshr4[cnt].hi = (Z.hi>>4);
508         Hshl4[cnt]    = (u8)(Z.lo<<4);
509     }
510
511     do {
512         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513                 nlo  = ((const u8 *)Xi)[cnt];
514                 nlo ^= inp[cnt];
515                 nhi  = nlo>>4;
516                 nlo &= 0xf;
517
518                 Z.hi ^= Htable[nlo].hi;
519                 Z.lo ^= Htable[nlo].lo;
520
521                 rem = (size_t)Z.lo&0xff;
522
523                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
524                 Z.hi = (Z.hi>>8);
525
526                 Z.hi ^= Hshr4[nhi].hi;
527                 Z.lo ^= Hshr4[nhi].lo;
528                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
529         }
530
531         nlo  = ((const u8 *)Xi)[0];
532         nlo ^= inp[0];
533         nhi  = nlo>>4;
534         nlo &= 0xf;
535
536         Z.hi ^= Htable[nlo].hi;
537         Z.lo ^= Htable[nlo].lo;
538
539         rem = (size_t)Z.lo&0xf;
540
541         Z.lo = (Z.hi<<60)|(Z.lo>>4);
542         Z.hi = (Z.hi>>4);
543
544         Z.hi ^= Htable[nhi].hi;
545         Z.lo ^= Htable[nhi].lo;
546         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
547 #endif
548
549         if (is_endian.little) {
550 #ifdef BSWAP8
551                 Xi[0] = BSWAP8(Z.hi);
552                 Xi[1] = BSWAP8(Z.lo);
553 #else
554                 u8 *p = (u8 *)Xi;
555                 u32 v;
556                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
557                 v = (u32)(Z.hi);        PUTU32(p+4,v);
558                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
559                 v = (u32)(Z.lo);        PUTU32(p+12,v);
560 #endif
561         }
562         else {
563                 Xi[0] = Z.hi;
564                 Xi[1] = Z.lo;
565         }
566     } while (inp+=16, len-=16);
567 }
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
578  * thrashing effect. In other words, the idea is to hash data while it's
579  * still in L1 cache after the encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582
583 #else   /* TABLE_BITS */
584
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
586 {
587         u128 V,Z = { 0,0 };
588         long X;
589         int  i,j;
590         const long *xi = (const long *)Xi;
591         const union { long one; char little; } is_endian = {1};
592
593         V.hi = H[0];    /* H is in host byte order, no byte swapping */
594         V.lo = H[1];
595
596         for (j=0; j<16/sizeof(long); ++j) {
597                 if (is_endian.little) {
598                         if (sizeof(long)==8) {
599 #ifdef BSWAP8
600                                 X = (long)(BSWAP8(xi[j]));
601 #else
602                                 const u8 *p = (const u8 *)(xi+j);
603                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
604 #endif
605                         }
606                         else {
607                                 const u8 *p = (const u8 *)(xi+j);
608                                 X = (long)GETU32(p);
609                         }
610                 }
611                 else
612                         X = xi[j];
613
614                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615                         u64 M = (u64)(X>>(8*sizeof(long)-1));
616                         Z.hi ^= V.hi&M;
617                         Z.lo ^= V.lo&M;
618
619                         REDUCE1BIT(V);
620                 }
621         }
622
623         if (is_endian.little) {
624 #ifdef BSWAP8
625                 Xi[0] = BSWAP8(Z.hi);
626                 Xi[1] = BSWAP8(Z.lo);
627 #else
628                 u8 *p = (u8 *)Xi;
629                 u32 v;
630                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
631                 v = (u32)(Z.hi);        PUTU32(p+4,v);
632                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
633                 v = (u32)(Z.lo);        PUTU32(p+12,v);
634 #endif
635         }
636         else {
637                 Xi[0] = Z.hi;
638                 Xi[1] = Z.lo;
639         }
640 }
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
645 #if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
646         (defined(__i386)        || defined(__i386__)    || \
647          defined(__x86_64)      || defined(__x86_64__)  || \
648          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
649 # define GHASH_ASM_X86_OR_64
650 extern unsigned int OPENSSL_ia32cap_P[2];
651
652 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
653 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
654 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
655
656 # if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
657 #  define GHASH_ASM_X86
658 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
662 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
663 # endif
664
665 # define GCM_FUNCREF_4BIT
666 #endif
667
668 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
669 {
670         const union { long one; char little; } is_endian = {1};
671
672         memset(ctx,0,sizeof(*ctx));
673         ctx->block = block;
674         ctx->key   = key;
675
676         (*block)(ctx->H.c,ctx->H.c,key);
677
678         if (is_endian.little) {
679                 /* H is stored in host byte order */
680 #ifdef BSWAP8
681                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
682                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
683 #else
684                 u8 *p = ctx->H.c;
685                 u64 hi,lo;
686                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
687                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
688                 ctx->H.u[0] = hi;
689                 ctx->H.u[1] = lo;
690 #endif
691         }
692
693 #if     TABLE_BITS==8
694         gcm_init_8bit(ctx->Htable,ctx->H.u);
695 #elif   TABLE_BITS==4
696 # if    defined(GHASH_ASM_X86_OR_64)
697 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
698         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
699                 gcm_init_clmul(ctx->Htable,ctx->H.u);
700                 ctx->gmult = gcm_gmult_clmul;
701                 ctx->ghash = gcm_ghash_clmul;
702                 return;
703         }
704 #  endif
705         gcm_init_4bit(ctx->Htable,ctx->H.u);
706 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
707         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
708                 ctx->gmult = gcm_gmult_4bit_mmx;
709                 ctx->ghash = gcm_ghash_4bit_mmx;
710         } else {
711                 ctx->gmult = gcm_gmult_4bit_x86;
712                 ctx->ghash = gcm_ghash_4bit_x86;
713         }
714 #  else
715         ctx->gmult = gcm_gmult_4bit;
716         ctx->ghash = gcm_ghash_4bit;
717 #  endif
718 # else
719         gcm_init_4bit(ctx->Htable,ctx->H.u);
720 # endif
721 #endif
722 }
723
724 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
725 {
726         const union { long one; char little; } is_endian = {1};
727         unsigned int ctr;
728 #ifdef GCM_FUNCREF_4BIT
729         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
730 #endif
731
732         ctx->Yi.u[0]  = 0;
733         ctx->Yi.u[1]  = 0;
734         ctx->Xi.u[0]  = 0;
735         ctx->Xi.u[1]  = 0;
736         ctx->len.u[0] = 0;      /* AAD length */
737         ctx->len.u[1] = 0;      /* message length */
738         ctx->ares = 0;
739         ctx->mres = 0;
740
741         if (len==12) {
742                 memcpy(ctx->Yi.c,iv,12);
743                 ctx->Yi.c[15]=1;
744                 ctr=1;
745         }
746         else {
747                 size_t i;
748                 u64 len0 = len;
749
750                 while (len>=16) {
751                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
752                         GCM_MUL(ctx,Yi);
753                         iv += 16;
754                         len -= 16;
755                 }
756                 if (len) {
757                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
758                         GCM_MUL(ctx,Yi);
759                 }
760                 len0 <<= 3;
761                 if (is_endian.little) {
762 #ifdef BSWAP8
763                         ctx->Yi.u[1]  ^= BSWAP8(len0);
764 #else
765                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
766                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
767                         ctx->Yi.c[10] ^= (u8)(len0>>40);
768                         ctx->Yi.c[11] ^= (u8)(len0>>32);
769                         ctx->Yi.c[12] ^= (u8)(len0>>24);
770                         ctx->Yi.c[13] ^= (u8)(len0>>16);
771                         ctx->Yi.c[14] ^= (u8)(len0>>8);
772                         ctx->Yi.c[15] ^= (u8)(len0);
773 #endif
774                 }
775                 else
776                         ctx->Yi.u[1]  ^= len0;
777
778                 GCM_MUL(ctx,Yi);
779
780                 if (is_endian.little)
781                         ctr = GETU32(ctx->Yi.c+12);
782                 else
783                         ctr = ctx->Yi.d[3];
784         }
785
786         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
787         ++ctr;
788         if (is_endian.little)
789                 PUTU32(ctx->Yi.c+12,ctr);
790         else
791                 ctx->Yi.d[3] = ctr;
792 }
793
794 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
795 {
796         size_t i;
797         unsigned int n;
798         u64 alen = ctx->len.u[0];
799 #ifdef GCM_FUNCREF_4BIT
800         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
801 # ifdef GHASH
802         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
803                                 const u8 *inp,size_t len) = ctx->ghash;
804 # endif
805 #endif
806
807         if (ctx->len.u[1]) return -2;
808
809         alen += len;
810         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
811                 return -1;
812         ctx->len.u[0] = alen;
813
814         n = ctx->ares;
815         if (n) {
816                 while (n && len) {
817                         ctx->Xi.c[n] ^= *(aad++);
818                         --len;
819                         n = (n+1)%16;
820                 }
821                 if (n==0) GCM_MUL(ctx,Xi);
822                 else {
823                         ctx->ares = n;
824                         return 0;
825                 }
826         }
827
828 #ifdef GHASH
829         if ((i = (len&(size_t)-16))) {
830                 GHASH(ctx,aad,i);
831                 aad += i;
832                 len -= i;
833         }
834 #else
835         while (len>=16) {
836                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
837                 GCM_MUL(ctx,Xi);
838                 aad += 16;
839                 len -= 16;
840         }
841 #endif
842         if (len) {
843                 n = (unsigned int)len;
844                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
845         }
846
847         ctx->ares = n;
848         return 0;
849 }
850
851 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
852                 const unsigned char *in, unsigned char *out,
853                 size_t len)
854 {
855         const union { long one; char little; } is_endian = {1};
856         unsigned int n, ctr;
857         size_t i;
858         u64 mlen = ctx->len.u[1];
859 #ifdef GCM_FUNCREF_4BIT
860         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
861 # ifdef GHASH
862         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
863                                 const u8 *inp,size_t len) = ctx->ghash;
864 # endif
865 #endif
866
867 #if 0
868         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
869 #endif
870         mlen += len;
871         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
872                 return -1;
873         ctx->len.u[1] = mlen;
874
875         if (ctx->ares) {
876                 /* First call to encrypt finalizes GHASH(AAD) */
877                 GCM_MUL(ctx,Xi);
878                 ctx->ares = 0;
879         }
880
881         if (is_endian.little)
882                 ctr = GETU32(ctx->Yi.c+12);
883         else
884                 ctr = ctx->Yi.d[3];
885
886         n = ctx->mres;
887 #if !defined(OPENSSL_SMALL_FOOTPRINT)
888         if (16%sizeof(size_t) == 0) do {        /* always true actually */
889                 if (n) {
890                         while (n && len) {
891                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
892                                 --len;
893                                 n = (n+1)%16;
894                         }
895                         if (n==0) GCM_MUL(ctx,Xi);
896                         else {
897                                 ctx->mres = n;
898                                 return 0;
899                         }
900                 }
901 #if defined(STRICT_ALIGNMENT)
902                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
903                         break;
904 #endif
905 #if defined(GHASH) && defined(GHASH_CHUNK)
906                 while (len>=GHASH_CHUNK) {
907                     size_t j=GHASH_CHUNK;
908
909                     while (j) {
910                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
911                         ++ctr;
912                         if (is_endian.little)
913                                 PUTU32(ctx->Yi.c+12,ctr);
914                         else
915                                 ctx->Yi.d[3] = ctr;
916                         for (i=0; i<16; i+=sizeof(size_t))
917                                 *(size_t *)(out+i) =
918                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
919                         out += 16;
920                         in  += 16;
921                         j   -= 16;
922                     }
923                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
924                     len -= GHASH_CHUNK;
925                 }
926                 if ((i = (len&(size_t)-16))) {
927                     size_t j=i;
928
929                     while (len>=16) {
930                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
931                         ++ctr;
932                         if (is_endian.little)
933                                 PUTU32(ctx->Yi.c+12,ctr);
934                         else
935                                 ctx->Yi.d[3] = ctr;
936                         for (i=0; i<16; i+=sizeof(size_t))
937                                 *(size_t *)(out+i) =
938                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
939                         out += 16;
940                         in  += 16;
941                         len -= 16;
942                     }
943                     GHASH(ctx,out-j,j);
944                 }
945 #else
946                 while (len>=16) {
947                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
948                         ++ctr;
949                         if (is_endian.little)
950                                 PUTU32(ctx->Yi.c+12,ctr);
951                         else
952                                 ctx->Yi.d[3] = ctr;
953                         for (i=0; i<16; i+=sizeof(size_t))
954                                 *(size_t *)(ctx->Xi.c+i) ^=
955                                 *(size_t *)(out+i) =
956                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
957                         GCM_MUL(ctx,Xi);
958                         out += 16;
959                         in  += 16;
960                         len -= 16;
961                 }
962 #endif
963                 if (len) {
964                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
965                         ++ctr;
966                         if (is_endian.little)
967                                 PUTU32(ctx->Yi.c+12,ctr);
968                         else
969                                 ctx->Yi.d[3] = ctr;
970                         while (len--) {
971                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
972                                 ++n;
973                         }
974                 }
975
976                 ctx->mres = n;
977                 return 0;
978         } while(0);
979 #endif
980         for (i=0;i<len;++i) {
981                 if (n==0) {
982                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
983                         ++ctr;
984                         if (is_endian.little)
985                                 PUTU32(ctx->Yi.c+12,ctr);
986                         else
987                                 ctx->Yi.d[3] = ctr;
988                 }
989                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
990                 n = (n+1)%16;
991                 if (n==0)
992                         GCM_MUL(ctx,Xi);
993         }
994
995         ctx->mres = n;
996         return 0;
997 }
998
999 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1000                 const unsigned char *in, unsigned char *out,
1001                 size_t len)
1002 {
1003         const union { long one; char little; } is_endian = {1};
1004         unsigned int n, ctr;
1005         size_t i;
1006         u64 mlen = ctx->len.u[1];
1007 #ifdef GCM_FUNCREF_4BIT
1008         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1009 # ifdef GHASH
1010         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1011                                 const u8 *inp,size_t len) = ctx->ghash;
1012 # endif
1013 #endif
1014
1015         mlen += len;
1016         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1017                 return -1;
1018         ctx->len.u[1] = mlen;
1019
1020         if (ctx->ares) {
1021                 /* First call to decrypt finalizes GHASH(AAD) */
1022                 GCM_MUL(ctx,Xi);
1023                 ctx->ares = 0;
1024         }
1025
1026         if (is_endian.little)
1027                 ctr = GETU32(ctx->Yi.c+12);
1028         else
1029                 ctr = ctx->Yi.d[3];
1030
1031         n = ctx->mres;
1032 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1033         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1034                 if (n) {
1035                         while (n && len) {
1036                                 u8 c = *(in++);
1037                                 *(out++) = c^ctx->EKi.c[n];
1038                                 ctx->Xi.c[n] ^= c;
1039                                 --len;
1040                                 n = (n+1)%16;
1041                         }
1042                         if (n==0) GCM_MUL (ctx,Xi);
1043                         else {
1044                                 ctx->mres = n;
1045                                 return 0;
1046                         }
1047                 }
1048 #if defined(STRICT_ALIGNMENT)
1049                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1050                         break;
1051 #endif
1052 #if defined(GHASH) && defined(GHASH_CHUNK)
1053                 while (len>=GHASH_CHUNK) {
1054                     size_t j=GHASH_CHUNK;
1055
1056                     GHASH(ctx,in,GHASH_CHUNK);
1057                     while (j) {
1058                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1059                         ++ctr;
1060                         if (is_endian.little)
1061                                 PUTU32(ctx->Yi.c+12,ctr);
1062                         else
1063                                 ctx->Yi.d[3] = ctr;
1064                         for (i=0; i<16; i+=sizeof(size_t))
1065                                 *(size_t *)(out+i) =
1066                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1067                         out += 16;
1068                         in  += 16;
1069                         j   -= 16;
1070                     }
1071                     len -= GHASH_CHUNK;
1072                 }
1073                 if ((i = (len&(size_t)-16))) {
1074                     GHASH(ctx,in,i);
1075                     while (len>=16) {
1076                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1077                         ++ctr;
1078                         if (is_endian.little)
1079                                 PUTU32(ctx->Yi.c+12,ctr);
1080                         else
1081                                 ctx->Yi.d[3] = ctr;
1082                         for (i=0; i<16; i+=sizeof(size_t))
1083                                 *(size_t *)(out+i) =
1084                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1085                         out += 16;
1086                         in  += 16;
1087                         len -= 16;
1088                     }
1089                 }
1090 #else
1091                 while (len>=16) {
1092                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1093                         ++ctr;
1094                         if (is_endian.little)
1095                                 PUTU32(ctx->Yi.c+12,ctr);
1096                         else
1097                                 ctx->Yi.d[3] = ctr;
1098                         for (i=0; i<16; i+=sizeof(size_t)) {
1099                                 size_t c = *(size_t *)(in+i);
1100                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1101                                 *(size_t *)(ctx->Xi.c+i) ^= c;
1102                         }
1103                         GCM_MUL(ctx,Xi);
1104                         out += 16;
1105                         in  += 16;
1106                         len -= 16;
1107                 }
1108 #endif
1109                 if (len) {
1110                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1111                         ++ctr;
1112                         if (is_endian.little)
1113                                 PUTU32(ctx->Yi.c+12,ctr);
1114                         else
1115                                 ctx->Yi.d[3] = ctr;
1116                         while (len--) {
1117                                 u8 c = in[n];
1118                                 ctx->Xi.c[n] ^= c;
1119                                 out[n] = c^ctx->EKi.c[n];
1120                                 ++n;
1121                         }
1122                 }
1123
1124                 ctx->mres = n;
1125                 return 0;
1126         } while(0);
1127 #endif
1128         for (i=0;i<len;++i) {
1129                 u8 c;
1130                 if (n==0) {
1131                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1132                         ++ctr;
1133                         if (is_endian.little)
1134                                 PUTU32(ctx->Yi.c+12,ctr);
1135                         else
1136                                 ctx->Yi.d[3] = ctr;
1137                 }
1138                 c = in[i];
1139                 out[i] = c^ctx->EKi.c[n];
1140                 ctx->Xi.c[n] ^= c;
1141                 n = (n+1)%16;
1142                 if (n==0)
1143                         GCM_MUL(ctx,Xi);
1144         }
1145
1146         ctx->mres = n;
1147         return 0;
1148 }
1149
1150 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1151                 const unsigned char *in, unsigned char *out,
1152                 size_t len, ctr128_f stream)
1153 {
1154         const union { long one; char little; } is_endian = {1};
1155         unsigned int n, ctr;
1156         size_t i;
1157         u64 mlen = ctx->len.u[1];
1158 #ifdef GCM_FUNCREF_4BIT
1159         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1160 # ifdef GHASH
1161         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1162                                 const u8 *inp,size_t len) = ctx->ghash;
1163 # endif
1164 #endif
1165
1166         mlen += len;
1167         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1168                 return -1;
1169         ctx->len.u[1] = mlen;
1170
1171         if (ctx->ares) {
1172                 /* First call to encrypt finalizes GHASH(AAD) */
1173                 GCM_MUL(ctx,Xi);
1174                 ctx->ares = 0;
1175         }
1176
1177         if (is_endian.little)
1178                 ctr = GETU32(ctx->Yi.c+12);
1179         else
1180                 ctr = ctx->Yi.d[3];
1181
1182         n = ctx->mres;
1183         if (n) {
1184                 while (n && len) {
1185                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1186                         --len;
1187                         n = (n+1)%16;
1188                 }
1189                 if (n==0) GCM_MUL(ctx,Xi);
1190                 else {
1191                         ctx->mres = n;
1192                         return 0;
1193                 }
1194         }
1195 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1196         while (len>=GHASH_CHUNK) {
1197                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1198                 ctr += GHASH_CHUNK/16;
1199                 if (is_endian.little)
1200                         PUTU32(ctx->Yi.c+12,ctr);
1201                 else
1202                         ctx->Yi.d[3] = ctr;
1203                 GHASH(ctx,out,GHASH_CHUNK);
1204                 out += GHASH_CHUNK;
1205                 in  += GHASH_CHUNK;
1206                 len -= GHASH_CHUNK;
1207         }
1208 #endif
1209         if ((i = (len&(size_t)-16))) {
1210                 size_t j=i/16;
1211
1212                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1213                 ctr += (unsigned int)j;
1214                 if (is_endian.little)
1215                         PUTU32(ctx->Yi.c+12,ctr);
1216                 else
1217                         ctx->Yi.d[3] = ctr;
1218                 in  += i;
1219                 len -= i;
1220 #if defined(GHASH)
1221                 GHASH(ctx,out,i);
1222                 out += i;
1223 #else
1224                 while (j--) {
1225                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1226                         GCM_MUL(ctx,Xi);
1227                         out += 16;
1228                 }
1229 #endif
1230         }
1231         if (len) {
1232                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1233                 ++ctr;
1234                 if (is_endian.little)
1235                         PUTU32(ctx->Yi.c+12,ctr);
1236                 else
1237                         ctx->Yi.d[3] = ctr;
1238                 while (len--) {
1239                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1240                         ++n;
1241                 }
1242         }
1243
1244         ctx->mres = n;
1245         return 0;
1246 }
1247
1248 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1249                 const unsigned char *in, unsigned char *out,
1250                 size_t len,ctr128_f stream)
1251 {
1252         const union { long one; char little; } is_endian = {1};
1253         unsigned int n, ctr;
1254         size_t i;
1255         u64 mlen = ctx->len.u[1];
1256 #ifdef GCM_FUNCREF_4BIT
1257         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1258 # ifdef GHASH
1259         void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1260                                 const u8 *inp,size_t len) = ctx->ghash;
1261 # endif
1262 #endif
1263
1264         mlen += len;
1265         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1266                 return -1;
1267         ctx->len.u[1] = mlen;
1268
1269         if (ctx->ares) {
1270                 /* First call to decrypt finalizes GHASH(AAD) */
1271                 GCM_MUL(ctx,Xi);
1272                 ctx->ares = 0;
1273         }
1274
1275         if (is_endian.little)
1276                 ctr = GETU32(ctx->Yi.c+12);
1277         else
1278                 ctr = ctx->Yi.d[3];
1279
1280         n = ctx->mres;
1281         if (n) {
1282                 while (n && len) {
1283                         u8 c = *(in++);
1284                         *(out++) = c^ctx->EKi.c[n];
1285                         ctx->Xi.c[n] ^= c;
1286                         --len;
1287                         n = (n+1)%16;
1288                 }
1289                 if (n==0) GCM_MUL (ctx,Xi);
1290                 else {
1291                         ctx->mres = n;
1292                         return 0;
1293                 }
1294         }
1295 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1296         while (len>=GHASH_CHUNK) {
1297                 GHASH(ctx,in,GHASH_CHUNK);
1298                 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1299                 ctr += GHASH_CHUNK/16;
1300                 if (is_endian.little)
1301                         PUTU32(ctx->Yi.c+12,ctr);
1302                 else
1303                         ctx->Yi.d[3] = ctr;
1304                 out += GHASH_CHUNK;
1305                 in  += GHASH_CHUNK;
1306                 len -= GHASH_CHUNK;
1307         }
1308 #endif
1309         if ((i = (len&(size_t)-16))) {
1310                 size_t j=i/16;
1311
1312 #if defined(GHASH)
1313                 GHASH(ctx,in,i);
1314 #else
1315                 while (j--) {
1316                         size_t k;
1317                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1318                         GCM_MUL(ctx,Xi);
1319                         in += 16;
1320                 }
1321                 j   = i/16;
1322                 in -= i;
1323 #endif
1324                 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1325                 ctr += (unsigned int)j;
1326                 if (is_endian.little)
1327                         PUTU32(ctx->Yi.c+12,ctr);
1328                 else
1329                         ctx->Yi.d[3] = ctr;
1330                 out += i;
1331                 in  += i;
1332                 len -= i;
1333         }
1334         if (len) {
1335                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1336                 ++ctr;
1337                 if (is_endian.little)
1338                         PUTU32(ctx->Yi.c+12,ctr);
1339                 else
1340                         ctx->Yi.d[3] = ctr;
1341                 while (len--) {
1342                         u8 c = in[n];
1343                         ctx->Xi.c[n] ^= c;
1344                         out[n] = c^ctx->EKi.c[n];
1345                         ++n;
1346                 }
1347         }
1348
1349         ctx->mres = n;
1350         return 0;
1351 }
1352
1353 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1354                         size_t len)
1355 {
1356         const union { long one; char little; } is_endian = {1};
1357         u64 alen = ctx->len.u[0]<<3;
1358         u64 clen = ctx->len.u[1]<<3;
1359 #ifdef GCM_FUNCREF_4BIT
1360         void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1361 #endif
1362
1363         if (ctx->mres)
1364                 GCM_MUL(ctx,Xi);
1365
1366         if (is_endian.little) {
1367 #ifdef BSWAP8
1368                 alen = BSWAP8(alen);
1369                 clen = BSWAP8(clen);
1370 #else
1371                 u8 *p = ctx->len.c;
1372
1373                 ctx->len.u[0] = alen;
1374                 ctx->len.u[1] = clen;
1375
1376                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1377                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1378 #endif
1379         }
1380
1381         ctx->Xi.u[0] ^= alen;
1382         ctx->Xi.u[1] ^= clen;
1383         GCM_MUL(ctx,Xi);
1384
1385         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1386         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1387
1388         if (tag && len<=sizeof(ctx->Xi))
1389                 return memcmp(ctx->Xi.c,tag,len);
1390         else
1391                 return -1;
1392 }
1393
1394 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1395 {
1396         CRYPTO_gcm128_finish(ctx, NULL, 0);
1397         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1398 }
1399
1400 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1401 {
1402         GCM128_CONTEXT *ret;
1403
1404         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1405                 CRYPTO_gcm128_init(ret,key,block);
1406
1407         return ret;
1408 }
1409
1410 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1411 {
1412         if (ctx) {
1413                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1414                 OPENSSL_free(ctx);
1415         }
1416 }
1417
1418 #if defined(SELFTEST)
1419 #include <stdio.h>
1420 #include <openssl/aes.h>
1421
1422 /* Test Case 1 */
1423 static const u8 K1[16],
1424                 *P1=NULL,
1425                 *A1=NULL,
1426                 IV1[12],
1427                 *C1=NULL,
1428                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1429
1430 /* Test Case 2 */
1431 #define K2 K1
1432 #define A2 A1
1433 #define IV2 IV1
1434 static const u8 P2[16],
1435                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1436                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1437
1438 /* Test Case 3 */
1439 #define A3 A2
1440 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1441                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1442                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1443                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1444                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1445                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1446                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1447                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1448                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1449                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1450                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1451
1452 /* Test Case 4 */
1453 #define K4 K3
1454 #define IV4 IV3
1455 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1456                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1457                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1458                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1459                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1460                         0xab,0xad,0xda,0xd2},
1461                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1462                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1463                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1464                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1465                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1466
1467 /* Test Case 5 */
1468 #define K5 K4
1469 #define P5 P4
1470 #define A5 A4
1471 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1472                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1473                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1474                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1475                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1476                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1477
1478 /* Test Case 6 */
1479 #define K6 K5
1480 #define P6 P5
1481 #define A6 A5
1482 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1483                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1484                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1485                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1486                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1487                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1488                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1489                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1490                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1491
1492 /* Test Case 7 */
1493 static const u8 K7[24],
1494                 *P7=NULL,
1495                 *A7=NULL,
1496                 IV7[12],
1497                 *C7=NULL,
1498                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1499
1500 /* Test Case 8 */
1501 #define K8 K7
1502 #define IV8 IV7
1503 #define A8 A7
1504 static const u8 P8[16],
1505                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1506                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1507
1508 /* Test Case 9 */
1509 #define A9 A8
1510 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1511                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1512                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1513                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1514                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1515                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1516                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1517                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1518                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1519                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1520                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1521                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1522
1523 /* Test Case 10 */
1524 #define K10 K9
1525 #define IV10 IV9
1526 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1527                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1528                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1529                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1530                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1531                         0xab,0xad,0xda,0xd2},
1532                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1533                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1534                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1535                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1536                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1537
1538 /* Test Case 11 */
1539 #define K11 K10
1540 #define P11 P10
1541 #define A11 A10
1542 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1543                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1544                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1545                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1546                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1547                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1548
1549 /* Test Case 12 */
1550 #define K12 K11
1551 #define P12 P11
1552 #define A12 A11
1553 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1554                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1555                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1556                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1557                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1558                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1559                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1560                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1561                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1562
1563 /* Test Case 13 */
1564 static const u8 K13[32],
1565                 *P13=NULL,
1566                 *A13=NULL,
1567                 IV13[12],
1568                 *C13=NULL,
1569                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1570
1571 /* Test Case 14 */
1572 #define K14 K13
1573 #define A14 A13
1574 static const u8 P14[16],
1575                 IV14[12],
1576                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1577                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1578
1579 /* Test Case 15 */
1580 #define A15 A14
1581 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1582                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1583                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1584                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1585                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1586                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1587                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1588                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1589                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1590                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1591                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1592                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1593
1594 /* Test Case 16 */
1595 #define K16 K15
1596 #define IV16 IV15
1597 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1598                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1599                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1600                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1601                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1602                         0xab,0xad,0xda,0xd2},
1603                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1604                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1605                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1606                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1607                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1608
1609 /* Test Case 17 */
1610 #define K17 K16
1611 #define P17 P16
1612 #define A17 A16
1613 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1614                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1615                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1616                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1617                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1618                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1619
1620 /* Test Case 18 */
1621 #define K18 K17
1622 #define P18 P17
1623 #define A18 A17
1624 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1625                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1626                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1627                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1628                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1629                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1630                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1631                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1632                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1633
/*
 * TEST_CASE(n): run known-answer test number n in both directions.
 *
 * Expects `ctx` (GCM128_CONTEXT), `key` (AES_KEY) and `ret` (int failure
 * counter) to be in scope at the expansion site, and picks up the vectors
 * K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> by token pasting.
 *
 * For each direction the IV is reloaded, A<n> is fed as AAD when it is
 * non-NULL, and the payload (P<n> to encrypt, C<n> to decrypt) is processed
 * when it is non-NULL.  The direction fails, bumping `ret` and printing a
 * message, if CRYPTO_gcm128_finish() returns non-zero for the 16-byte tag
 * T<n>, or if the produced bytes differ from the expected ones.
 *
 * The scratch buffer is sized by sizeof(P<n>); when P<n> is a NULL pointer
 * rather than an array the buffer is merely pointer-sized and never
 * touched by the cipher calls.
 */
#define TEST_CASE(n)    do {                                                    \
        u8 scratch[sizeof(P##n)];                                               \
                                                                                \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);                  \
                                                                                \
        /* encrypt direction: P -> scratch, compare against C */               \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                          \
        memset(scratch,0,sizeof(scratch));                                      \
        if (A##n) {                                                             \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));                      \
        }                                                                       \
        if (P##n) {                                                             \
                CRYPTO_gcm128_encrypt(&ctx,P##n,scratch,sizeof(scratch));       \
        }                                                                       \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                               \
            (C##n && memcmp(scratch,C##n,sizeof(scratch)))) {                   \
                ret++;                                                          \
                printf ("encrypt test#%d failed.\n",n);                         \
        }                                                                       \
                                                                                \
        /* decrypt direction: C -> scratch, compare against P */               \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                          \
        memset(scratch,0,sizeof(scratch));                                      \
        if (A##n) {                                                             \
                CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));                      \
        }                                                                       \
        if (C##n) {                                                             \
                CRYPTO_gcm128_decrypt(&ctx,C##n,scratch,sizeof(scratch));       \
        }                                                                       \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                               \
            (P##n && memcmp(scratch,P##n,sizeof(scratch)))) {                   \
                ret++;                                                          \
                printf ("decrypt test#%d failed.\n",n);                         \
        }                                                                       \
        } while(0)
1653
1654 int main()
1655 {
1656         GCM128_CONTEXT ctx;
1657         AES_KEY key;
1658         int ret=0;
1659
1660         TEST_CASE(1);
1661         TEST_CASE(2);
1662         TEST_CASE(3);
1663         TEST_CASE(4);
1664         TEST_CASE(5);
1665         TEST_CASE(6);
1666         TEST_CASE(7);
1667         TEST_CASE(8);
1668         TEST_CASE(9);
1669         TEST_CASE(10);
1670         TEST_CASE(11);
1671         TEST_CASE(12);
1672         TEST_CASE(13);
1673         TEST_CASE(14);
1674         TEST_CASE(15);
1675         TEST_CASE(16);
1676         TEST_CASE(17);
1677         TEST_CASE(18);
1678
1679 #ifdef OPENSSL_CPUID_OBJ
1680         {
1681         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1682         union { u64 u; u8 c[1024]; } buf;
1683         int i;
1684
1685         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1686         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1687         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1688
1689         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1690         start = OPENSSL_rdtsc();
1691         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1692         gcm_t = OPENSSL_rdtsc() - start;
1693
1694         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1695                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1696                         (block128_f)AES_encrypt);
1697         start = OPENSSL_rdtsc();
1698         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1699                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1700                         (block128_f)AES_encrypt);
1701         ctr_t = OPENSSL_rdtsc() - start;
1702
1703         printf("%.2f-%.2f=%.2f\n",
1704                         gcm_t/(double)sizeof(buf),
1705                         ctr_t/(double)sizeof(buf),
1706                         (gcm_t-ctr_t)/(double)sizeof(buf));
1707 #ifdef GHASH
1708         GHASH(&ctx,buf.c,sizeof(buf));
1709         start = OPENSSL_rdtsc();
1710         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1711         gcm_t = OPENSSL_rdtsc() - start;
1712         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1713 #endif
1714         }
1715 #endif
1716
1717         return ret;
1718 }
1719 #endif