8a76e1a7fd6d596acdc9d00adfce3594a512694b
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK positions a 16-bit constant in the most significant 16 bits of
 * a size_t, so the same remainder tables serve both 32- and 64-bit
 * builds (the 32-bit case later shifts the value up by 32 explicitly).
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT shifts the 128-bit value V one bit to the right in
 * GF(2^128): the bit falling out of V.lo is folded back in with the
 * GCM reduction constant (0xe1 in the top byte).  sizeof(size_t) is a
 * compile-time constant, so only one branch survives optimization.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * One GHASH block multiplication, Xi = Xi * H, using the 256-entry
 * 8-bit table.  Xi is consumed one byte at a time starting at the
 * least significant (last) byte; after each table lookup the
 * accumulator Z is shifted right by 8 bits and the byte shifted out is
 * reduced via the precomputed rem_8bit residues.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from last byte to first */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        /* Reduction residues for the byte shifted out of Z, pre-placed
         * in the top 16 bits of a size_t by PACK(). */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* all 16 bytes processed */

                n = *(--xi);

                /* shift Z right 8 bits, fold the dropped byte back in */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* write the result back in big-endian (wire) byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
252 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
253
254 #elif   TABLE_BITS==4
255
/*
 * Build the 16-entry table for the "4-bit Shoup" GHASH path:
 * Htable[i] = i*H in GF(2^128), H in host byte order.  The
 * small-footprint variant uses loops; the default variant unrolls
 * the same computation.  On ARM with assembler GHASH the dwords are
 * additionally swapped into the order the assembler expects.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* powers of two: Htable[8]=H, then halve the index per shift */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        /* remaining entries by linearity: Htable[i^j] = Htable[i]^Htable[j] */
        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* unrolled equivalent of the loops above */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
327
328 #ifndef GHASH_ASM
/* Reduction residues for the nibble shifted out of the accumulator on
 * each 4-bit step, pre-positioned in the top 16 bits by PACK(). */
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
334
/*
 * One GHASH block multiplication, Xi = Xi * H, using the 16-entry
 * 4-bit table.  Each byte of Xi contributes two table lookups (low
 * nibble, then high nibble); between lookups the accumulator Z is
 * shifted right 4 bits and the dropped nibble is folded back via
 * rem_4bit.  Bytes are processed last-to-first.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        /* prime the accumulator with the low nibble of the last byte */
        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift by 4 and reduce, then fold in the high nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* byte 0 done */

                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                /* same shift/reduce, then fold in the low nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* store result back in big-endian (wire) byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
397
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
399 /*
400  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401  * details... Compiler-generated code doesn't seem to give any
402  * performance improvement, at least not on x86[_64]. It's here
403  * mostly as reference and a placeholder for possible future
404  * non-trivial optimization[s]...
405  */
/*
 * Streamed GHASH over len bytes of inp (len must be a positive
 * multiple of 16): for each 16-byte block, Xi ^= block, then
 * Xi *= H using the 4-bit table — fused so the XOR happens during
 * nibble extraction rather than as a separate pass.  The #else arm is
 * a disabled experimental variant trading 256+16 extra bytes per key
 * plus a shared 512-byte table for roughly 50% more speed; only one
 * arm is ever compiled.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        /* fold inp into Xi on the fly while extracting nibbles */
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits, reduce, fold in high nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* byte 0 done */

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                /* same shift/reduce, fold in low nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        /* bytes 15..1: one byte (8 bits) of shift per iteration */
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        /* byte 0: final 4-bit step */
        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* shared epilogue: store Z back into Xi in big-endian order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
568 #endif
569 #else
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
572 #endif
573
574 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578  * trashing effect. In other words idea is to hash data while it's
579  * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK       (3*1024)
581 #endif
582
583 #else   /* TABLE_BITS */
584
/*
 * Bit-serial GHASH multiplication Xi = Xi * H (TABLE_BITS==1 path,
 * no precomputed table).  Xi is loaded word-by-word into host order,
 * then each of the 128 bits conditionally XORs the running multiple
 * V of H into the accumulator Z, with V shifted/reduced per bit.
 * NOTE(review): X<<=1 left-shifts a signed long whose top bit may be
 * set — implementation-defined/UB by the letter of the standard,
 * though relied upon here to work as two's-complement shift; confirm
 * against supported compilers before touching.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
        u128 V,Z = { 0,0 };
        long X;
        int  i,j;
        const long *xi = (const long *)Xi;
        const union { long one; char little; } is_endian = {1};

        V.hi = H[0];    /* H is in host byte order, no byte swapping */
        V.lo = H[1];

        for (j=0; j<16/sizeof(long); ++j) {
                /* load the next long of Xi in host byte order */
                if (is_endian.little) {
                        if (sizeof(long)==8) {
#ifdef BSWAP8
                                X = (long)(BSWAP8(xi[j]));
#else
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
                        }
                        else {
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)GETU32(p);
                        }
                }
                else
                        X = xi[j];

                /* process bits MSB-first: M is all-ones if the top bit
                 * of X is set, all-zeros otherwise (branchless select) */
                for (i=0; i<8*sizeof(long); ++i, X<<=1) {
                        u64 M = (u64)(X>>(8*sizeof(long)-1));
                        Z.hi ^= V.hi&M;
                        Z.lo ^= V.lo&M;

                        REDUCE1BIT(V);
                }
        }

        /* store result back in big-endian (wire) byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
641 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
642
643 #endif
644
645 #if     TABLE_BITS==4 && defined(GHASH_ASM)
646 # if    !defined(I386_ONLY) && \
647         (defined(__i386)        || defined(__i386__)    || \
648          defined(__x86_64)      || defined(__x86_64__)  || \
649          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
650 #  define GHASH_ASM_X86_OR_64
651 #  define GCM_FUNCREF_4BIT
652 extern unsigned int OPENSSL_ia32cap_P[2];
653
654 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
655 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
656 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
657
658 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
659 #   define GHASH_ASM_X86
660 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
661 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
662
663 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 #  endif
666 # elif defined(__arm__) || defined(__arm)
667 #  include "arm_arch.h"
668 #  if __ARM_ARCH__>=7
669 #   define GHASH_ASM_ARM
670 #   define GCM_FUNCREF_4BIT
671 extern unsigned int OPENSSL_armcap;
672
673 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
675 #  endif
676 # endif
677 #endif
678
679 #ifdef GCM_FUNCREF_4BIT
680 # undef  GCM_MUL
681 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
682 # ifdef GHASH
683 #  undef  GHASH
684 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
685 # endif
686 #endif
687
/*
 * Initialize a GCM context: derive the hash subkey H by encrypting
 * the all-zero block with the supplied cipher, convert H to host byte
 * order, precompute the GHASH table, and (when function-pointer
 * dispatch is compiled in) select the fastest gmult/ghash
 * implementation the CPU supports.
 *
 * key/block are retained in ctx; the caller keeps ownership of key.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E_K(0^128); ctx->H.c is zero from the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[1]&(1<<1)) {      /* check PCLMULQDQ bit */
                gcm_init_clmul(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_clmul;
                ctx->ghash = gcm_ghash_clmul;
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap & 1) {               /* NEON available */
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
752
/*
 * Reset per-message state and install a fresh IV.
 *
 * Clears the GHASH accumulator (Xi), the AAD/message length counters
 * and any buffered residue, then derives the pre-counter block Y0:
 *  - a 96-bit IV is used verbatim with the 32-bit counter field set
 *    to 1 (no GHASH pass needed);
 *  - any other IV length is GHASHed block-by-block together with the
 *    IV's 64-bit bit length.
 * E(K,Y0) is cached in ctx->EK0 for the final tag computation and the
 * counter is advanced so encryption starts at Y1.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* 96-bit IV: Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* Y0 = GHASH(IV padded to blocks || 64-bit bit length) */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			/* trailing partial block, implicitly zero-padded */
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
			/* length is hashed in big-endian order */
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K,Y0); consumed by CRYPTO_gcm128_finish() */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
822
/*
 * Feed additional authenticated data (AAD) into the GHASH state.
 * May be called repeatedly, but only before any message data has been
 * processed.  A trailing partial block is buffered in Xi and its byte
 * count recorded in ctx->ares for the next call to complete.
 *
 * Returns 0 on success, -1 if the accumulated AAD exceeds 2^61 bytes
 * (or the 64-bit counter wraps), -2 if message data was already seen.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	if (ctx->len.u[1]) return -2;	/* AAD must precede message data */

	alen += len;
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* complete a partial block left over from a previous call */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-hash all whole blocks in one call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* buffer the trailing partial block for a later call / finish */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
879
/*
 * Encrypt |len| bytes from |in| to |out| in GCM (CTR) mode and fold
 * the produced ciphertext into the GHASH state.  May be called
 * repeatedly; key-stream residue of a partial block is carried over
 * in ctx->mres.  The first call flushes any buffered AAD residue.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * the 2^36-32 byte limit (or the 64-bit counter wraps).
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pick up the running 32-bit counter from Yi[12..15] */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* consume leftover key-stream bytes from a previous call */
		if (n) {
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* size_t-wide loads below require alignment on strict
		 * platforms; fall back to the byte loop otherwise */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* fast path: encrypt GHASH_CHUNK bytes, then hash the
		 * whole chunk with one GHASH call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, hashed in one GHASH call */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* block-at-a-time path: encrypt and GHASH each block */
		while (len>=16) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* final partial block: n is 0 here, buffer the residue */
		if (len) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* small-footprint / unaligned fallback: byte at a time */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1027
/*
 * Decrypt |len| bytes from |in| to |out| in GCM (CTR) mode.  Unlike
 * encryption, GHASH is computed over the *ciphertext* (the input), so
 * each byte is hashed before or as it is decrypted.  May be called
 * repeatedly; partial-block residue is carried in ctx->mres and the
 * first call flushes any buffered AAD residue.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * the 2^36-32 byte limit (or the 64-bit counter wraps).
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pick up the running 32-bit counter from Yi[12..15] */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		/* consume leftover key-stream bytes from a previous call;
		 * note the ciphertext byte is saved before overwriting,
		 * so in-place operation (in == out) is safe */
		if (n) {
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* size_t-wide loads below require alignment on strict
		 * platforms; fall back to the byte loop otherwise */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* fast path: hash the ciphertext chunk first, then
		 * decrypt it */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* block-at-a-time path: hash each ciphertext block as it
		 * is decrypted, caching the word first for in-place use */
		while (len>=16) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* final partial block: n is 0 here, buffer the residue */
		if (len) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* small-footprint / unaligned fallback: byte at a time */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1178
/*
 * Encrypt |len| bytes using a caller-supplied multi-block CTR routine
 * |stream| (typically an assembly AES-CTR), which processes whole
 * 16-byte blocks with the 32-bit big-endian counter held in
 * Yi[12..15].  Partial blocks fall back to ctx->block.  GHASH is
 * computed over the produced ciphertext.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * the 2^36-32 byte limit (or the 64-bit counter wraps).
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* consume leftover key-stream bytes from a previous call */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: stream-encrypt a chunk, then hash it in one call */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,ctx->key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* final partial block via the single-block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1276
/*
 * Decrypt |len| bytes using a caller-supplied multi-block CTR routine
 * |stream| (see CRYPTO_gcm128_encrypt_ctr32).  GHASH is computed over
 * the *ciphertext*, so each bulk region is hashed before it is
 * decrypted; partial blocks fall back to ctx->block.
 *
 * Returns 0 on success, -1 if the total message length would exceed
 * the 2^36-32 byte limit (or the 64-bit counter wraps).
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64 mlen = ctx->len.u[1];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* consume leftover key-stream bytes from a previous call;
	 * ciphertext byte is saved first so in-place use is safe */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* bulk path: hash the ciphertext chunk, then stream-decrypt it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		j   = i/16;	/* restore block count and input pointer */
		in -= i;
#endif
		(*stream)(in,out,j,ctx->key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* final partial block via the single-block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1381
1382 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1383                         size_t len)
1384 {
1385         const union { long one; char little; } is_endian = {1};
1386         u64 alen = ctx->len.u[0]<<3;
1387         u64 clen = ctx->len.u[1]<<3;
1388 #ifdef GCM_FUNCREF_4BIT
1389         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1390 #endif
1391
1392         if (ctx->mres)
1393                 GCM_MUL(ctx,Xi);
1394
1395         if (is_endian.little) {
1396 #ifdef BSWAP8
1397                 alen = BSWAP8(alen);
1398                 clen = BSWAP8(clen);
1399 #else
1400                 u8 *p = ctx->len.c;
1401
1402                 ctx->len.u[0] = alen;
1403                 ctx->len.u[1] = clen;
1404
1405                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1406                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1407 #endif
1408         }
1409
1410         ctx->Xi.u[0] ^= alen;
1411         ctx->Xi.u[1] ^= clen;
1412         GCM_MUL(ctx,Xi);
1413
1414         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1415         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1416
1417         if (tag && len<=sizeof(ctx->Xi))
1418                 return memcmp(ctx->Xi.c,tag,len);
1419         else
1420                 return -1;
1421 }
1422
1423 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1424 {
1425         CRYPTO_gcm128_finish(ctx, NULL, 0);
1426         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1427 }
1428
1429 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1430 {
1431         GCM128_CONTEXT *ret;
1432
1433         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1434                 CRYPTO_gcm128_init(ret,key,block);
1435
1436         return ret;
1437 }
1438
1439 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1440 {
1441         if (ctx) {
1442                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1443                 OPENSSL_free(ctx);
1444         }
1445 }
1446
1447 #if defined(SELFTEST)
1448 #include <stdio.h>
1449 #include <openssl/aes.h>
1450
/* AES-GCM test vectors below appear to match the GCM specification
 * (McGrew & Viega) appendix test cases — TODO confirm against source */

/* Test Case 1: 128-bit zero key, 96-bit zero IV, empty PT and AAD */
static const u8	K1[16],
		*P1=NULL,
		*A1=NULL,
		IV1[12],
		*C1=NULL,
		T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2: same key/IV as TC1, one zero plaintext block, no AAD */
#define	K2 K1
#define	A2 A1
#define	IV2 IV1
static const u8	P2[16],
		C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
		T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1466
/* Test Case 3: 128-bit key, 96-bit IV, four plaintext blocks, no AAD */
#define	A3 A2
static const u8	K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
		P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
		T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1480
/* Test Case 4: as TC3 but 60-byte (partial-block) PT and 20-byte AAD */
#define	K4 K3
#define	IV4 IV3
static const u8	P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
		A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
			0xab,0xad,0xda,0xd2},
		C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
			0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
			0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
			0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
		T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1495
/* Test Case 5: as TC4 but with a short 64-bit IV (non-96-bit path) */
#define	K5 K4
#define	P5 P4
#define	A5 A4
static const u8	IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
		C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
			0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
			0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
			0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
		T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1506
/* Test Case 6: as TC4 but with a long 60-byte IV (GHASHed IV path) */
#define	K6 K5
#define	P6 P5
#define	A6 A5
static const u8	IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
			0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
			0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
			0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
		C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
			0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
			0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
			0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
		T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1520
/* Test Case 7: 192-bit zero key, 96-bit zero IV, empty PT and AAD */
static const u8	K7[24],
		*P7=NULL,
		*A7=NULL,
		IV7[12],
		*C7=NULL,
		T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8: same key/IV as TC7, one zero plaintext block, no AAD */
#define	K8 K7
#define	IV8 IV7
#define	A8 A7
static const u8	P8[16],
		C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
		T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1536
/* Test Case 9: 192-bit key, 96-bit IV, four plaintext blocks, no AAD */
#define	A9 A8
static const u8	K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
			0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
		P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
			0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
			0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
			0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
		IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
		C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
			0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
			0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
			0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
		T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1551
1552 /* Test Case 10 */
1553 #define K10 K9
1554 #define IV10 IV9
1555 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1556                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1557                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1558                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1559                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1560                         0xab,0xad,0xda,0xd2},
1561                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1562                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1563                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1564                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1565                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1566
1567 /* Test Case 11 */
1568 #define K11 K10
1569 #define P11 P10
1570 #define A11 A10
1571 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1572                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1573                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1574                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1575                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1576                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1577
1578 /* Test Case 12 */
1579 #define K12 K11
1580 #define P12 P11
1581 #define A12 A11
1582 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1583                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1584                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1585                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1586                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1587                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1588                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1589                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1590                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1591
/* Test Case 13: AES-256 with an all-zero key and 96-bit IV, no AAD and
 * no plaintext -- tag of an empty message.  (K13 and IV13 have no
 * initializer, so they are implicitly all zero.) */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14: same key and (absent) AAD as case 13, with a single
 * all-zero 16-byte plaintext block. */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1607
/* Test Case 15: AES-256 (32-byte key), 64-byte plaintext, 96-bit IV,
 * no AAD. */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16: same key and IV as case 15, 60-byte plaintext plus
 * 20 bytes of AAD. */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17: same key, plaintext and AAD as case 16, but with a
 * short 8-byte IV. */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18: as case 17 but with a long 60-byte IV. */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1662
/*
 * Run GCM test vector number |n| through a full encrypt pass and a full
 * decrypt pass.  Token pasting selects the vector's K##n, IV##n, A##n,
 * P##n, C##n and T##n arrays.  Vectors with no AAD/plaintext/ciphertext
 * define the corresponding name as a NULL pointer, which skips the
 * matching call and memcmp (in that case sizeof(P##n) is the size of a
 * pointer rather than of the data, but |out| is never inspected).
 * The tag is checked by CRYPTO_gcm128_finish(), which returns non-zero
 * on mismatch.  On any failure the enclosing |ret| is incremented;
 * |ctx|, |key| and |ret| must be in scope at the expansion site.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1682
1683 int main()
1684 {
1685         GCM128_CONTEXT ctx;
1686         AES_KEY key;
1687         int ret=0;
1688
1689         TEST_CASE(1);
1690         TEST_CASE(2);
1691         TEST_CASE(3);
1692         TEST_CASE(4);
1693         TEST_CASE(5);
1694         TEST_CASE(6);
1695         TEST_CASE(7);
1696         TEST_CASE(8);
1697         TEST_CASE(9);
1698         TEST_CASE(10);
1699         TEST_CASE(11);
1700         TEST_CASE(12);
1701         TEST_CASE(13);
1702         TEST_CASE(14);
1703         TEST_CASE(15);
1704         TEST_CASE(16);
1705         TEST_CASE(17);
1706         TEST_CASE(18);
1707
1708 #ifdef OPENSSL_CPUID_OBJ
1709         {
1710         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1711         union { u64 u; u8 c[1024]; } buf;
1712         int i;
1713
1714         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1715         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1716         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1717
1718         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1719         start = OPENSSL_rdtsc();
1720         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1721         gcm_t = OPENSSL_rdtsc() - start;
1722
1723         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1724                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1725                         (block128_f)AES_encrypt);
1726         start = OPENSSL_rdtsc();
1727         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1728                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1729                         (block128_f)AES_encrypt);
1730         ctr_t = OPENSSL_rdtsc() - start;
1731
1732         printf("%.2f-%.2f=%.2f\n",
1733                         gcm_t/(double)sizeof(buf),
1734                         ctr_t/(double)sizeof(buf),
1735                         (gcm_t-ctr_t)/(double)sizeof(buf));
1736 #ifdef GHASH
1737         GHASH(&ctx,buf.c,sizeof(buf));
1738         start = OPENSSL_rdtsc();
1739         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1740         gcm_t = OPENSSL_rdtsc() - start;
1741         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1742 #endif
1743         }
1744 #endif
1745
1746         return ret;
1747 }
1748 #endif