ghash-x86.pl: engage original MMX version in no-sse2 builds.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK() places a 16-bit reduction constant in the most significant
 * bits of a size_t, so one table definition serves both 32- and
 * 64-bit builds.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V): multiply 128-bit V by x in GF(2^128), i.e. shift V
 * right one bit and, if a bit fell off the low end, fold in the GCM
 * reduction polynomial (represented by the 0xe1... constant).  The
 * sizeof(size_t) test is resolved at compile time; the else branch
 * avoids a 64-bit constant operation on 32-bit targets.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * Multiply Xi by H in GF(2^128) using the 8-bit ("Shoup's") table,
 * consuming one byte of Xi per iteration, last (least significant in
 * the big-endian representation) byte first.  The product is written
 * back into Xi in big-endian byte order.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi backwards */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        __fips_constseg
        /* reduction constants for the 8 bits shifted out per step */
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                /* accumulate table entry for the current byte */
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                /* stop once the first byte of Xi has been consumed */
                if ((u8 *)Xi==xi)       break;

                n = *(--xi);

                /* shift Z right by 8 bits and fold the dropped byte
                 * back in via the reduction table */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* store the product back into Xi in big-endian order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * Precompute the 16-entry table for 4-bit GHASH: Htable[n] = n * H in
 * GF(2^128) for every 4-bit index n.  H is supplied in host byte
 * order.  The small-footprint path builds the table with loops; the
 * default path is fully unrolled.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* power-of-two entries: each REDUCE1BIT step multiplies by x */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        /* composite entries are XORs of the power-of-two ones */
        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* unrolled equivalent of the loops above */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
328
329 #ifndef GHASH_ASM
/*
 * rem_4bit[r] is the reduction constant folded back into the top of Z
 * when 4 bits (value r) are shifted out during the 4-bit GHASH walk.
 */
__fips_constseg
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
/*
 * Multiply Xi by H in GF(2^128) using the 4-bit table, processing two
 * nibbles (one byte) of Xi per loop iteration, last byte first.  The
 * product is written back to Xi in big-endian byte order.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        /* seed Z from the low nibble of the last byte of Xi */
        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits, folding the dropped nibble
                 * back in via rem_4bit, then add the high-nibble term */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;

                /* same two steps for the low nibble of the next byte */
                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* store the product back into Xi in big-endian order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 *
 * Absorbs len bytes from inp into the running hash Xi, 16 bytes per
 * iteration, multiplying by H (via Htable) after each block; len is
 * assumed to be a positive multiple of 16.  Result is left in Xi in
 * big-endian byte order.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        /* XOR next input block into Xi and walk nibble pairs from the
         * last byte towards the first, as in gcm_gmult_4bit */
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    __fips_constseg
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        /* bytes 15..1: two nibbles per iteration, 8-bit reduction */
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        /* byte 0: final nibble pair with a 4-bit reduction */
        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* commit Z to Xi in big-endian order before the next block */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * trashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && defined(GHASH_ASM)
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 #   define GHASH_ASM_X86
663 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665
666 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
667 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668 #  endif
669 # elif defined(__arm__) || defined(__arm)
670 #  include "arm_arch.h"
671 #  if __ARM_ARCH__>=7
672 #   define GHASH_ASM_ARM
673 #   define GCM_FUNCREF_4BIT
674 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
675 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 #  endif
677 # endif
678 #endif
679
680 #ifdef GCM_FUNCREF_4BIT
681 # undef  GCM_MUL
682 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
683 # ifdef GHASH
684 #  undef  GHASH
685 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
686 # endif
687 #endif
688
/*
 * Initialise ctx for GCM: derive the hash subkey H by encrypting the
 * all-zero block with the supplied cipher, convert it to host byte
 * order, and precompute Htable / select the gmult/ghash
 * implementation appropriate for the build and (on x86/ARM) the CPU
 * capability bits detected at run time.
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E(K, 0^128); ctx->H.c is all-zero after the memset */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        /* prefer carry-less multiplication when available */
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                gcm_init_clmul(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_clmul;
                ctx->ghash = gcm_ghash_clmul;
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
758
/*
 * CRYPTO_gcm128_setiv: load a fresh IV/nonce into the context and reset
 * all per-message state (Yi, Xi, lengths, residues).  Per NIST SP 800-38D
 * a 96-bit IV is used directly with an initial counter of 1; any other
 * length is GHASHed together with a 64-bit bit-length block to derive Y0.
 * Also precomputes EK0 = E(K, Y0), the block that later masks the tag.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	/* run-time endianness probe: .little is non-zero on little-endian */
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* 96-bit IV fast path: Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		size_t i;
		u64 len0 = len;

		/* Y0 = GHASH(IV padded to a block boundary) ... */
		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		/* ... followed by the IV length in bits as a final block */
		len0 <<= 3;
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			/* byte-wise big-endian fold-in when no 64-bit bswap */
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		/* derive the starting 32-bit counter from the last word of Y0 */
		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 = E(K, Y0); XORed onto Xi at finish() to form the tag */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* store incremented counter back into Yi for the first data block */
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
828
/*
 * CRYPTO_gcm128_aad: absorb additional authenticated data into the GHASH
 * state.  May be called multiple times, but only before any encrypt or
 * decrypt call.  A trailing partial block is buffered in Xi and its
 * residue count kept in ctx->ares until more AAD (or message data)
 * completes it.  Returns 0 on success, -2 if message data has already
 * been processed, -1 if the AAD limit (2^64 bits) is exceeded.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* AAD must precede all message data */
	if (ctx->len.u[1]) return -2;

	alen += len;
	/* 2^61 bytes == 2^64 bits; second clause catches u64 wrap-around */
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	n = ctx->ares;
	if (n) {
		/* top up the partial block left over from a previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* bulk-hash all whole blocks in one assembly call */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* buffer the trailing partial block; multiplied in later */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
885
/*
 * CRYPTO_gcm128_encrypt: CTR-encrypt |len| bytes from |in| to |out|,
 * folding the produced ciphertext into the GHASH state Xi as it goes.
 * May be called repeatedly; a partial-block keystream residue is carried
 * in ctx->mres between calls.  Returns 0 on success, -1 if the total
 * message length would exceed the GCM limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	/* 2^36-32 bytes is the SP 800-38D plaintext limit; also catch wrap */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* pull the 32-bit counter out of the last word of Yi */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* drain leftover keystream bytes from the last call */
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* word-at-a-time path below needs aligned in/out; fall back
		 * to the byte-wise loop at the bottom if misaligned */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* encrypt a whole chunk first, then hash it in one GHASH call */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, same encrypt-then-hash pattern */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* no bulk GHASH: hash each block with one multiplication */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* final partial block: generate keystream, save residue in n */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-at-a-time fallback (small footprint / misaligned) */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1035
/*
 * CRYPTO_gcm128_decrypt: CTR-decrypt |len| bytes from |in| to |out|.
 * The CIPHERTEXT (input) is folded into the GHASH state Xi — note this
 * is the mirror image of encrypt, which hashes its output.  Partial
 * block state is carried in ctx->mres across calls.  Returns 0 on
 * success, -1 if the message limit of 2^36-32 bytes is exceeded.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* SP 800-38D message limit; second clause catches u64 wrap */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* finish the block started in a previous call;
			 * ciphertext byte must be saved before out is
			 * written in case in and out alias */
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* aligned word access required below; else byte fallback */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* hash the ciphertext chunk first, then decrypt it */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* remaining whole blocks, same hash-then-decrypt order */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* per-block multiplication when no bulk GHASH available */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		/* trailing partial block; residue count kept in n */
		if (len) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* portable byte-at-a-time fallback */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1188
/*
 * CRYPTO_gcm128_encrypt_ctr32: like CRYPTO_gcm128_encrypt, but the bulk
 * counter-mode work is delegated to |stream|, a multi-block CTR routine
 * (typically hardware-accelerated) that encrypts whole 16-byte blocks
 * and advances the 32-bit counter itself.  Ciphertext is hashed into Xi
 * after encryption.  Returns 0 on success, -1 on message-length overflow.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* SP 800-38D message limit; second clause catches u64 wrap */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* drain keystream bytes left over from a previous call */
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* stream-encrypt a chunk, then hash the ciphertext in one call */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		/* mirror the counter advanced by |stream| back into Yi */
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* final partial block via the scalar block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1287
/*
 * CRYPTO_gcm128_decrypt_ctr32: decrypt counterpart of
 * CRYPTO_gcm128_encrypt_ctr32.  The ciphertext (input) is hashed into
 * Xi BEFORE being decrypted by the multi-block |stream| routine.
 * Returns 0 on success, -1 on message-length overflow.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	/* SP 800-38D message limit; second clause catches u64 wrap */
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
	if (n) {
		/* finish the partial block from the previous call; save the
		 * ciphertext byte before writing out in case of aliasing */
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* hash the ciphertext chunk, then stream-decrypt it */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		/* mirror the counter advanced by |stream| back into Yi */
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* remaining whole blocks */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* restore j and in for the stream call below */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* final partial block via the scalar block cipher */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1393
1394 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1395                         size_t len)
1396 {
1397         const union { long one; char little; } is_endian = {1};
1398         u64 alen = ctx->len.u[0]<<3;
1399         u64 clen = ctx->len.u[1]<<3;
1400 #ifdef GCM_FUNCREF_4BIT
1401         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1402 #endif
1403
1404         if (ctx->mres)
1405                 GCM_MUL(ctx,Xi);
1406
1407         if (is_endian.little) {
1408 #ifdef BSWAP8
1409                 alen = BSWAP8(alen);
1410                 clen = BSWAP8(clen);
1411 #else
1412                 u8 *p = ctx->len.c;
1413
1414                 ctx->len.u[0] = alen;
1415                 ctx->len.u[1] = clen;
1416
1417                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1418                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1419 #endif
1420         }
1421
1422         ctx->Xi.u[0] ^= alen;
1423         ctx->Xi.u[1] ^= clen;
1424         GCM_MUL(ctx,Xi);
1425
1426         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1427         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1428
1429         if (tag && len<=sizeof(ctx->Xi))
1430                 return memcmp(ctx->Xi.c,tag,len);
1431         else
1432                 return -1;
1433 }
1434
1435 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1436 {
1437         CRYPTO_gcm128_finish(ctx, NULL, 0);
1438         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1439 }
1440
1441 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1442 {
1443         GCM128_CONTEXT *ret;
1444
1445         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1446                 CRYPTO_gcm128_init(ret,key,block);
1447
1448         return ret;
1449 }
1450
1451 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1452 {
1453         if (ctx) {
1454                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1455                 OPENSSL_free(ctx);
1456         }
1457 }
1458
1459 #if defined(SELFTEST)
1460 #include <stdio.h>
1461 #include <openssl/aes.h>
1462
1463 /* Test Case 1 */
1464 static const u8 K1[16],
1465                 *P1=NULL,
1466                 *A1=NULL,
1467                 IV1[12],
1468                 *C1=NULL,
1469                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1470
1471 /* Test Case 2 */
1472 #define K2 K1
1473 #define A2 A1
1474 #define IV2 IV1
1475 static const u8 P2[16],
1476                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1477                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1478
1479 /* Test Case 3 */
1480 #define A3 A2
1481 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1482                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1483                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1484                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1485                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1486                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1487                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1488                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1489                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1490                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1491                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1492
1493 /* Test Case 4 */
1494 #define K4 K3
1495 #define IV4 IV3
1496 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1497                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1498                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1499                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1500                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1501                         0xab,0xad,0xda,0xd2},
1502                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1503                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1504                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1505                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1506                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1507
1508 /* Test Case 5 */
1509 #define K5 K4
1510 #define P5 P4
1511 #define A5 A4
1512 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1513                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1514                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1515                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1516                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1517                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1518
1519 /* Test Case 6 */
1520 #define K6 K5
1521 #define P6 P5
1522 #define A6 A5
1523 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1524                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1525                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1526                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1527                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1528                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1529                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1530                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1531                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1532
/* Test Case 7: all-zero 192-bit key, all-zero 96-bit IV, no AAD, no
 * plaintext -- tag-only operation; T7 is the expected authentication tag. */
static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
                *C7=NULL,
                T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8: same zero 192-bit key, zero IV and empty AAD as case 7,
 * but with a single all-zero 16-byte plaintext block. */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
                C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
                T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};

/* Test Case 9: 192-bit key, 96-bit IV, 64-byte plaintext, no AAD. */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
                P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
                T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10: same 192-bit key and 96-bit IV as case 9, 60-byte
 * (non-block-aligned) plaintext plus 20 bytes of AAD. */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                        0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                        0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                        0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
                T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11: same key/plaintext/AAD as case 10 but with a short
 * 8-byte (64-bit) IV, exercising the non-96-bit-IV GHASH path. */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                        0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                        0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                        0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
                T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};

/* Test Case 12: same key/plaintext/AAD as case 11 but with a long
 * 60-byte IV, also taking the non-96-bit-IV path. */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                        0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                        0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                        0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
                T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1603
/* Test Case 13: all-zero 256-bit key (K13[32]), all-zero 96-bit IV,
 * no AAD, no plaintext -- tag-only operation. */
static const u8 K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
                *C13=NULL,
                T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14: same zero 256-bit key and empty AAD as case 13, with a
 * single all-zero 16-byte plaintext block and zero 96-bit IV. */
#define K14 K13
#define A14 A13
static const u8 P14[16],
                IV14[12],
                C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
                T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};

/* Test Case 15: 256-bit key, 96-bit IV, 64-byte plaintext, no AAD. */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                        0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
                P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
                IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
                C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
                T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16: same 256-bit key and 96-bit IV as case 15, 60-byte
 * (non-block-aligned) plaintext plus 20 bytes of AAD. */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                        0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                        0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                        0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
                A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                        0xab,0xad,0xda,0xd2},
                C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                        0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                        0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                        0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
                T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};

/* Test Case 17: same key/plaintext/AAD as case 16 but with a short
 * 8-byte (64-bit) IV, exercising the non-96-bit-IV GHASH path. */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
                C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                        0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                        0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                        0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
                T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18: same key/plaintext/AAD as case 17 but with a long
 * 60-byte IV, also taking the non-96-bit-IV path. */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                        0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                        0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                        0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
                C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                        0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                        0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                        0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
                T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1674
/*
 * Run GCM test case <n> in both directions.  Encrypt: key with K<n>,
 * set IV<n>, feed AAD A<n> and plaintext P<n>, then check the tag
 * against T<n> and the ciphertext against C<n>.  Decrypt: same setup,
 * feed C<n> and check the tag and the recovered plaintext against P<n>.
 * A NULL P/A/C pointer means that input is absent for this case (the
 * corresponding call and comparison are skipped).  Each failing
 * direction increments the enclosing 'ret' counter and prints a
 * diagnostic; 'ctx' and 'key' must be in scope at the expansion site.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        if (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)
1694
1695 int main()
1696 {
1697         GCM128_CONTEXT ctx;
1698         AES_KEY key;
1699         int ret=0;
1700
1701         TEST_CASE(1);
1702         TEST_CASE(2);
1703         TEST_CASE(3);
1704         TEST_CASE(4);
1705         TEST_CASE(5);
1706         TEST_CASE(6);
1707         TEST_CASE(7);
1708         TEST_CASE(8);
1709         TEST_CASE(9);
1710         TEST_CASE(10);
1711         TEST_CASE(11);
1712         TEST_CASE(12);
1713         TEST_CASE(13);
1714         TEST_CASE(14);
1715         TEST_CASE(15);
1716         TEST_CASE(16);
1717         TEST_CASE(17);
1718         TEST_CASE(18);
1719
1720 #ifdef OPENSSL_CPUID_OBJ
1721         {
1722         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1723         union { u64 u; u8 c[1024]; } buf;
1724         int i;
1725
1726         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1727         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1728         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1729
1730         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1731         start = OPENSSL_rdtsc();
1732         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1733         gcm_t = OPENSSL_rdtsc() - start;
1734
1735         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1736                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1737                         (block128_f)AES_encrypt);
1738         start = OPENSSL_rdtsc();
1739         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1740                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1741                         (block128_f)AES_encrypt);
1742         ctr_t = OPENSSL_rdtsc() - start;
1743
1744         printf("%.2f-%.2f=%.2f\n",
1745                         gcm_t/(double)sizeof(buf),
1746                         ctr_t/(double)sizeof(buf),
1747                         (gcm_t-ctr_t)/(double)sizeof(buf));
1748 #ifdef GHASH
1749         GHASH(&ctx,buf.c,sizeof(buf));
1750         start = OPENSSL_rdtsc();
1751         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1752         gcm_t = OPENSSL_rdtsc() - start;
1753         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1754 #endif
1755         }
1756 #endif
1757
1758         return ret;
1759 }
1760 #endif