ghash-sparcv9.pl: add VIS3 code path.
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK() positions a 16-bit reduction constant in the most significant
 * 16 bits of a size_t, so the rem_4bit/rem_8bit tables below work for
 * both 32- and 64-bit size_t.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT() shifts the 128-bit value V right by one bit in GF(2^128),
 * folding the shifted-out bit back in with the GCM reduction polynomial
 * (0xE1 followed by 120 zero bits). The sizeof(size_t) test is resolved
 * at compile time and merely selects the variant whose constants suit
 * the native word size; both branches compute the same result.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
/*
 * "8-bit" table-driven GHASH multiplication: Xi = Xi * H in GF(2^128).
 * Xi holds the running hash in big-endian byte order; Htable is the
 * 256-entry table built by gcm_init_8bit. The result is written back
 * to Xi, again big-endian.
 */
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;       /* walk Xi from the last byte */
        size_t rem, n = *xi;
        const union { long one; char little; } is_endian = {1};
        /*
         * rem_8bit[b] is the 16-bit reduction constant for the byte b
         * shifted out on each 8-bit step, pre-positioned in the top bits
         * of a size_t by PACK().
         */
        __fips_constseg
        static const size_t rem_8bit[256] = {
                PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
                PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
                PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
                PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
                PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
                PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
                PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
                PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
                PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
                PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
                PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
                PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
                PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
                PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
                PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
                PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
                PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
                PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
                PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
                PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
                PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
                PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
                PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
                PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
                PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
                PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
                PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
                PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
                PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
                PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
                PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
                PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
                PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
                PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
                PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
                PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
                PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
                PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
                PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
                PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
                PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
                PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
                PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
                PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
                PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
                PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
                PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
                PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
                PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
                PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
                PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
                PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
                PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
                PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
                PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
                PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
                PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
                PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
                PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
                PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
                PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
                PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
                PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
                PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

        while (1) {
                Z.hi ^= Htable[n].hi;
                Z.lo ^= Htable[n].lo;

                if ((u8 *)Xi==xi)       break;  /* all 16 bytes consumed */

                n = *(--xi);

                /* shift Z right 8 bits and fold the dropped byte back in */
                rem  = (size_t)Z.lo&0xff;
                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_8bit[rem];
                else
                        Z.hi ^= (u64)rem_8bit[rem]<<32;
        }

        /* store Z back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
/*
 * Build the 16-entry lookup table for the "4-bit" GHASH path.
 * H (the hash key, in host byte order) seeds the table. Single-bit
 * indices (8,4,2,1) hold successive one-bit reductions of H; every
 * other index is the XOR of the corresponding single-bit entries.
 */
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
        u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
        int  i;
#endif

        Htable[0].hi = 0;
        Htable[0].lo = 0;
        V.hi = H[0];
        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
        /* compact, loop-driven variant of the unrolled code below */
        for (Htable[8]=V, i=4; i>0; i>>=1) {
                REDUCE1BIT(V);
                Htable[i] = V;
        }

        for (i=2; i<16; i<<=1) {
                u128 *Hi = Htable+i;
                int   j;
                for (V=*Hi, j=1; j<i; ++j) {
                        Hi[j].hi = V.hi^Htable[j].hi;
                        Hi[j].lo = V.lo^Htable[j].lo;
                }
        }
#else
        /* fully unrolled: single-bit entries first... */
        Htable[8] = V;
        REDUCE1BIT(V);
        Htable[4] = V;
        REDUCE1BIT(V);
        Htable[2] = V;
        REDUCE1BIT(V);
        Htable[1] = V;
        /* ...then the remaining entries as XOR combinations */
        Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
        V=Htable[4];
        Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
        Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
        Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
        V=Htable[8];
        Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
        Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
        Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
        Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
        Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
        Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
        Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
        /*
         * ARM assembler expects specific dword order in Htable.
         */
        {
        int j;
        const union { long one; char little; } is_endian = {1};

        if (is_endian.little)
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo;
                        Htable[j].lo = V.hi;
                }
        else
                for (j=0;j<16;++j) {
                        V = Htable[j];
                        Htable[j].hi = V.lo<<32|V.lo>>32;
                        Htable[j].lo = V.hi<<32|V.hi>>32;
                }
        }
#endif
}
328
329 #ifndef GHASH_ASM
/*
 * rem_4bit[n] is the reduction constant for the nibble n shifted out on
 * each 4-bit step, pre-positioned in the top 16 bits by PACK().
 */
__fips_constseg
static const size_t rem_4bit[16] = {
        PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
        PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
        PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
        PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };

/*
 * "4-bit" table-driven GHASH multiplication: Xi = Xi * H in GF(2^128).
 * Processes Xi (big-endian) a nibble at a time, low nibble then high
 * nibble of each byte, from the last byte to the first. Result is
 * written back to Xi in big-endian order.
 */
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
        u128 Z;
        int cnt = 15;
        size_t rem, nlo, nhi;
        const union { long one; char little; } is_endian = {1};

        nlo  = ((const u8 *)Xi)[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                /* shift Z right 4 bits, fold the dropped nibble, add high nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;  /* all 16 bytes consumed */

                nlo  = ((const u8 *)Xi)[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                /* same shift/fold step, now adding the low nibble */
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }

        /* store Z back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
/*
 * Streamed "4-bit" GHASH: absorbs |len| bytes (len is a positive
 * multiple of 16, enforced by the callers' do/while structure) from
 * |inp| into the running hash Xi, i.e. Xi = (Xi ^ block) * H per
 * 16-byte block. Two implementations are provided below, selected by
 * the "#if 1": the straightforward per-nibble loop and an alternative
 * that trades extra per-call tables for fewer reduction steps.
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        /* same nibble-at-a-time walk as gcm_gmult_4bit, but with the
         * input block XORed into Xi on the fly */
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nhi].hi;
                Z.lo ^= Htable[nhi].lo;

                if (--cnt<0)            break;

                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                rem  = (size_t)Z.lo&0xf;
                Z.lo = (Z.hi<<60)|(Z.lo>>4);
                Z.hi = (Z.hi>>4);
                if (sizeof(size_t)==8)
                        Z.hi ^= rem_4bit[rem];
                else
                        Z.hi ^= (u64)rem_4bit[rem]<<32;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    __fips_constseg
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
                nlo  = ((const u8 *)Xi)[cnt];
                nlo ^= inp[cnt];
                nhi  = nlo>>4;
                nlo &= 0xf;

                Z.hi ^= Htable[nlo].hi;
                Z.lo ^= Htable[nlo].lo;

                rem = (size_t)Z.lo&0xff;

                Z.lo = (Z.hi<<56)|(Z.lo>>8);
                Z.hi = (Z.hi>>8);

                Z.hi ^= Hshr4[nhi].hi;
                Z.lo ^= Hshr4[nhi].lo;
                Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        /* byte 0 needs only a 4-bit final shift, handled separately */
        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        /* store Z back into Xi (big-endian) before the next block */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581  * trashing effect. In other words idea is to hash data while it's
582  * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
/*
 * Bit-at-a-time GHASH multiplication (TABLE_BITS==1): Xi = Xi * H in
 * GF(2^128) with no lookup table. Xi is big-endian on input and output.
 */
static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
        u128 V,Z = { 0,0 };
        long X;
        int  i,j;
        const long *xi = (const long *)Xi;
        const union { long one; char little; } is_endian = {1};

        V.hi = H[0];    /* H is in host byte order, no byte swapping */
        V.lo = H[1];

        /* consume Xi one machine word at a time, most significant first */
        for (j=0; j<16/sizeof(long); ++j) {
                if (is_endian.little) {
                        if (sizeof(long)==8) {
#ifdef BSWAP8
                                X = (long)(BSWAP8(xi[j]));
#else
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
                        }
                        else {
                                const u8 *p = (const u8 *)(xi+j);
                                X = (long)GETU32(p);
                        }
                }
                else
                        X = xi[j];

                /*
                 * For each bit of X (MSB first): M is an all-ones mask
                 * when the bit is set (arithmetic right shift of the
                 * signed word replicates the sign bit), so V is
                 * conditionally accumulated without branching.
                 */
                for (i=0; i<8*sizeof(long); ++i, X<<=1) {
                        u64 M = (u64)(X>>(8*sizeof(long)-1));
                        Z.hi ^= V.hi&M;
                        Z.lo ^= V.lo&M;

                        REDUCE1BIT(V);
                }
        }

        /* store Z back into Xi in big-endian byte order */
        if (is_endian.little) {
#ifdef BSWAP8
                Xi[0] = BSWAP8(Z.hi);
                Xi[1] = BSWAP8(Z.lo);
#else
                u8 *p = (u8 *)Xi;
                u32 v;
                v = (u32)(Z.hi>>32);    PUTU32(p,v);
                v = (u32)(Z.hi);        PUTU32(p+4,v);
                v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
                v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
                Xi[0] = Z.hi;
                Xi[1] = Z.lo;
        }
}
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
/*
 * Platform-specific assembler GHASH implementations. Each branch
 * declares the routines provided by the corresponding perlasm module
 * and defines GCM_FUNCREF_4BIT so that GCM_MUL/GHASH below dispatch
 * through function pointers chosen at runtime in CRYPTO_gcm128_init.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM)
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__arm__) || defined(__arm)
#  include "arm_arch.h"
#  if __ARM_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
#endif

/*
 * When runtime dispatch is in use, GCM_MUL/GHASH call through the
 * local function pointers gcm_gmult_p/gcm_ghash_p, which each caller
 * loads from ctx->gmult/ctx->ghash.
 */
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif

695
/*
 * Initialize |ctx| for GCM over the raw |block| cipher keyed with
 * |key|: derive the hash subkey H = E_K(0^128), convert it to host
 * byte order, and pick the fastest GHASH implementation available on
 * this platform (precomputing the matching Htable where needed).
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
        const union { long one; char little; } is_endian = {1};

        memset(ctx,0,sizeof(*ctx));
        ctx->block = block;
        ctx->key   = key;

        /* H = E_K(0^128); ctx->H.c is all-zero after the memset above */
        (*block)(ctx->H.c,ctx->H.c,key);

        if (is_endian.little) {
                /* H is stored in host byte order */
#ifdef BSWAP8
                ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
                ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
                u8 *p = ctx->H.c;
                u64 hi,lo;
                hi = (u64)GETU32(p)  <<32|GETU32(p+4);
                lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
                ctx->H.u[0] = hi;
                ctx->H.u[1] = lo;
#endif
        }

#if     TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
            OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
                gcm_init_clmul(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_clmul;
                ctx->ghash = gcm_ghash_clmul;
                return;
        }
#  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)                  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
        if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
#   else
        if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
                ctx->gmult = gcm_gmult_4bit_x86;
                ctx->ghash = gcm_ghash_4bit_x86;
        }
#  else
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
#  endif
# elif  defined(GHASH_ASM_ARM)
        if (OPENSSL_armcap_P & ARMV7_NEON) {
                ctx->gmult = gcm_gmult_neon;
                ctx->ghash = gcm_ghash_neon;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# elif  defined(GHASH_ASM_SPARC)
        if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
                ctx->gmult = gcm_gmult_vis3;
                ctx->ghash = gcm_ghash_vis3;
        } else {
                gcm_init_4bit(ctx->Htable,ctx->H.u);
                ctx->gmult = gcm_gmult_4bit;
                ctx->ghash = gcm_ghash_4bit;
        }
# else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
774
/*
 * CRYPTO_gcm128_setiv resets the per-message state and absorbs the IV.
 * A 12-byte IV is used directly as Y0 with the 32-bit counter field set
 * to 1 (the common fast path); any other length is processed through
 * GHASH together with its bit length, as GCM specifies. Finally
 * EK0 = E(K,Y0) is computed for later use in tag generation, and the
 * counter is advanced so that encryption starts with Y1.
 */
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int ctr;
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL expands to a call through this local when
	 * GCM_FUNCREF_4BIT is defined, so the name must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
#endif

	/* Reset all per-message state; H/Htable from init are retained. */
	ctx->Yi.u[0]  = 0;
	ctx->Yi.u[1]  = 0;
	ctx->Xi.u[0]  = 0;
	ctx->Xi.u[1]  = 0;
	ctx->len.u[0] = 0;	/* AAD length */
	ctx->len.u[1] = 0;	/* message length */
	ctx->ares = 0;
	ctx->mres = 0;

	if (len==12) {
		/* Y0 = IV || 0^31 || 1 */
		memcpy(ctx->Yi.c,iv,12);
		ctx->Yi.c[15]=1;
		ctr=1;
	}
	else {
		/* Y0 = GHASH(IV padded to a block boundary, then the
		 * 64-bit IV bit length in the low half of a final block). */
		size_t i;
		u64 len0 = len;

		while (len>=16) {
			for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
			iv += 16;
			len -= 16;
		}
		if (len) {
			for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
			GCM_MUL(ctx,Yi);
		}
		len0 <<= 3;	/* IV length in bits */
		if (is_endian.little) {
#ifdef BSWAP8
			ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
			ctx->Yi.c[8]  ^= (u8)(len0>>56);
			ctx->Yi.c[9]  ^= (u8)(len0>>48);
			ctx->Yi.c[10] ^= (u8)(len0>>40);
			ctx->Yi.c[11] ^= (u8)(len0>>32);
			ctx->Yi.c[12] ^= (u8)(len0>>24);
			ctx->Yi.c[13] ^= (u8)(len0>>16);
			ctx->Yi.c[14] ^= (u8)(len0>>8);
			ctx->Yi.c[15] ^= (u8)(len0);
#endif
		}
		else
			ctx->Yi.u[1]  ^= len0;

		GCM_MUL(ctx,Yi);

		if (is_endian.little)
			ctr = GETU32(ctx->Yi.c+12);
		else
			ctr = ctx->Yi.d[3];
	}

	/* EK0 is xored into the final hash value to produce the tag. */
	(*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
	++ctr;
	/* Advance the 32-bit big-endian counter: first data block uses Y1. */
	if (is_endian.little)
		PUTU32(ctx->Yi.c+12,ctr);
	else
		ctx->Yi.d[3] = ctr;
}
844
/*
 * CRYPTO_gcm128_aad feeds additional authenticated data into GHASH.
 * It may be called repeatedly, but only before any message data is
 * processed for the current IV. A partial trailing block is buffered
 * in Xi with its length recorded in ctx->ares.
 * Returns 0 on success, -2 if message data was already processed,
 * -1 if the accumulated AAD length exceeds the GCM limit.
 */
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
	size_t i;
	unsigned int n;
	u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL/GHASH expand to calls through these locals when
	 * GCM_FUNCREF_4BIT is defined; the names must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	/* AAD must be supplied before any encrypt/decrypt call. */
	if (ctx->len.u[1]) return -2;

	alen += len;
	/* 2^61 bytes = 2^64 bits, GCM's AAD limit; also catch u64 wrap. */
	if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
		return -1;
	ctx->len.u[0] = alen;

	/* Complete a partial block carried over from a previous call. */
	n = ctx->ares;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(aad++);
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->ares = n;
			return 0;
		}
	}

#ifdef GHASH
	/* Hash all whole 16-byte blocks in a single bulk call. */
	if ((i = (len&(size_t)-16))) {
		GHASH(ctx,aad,i);
		aad += i;
		len -= i;
	}
#else
	while (len>=16) {
		for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
		GCM_MUL(ctx,Xi);
		aad += 16;
		len -= 16;
	}
#endif
	/* Buffer a trailing partial block in Xi; n records how much. */
	if (len) {
		n = (unsigned int)len;
		for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
	}

	ctx->ares = n;
	return 0;
}
901
/*
 * CRYPTO_gcm128_encrypt CTR-encrypts |len| bytes from |in| to |out|
 * (in-place operation is supported) and folds the resulting ciphertext
 * into GHASH. It may be called repeatedly; a partial final block is
 * carried over via ctx->mres, with its key stream buffered in EKi.
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL/GHASH expand to calls through these locals when
	 * GCM_FUNCREF_4BIT is defined; the names must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

#if 0
	n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Pick up the 32-bit big-endian counter from Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* Finish a partial block from a previous call using
			 * the key stream already buffered in EKi. */
			while (n && len) {
				ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL(ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below need aligned pointers; otherwise
		 * fall through to the byte-wise loop after the do/while. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Encrypt a GHASH_CHUNK-sized stretch, then hash that
		 * stretch of ciphertext with one bulk GHASH call. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks, hashed in one call at the end. */
		if ((i = (len&(size_t)-16))) {
		    size_t j=i;

		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		    GHASH(ctx,out-j,j);
		}
#else
		/* No bulk GHASH available: hash each ciphertext block as
		 * it is produced. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(ctx->Xi.c+i) ^=
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			/* Trailing partial block: generate one block of key
			 * stream and record the residue in mres. */
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Byte-wise fallback: small-footprint builds, or unaligned
	 * pointers under STRICT_ALIGNMENT. */
	for (i=0;i<len;++i) {
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1051
/*
 * CRYPTO_gcm128_decrypt CTR-decrypts |len| bytes from |in| to |out|
 * (in-place operation is supported). GHASH is computed over the
 * ciphertext, so each input block is hashed before (or as) it is
 * decrypted. Partial final blocks are carried over via ctx->mres.
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64        mlen  = ctx->len.u[1];
	block128_f block = ctx->block;
	void      *key   = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL/GHASH expand to calls through these locals when
	 * GCM_FUNCREF_4BIT is defined; the names must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Pick up the 32-bit big-endian counter from Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
	if (16%sizeof(size_t) == 0) do {	/* always true actually */
		if (n) {
			/* Finish a partial block from a previous call; the
			 * ciphertext byte is hashed, then decrypted. */
			while (n && len) {
				u8 c = *(in++);
				*(out++) = c^ctx->EKi.c[n];
				ctx->Xi.c[n] ^= c;
				--len;
				n = (n+1)%16;
			}
			if (n==0) GCM_MUL (ctx,Xi);
			else {
				ctx->mres = n;
				return 0;
			}
		}
#if defined(STRICT_ALIGNMENT)
		/* Word-sized loads below need aligned pointers; otherwise
		 * fall through to the byte-wise loop after the do/while. */
		if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
			break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
		/* Hash a GHASH_CHUNK-sized stretch of ciphertext first,
		 * then decrypt it block by block. */
		while (len>=GHASH_CHUNK) {
		    size_t j=GHASH_CHUNK;

		    GHASH(ctx,in,GHASH_CHUNK);
		    while (j) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			j   -= 16;
		    }
		    len -= GHASH_CHUNK;
		}
		/* Remaining whole blocks: hash them in one call, then
		 * decrypt. */
		if ((i = (len&(size_t)-16))) {
		    GHASH(ctx,in,i);
		    while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t))
				*(size_t *)(out+i) =
				*(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
			out += 16;
			in  += 16;
			len -= 16;
		    }
		}
#else
		/* No bulk GHASH available: hash each ciphertext block as
		 * it is consumed. */
		while (len>=16) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			for (i=0; i<16; i+=sizeof(size_t)) {
				size_t c = *(size_t *)(in+i);
				*(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
				*(size_t *)(ctx->Xi.c+i) ^= c;
			}
			GCM_MUL(ctx,Xi);
			out += 16;
			in  += 16;
			len -= 16;
		}
#endif
		if (len) {
			/* Trailing partial block: generate one block of key
			 * stream and record the residue in mres. */
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
			while (len--) {
				u8 c = in[n];
				ctx->Xi.c[n] ^= c;
				out[n] = c^ctx->EKi.c[n];
				++n;
			}
		}

		ctx->mres = n;
		return 0;
	} while(0);
#endif
	/* Byte-wise fallback: small-footprint builds, or unaligned
	 * pointers under STRICT_ALIGNMENT. */
	for (i=0;i<len;++i) {
		u8 c;
		if (n==0) {
			(*block)(ctx->Yi.c,ctx->EKi.c,key);
			++ctr;
			if (is_endian.little)
				PUTU32(ctx->Yi.c+12,ctr);
			else
				ctx->Yi.d[3] = ctr;
		}
		c = in[i];
		out[i] = c^ctx->EKi.c[n];
		ctx->Xi.c[n] ^= c;
		n = (n+1)%16;
		if (n==0)
			GCM_MUL(ctx,Xi);
	}

	ctx->mres = n;
	return 0;
}
1204
/*
 * CRYPTO_gcm128_encrypt_ctr32 has the same effect as
 * CRYPTO_gcm128_encrypt, but delegates bulk CTR encryption to |stream|,
 * a routine that processes a whole number of 16-byte blocks with a
 * 32-bit big-endian counter (typically an optimized AES-CTR kernel).
 * Returns 0 on success, -1 if the total message length would exceed
 * GCM's limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len, ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL/GHASH expand to calls through these locals when
	 * GCM_FUNCREF_4BIT is defined; the names must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to encrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Pick up the 32-bit big-endian counter from Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial block from a previous call using the key
	 * stream buffered in EKi. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL(ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Encrypt GHASH_CHUNK-sized stretches with |stream|, then hash
	 * each stretch of ciphertext with one bulk GHASH call. */
	while (len>=GHASH_CHUNK) {
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		GHASH(ctx,out,GHASH_CHUNK);
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		in  += i;
		len -= i;
#if defined(GHASH)
		GHASH(ctx,out,i);
		out += i;
#else
		while (j--) {
			for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
			GCM_MUL(ctx,Xi);
			out += 16;
		}
#endif
	}
	/* Trailing partial block: one block cipher call for the key
	 * stream; residue recorded in mres. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1303
/*
 * CRYPTO_gcm128_decrypt_ctr32 has the same effect as
 * CRYPTO_gcm128_decrypt, but delegates bulk CTR decryption to |stream|,
 * a routine that processes a whole number of 16-byte blocks with a
 * 32-bit big-endian counter. Ciphertext is hashed before it is
 * decrypted. Returns 0 on success, -1 if the total message length
 * would exceed GCM's limit of 2^36-32 bytes.
 */
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
		const unsigned char *in, unsigned char *out,
		size_t len,ctr128_f stream)
{
	const union { long one; char little; } is_endian = {1};
	unsigned int n, ctr;
	size_t i;
	u64   mlen = ctx->len.u[1];
	void *key  = ctx->key;
#ifdef GCM_FUNCREF_4BIT
	/* GCM_MUL/GHASH expand to calls through these locals when
	 * GCM_FUNCREF_4BIT is defined; the names must stay as is. */
	void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])	= ctx->gmult;
# ifdef GHASH
	void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
				const u8 *inp,size_t len)	= ctx->ghash;
# endif
#endif

	mlen += len;
	if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
		return -1;
	ctx->len.u[1] = mlen;

	if (ctx->ares) {
		/* First call to decrypt finalizes GHASH(AAD) */
		GCM_MUL(ctx,Xi);
		ctx->ares = 0;
	}

	/* Pick up the 32-bit big-endian counter from Yi. */
	if (is_endian.little)
		ctr = GETU32(ctx->Yi.c+12);
	else
		ctr = ctx->Yi.d[3];

	/* Finish a partial block from a previous call; hash the
	 * ciphertext byte, then decrypt it. */
	n = ctx->mres;
	if (n) {
		while (n && len) {
			u8 c = *(in++);
			*(out++) = c^ctx->EKi.c[n];
			ctx->Xi.c[n] ^= c;
			--len;
			n = (n+1)%16;
		}
		if (n==0) GCM_MUL (ctx,Xi);
		else {
			ctx->mres = n;
			return 0;
		}
	}
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
	/* Hash a GHASH_CHUNK-sized stretch of ciphertext, then decrypt
	 * it with |stream| in one call. */
	while (len>=GHASH_CHUNK) {
		GHASH(ctx,in,GHASH_CHUNK);
		(*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
		ctr += GHASH_CHUNK/16;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += GHASH_CHUNK;
		in  += GHASH_CHUNK;
		len -= GHASH_CHUNK;
	}
#endif
	/* Remaining whole blocks. */
	if ((i = (len&(size_t)-16))) {
		size_t j=i/16;

#if defined(GHASH)
		GHASH(ctx,in,i);
#else
		while (j--) {
			size_t k;
			for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
			GCM_MUL(ctx,Xi);
			in += 16;
		}
		/* Restore j and in for the stream call below. */
		j   = i/16;
		in -= i;
#endif
		(*stream)(in,out,j,key,ctx->Yi.c);
		ctr += (unsigned int)j;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		out += i;
		in  += i;
		len -= i;
	}
	/* Trailing partial block: one block cipher call for the key
	 * stream; residue recorded in mres. */
	if (len) {
		(*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
		++ctr;
		if (is_endian.little)
			PUTU32(ctx->Yi.c+12,ctr);
		else
			ctx->Yi.d[3] = ctr;
		while (len--) {
			u8 c = in[n];
			ctx->Xi.c[n] ^= c;
			out[n] = c^ctx->EKi.c[n];
			++n;
		}
	}

	ctx->mres = n;
	return 0;
}
1409
1410 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1411                         size_t len)
1412 {
1413         const union { long one; char little; } is_endian = {1};
1414         u64 alen = ctx->len.u[0]<<3;
1415         u64 clen = ctx->len.u[1]<<3;
1416 #ifdef GCM_FUNCREF_4BIT
1417         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1418 #endif
1419
1420         if (ctx->mres || ctx->ares)
1421                 GCM_MUL(ctx,Xi);
1422
1423         if (is_endian.little) {
1424 #ifdef BSWAP8
1425                 alen = BSWAP8(alen);
1426                 clen = BSWAP8(clen);
1427 #else
1428                 u8 *p = ctx->len.c;
1429
1430                 ctx->len.u[0] = alen;
1431                 ctx->len.u[1] = clen;
1432
1433                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1434                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1435 #endif
1436         }
1437
1438         ctx->Xi.u[0] ^= alen;
1439         ctx->Xi.u[1] ^= clen;
1440         GCM_MUL(ctx,Xi);
1441
1442         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1443         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1444
1445         if (tag && len<=sizeof(ctx->Xi))
1446                 return memcmp(ctx->Xi.c,tag,len);
1447         else
1448                 return -1;
1449 }
1450
1451 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1452 {
1453         CRYPTO_gcm128_finish(ctx, NULL, 0);
1454         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1455 }
1456
1457 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1458 {
1459         GCM128_CONTEXT *ret;
1460
1461         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1462                 CRYPTO_gcm128_init(ret,key,block);
1463
1464         return ret;
1465 }
1466
1467 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1468 {
1469         if (ctx) {
1470                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1471                 OPENSSL_free(ctx);
1472         }
1473 }
1474
1475 #if defined(SELFTEST)
1476 #include <stdio.h>
1477 #include <openssl/aes.h>
1478
1479 /* Test Case 1 */
1480 static const u8 K1[16],
1481                 *P1=NULL,
1482                 *A1=NULL,
1483                 IV1[12],
1484                 *C1=NULL,
1485                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1486
1487 /* Test Case 2 */
1488 #define K2 K1
1489 #define A2 A1
1490 #define IV2 IV1
1491 static const u8 P2[16],
1492                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1493                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1494
1495 /* Test Case 3 */
1496 #define A3 A2
1497 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1498                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1499                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1500                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1501                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1502                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1503                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1504                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1505                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1506                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1507                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1508
1509 /* Test Case 4 */
1510 #define K4 K3
1511 #define IV4 IV3
1512 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1513                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1514                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1515                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1516                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1517                         0xab,0xad,0xda,0xd2},
1518                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1519                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1520                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1521                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1522                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1523
1524 /* Test Case 5 */
1525 #define K5 K4
1526 #define P5 P4
1527 #define A5 A4
1528 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1529                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1530                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1531                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1532                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1533                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1534
1535 /* Test Case 6 */
1536 #define K6 K5
1537 #define P6 P5
1538 #define A6 A5
1539 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1540                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1541                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1542                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1543                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1544                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1545                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1546                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1547                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1548
1549 /* Test Case 7 */
1550 static const u8 K7[24],
1551                 *P7=NULL,
1552                 *A7=NULL,
1553                 IV7[12],
1554                 *C7=NULL,
1555                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1556
1557 /* Test Case 8 */
1558 #define K8 K7
1559 #define IV8 IV7
1560 #define A8 A7
1561 static const u8 P8[16],
1562                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1563                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1564
1565 /* Test Case 9 */
1566 #define A9 A8
1567 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1568                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1569                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1570                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1571                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1572                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1573                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1574                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1575                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1576                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1577                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1578                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1579
1580 /* Test Case 10 */
1581 #define K10 K9
1582 #define IV10 IV9
1583 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1584                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1585                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1586                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1587                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1588                         0xab,0xad,0xda,0xd2},
1589                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1590                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1591                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1592                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1593                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1594
1595 /* Test Case 11 */
1596 #define K11 K10
1597 #define P11 P10
1598 #define A11 A10
1599 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1600                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1601                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1602                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1603                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1604                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1605
1606 /* Test Case 12 */
1607 #define K12 K11
1608 #define P12 P11
1609 #define A12 A11
1610 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1611                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1612                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1613                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1614                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1615                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1616                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1617                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1618                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1619
1620 /* Test Case 13 */
1621 static const u8 K13[32],
1622                 *P13=NULL,
1623                 *A13=NULL,
1624                 IV13[12],
1625                 *C13=NULL,
1626                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1627
1628 /* Test Case 14 */
1629 #define K14 K13
1630 #define A14 A13
1631 static const u8 P14[16],
1632                 IV14[12],
1633                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1634                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1635
1636 /* Test Case 15 */
1637 #define A15 A14
1638 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1639                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1640                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1641                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1642                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1643                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1644                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1645                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1646                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1647                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1648                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1649                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1650
1651 /* Test Case 16 */
1652 #define K16 K15
1653 #define IV16 IV15
1654 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1655                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1656                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1657                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1658                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1659                         0xab,0xad,0xda,0xd2},
1660                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1661                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1662                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1663                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1664                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1665
1666 /* Test Case 17 */
1667 #define K17 K16
1668 #define P17 P16
1669 #define A17 A16
1670 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1671                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1672                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1673                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1674                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1675                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1676
1677 /* Test Case 18 */
1678 #define K18 K17
1679 #define P18 P17
1680 #define A18 A17
1681 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1682                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1683                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1684                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1685                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1686                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1687                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1688                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1689                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1690
1691 #define TEST_CASE(n)    do {                                    \
1692         u8 out[sizeof(P##n)];                                   \
1693         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1694         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1695         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1696         memset(out,0,sizeof(out));                              \
1697         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1698         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1699         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1700             (C##n && memcmp(out,C##n,sizeof(out))))             \
1701                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1702         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1703         memset(out,0,sizeof(out));                              \
1704         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1705         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1706         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1707             (P##n && memcmp(out,P##n,sizeof(out))))             \
1708                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1709         } while(0)
1710
1711 int main()
1712 {
1713         GCM128_CONTEXT ctx;
1714         AES_KEY key;
1715         int ret=0;
1716
1717         TEST_CASE(1);
1718         TEST_CASE(2);
1719         TEST_CASE(3);
1720         TEST_CASE(4);
1721         TEST_CASE(5);
1722         TEST_CASE(6);
1723         TEST_CASE(7);
1724         TEST_CASE(8);
1725         TEST_CASE(9);
1726         TEST_CASE(10);
1727         TEST_CASE(11);
1728         TEST_CASE(12);
1729         TEST_CASE(13);
1730         TEST_CASE(14);
1731         TEST_CASE(15);
1732         TEST_CASE(16);
1733         TEST_CASE(17);
1734         TEST_CASE(18);
1735
1736 #ifdef OPENSSL_CPUID_OBJ
1737         {
1738         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1739         union { u64 u; u8 c[1024]; } buf;
1740         int i;
1741
1742         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1743         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1744         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1745
1746         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1747         start = OPENSSL_rdtsc();
1748         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1749         gcm_t = OPENSSL_rdtsc() - start;
1750
1751         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1752                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1753                         (block128_f)AES_encrypt);
1754         start = OPENSSL_rdtsc();
1755         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1756                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1757                         (block128_f)AES_encrypt);
1758         ctr_t = OPENSSL_rdtsc() - start;
1759
1760         printf("%.2f-%.2f=%.2f\n",
1761                         gcm_t/(double)sizeof(buf),
1762                         ctr_t/(double)sizeof(buf),
1763                         (gcm_t-ctr_t)/(double)sizeof(buf));
1764 #ifdef GHASH
1765         {
1766         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1767                                 const u8 *inp,size_t len)       = ctx.ghash;
1768
1769         GHASH((&ctx),buf.c,sizeof(buf));
1770         start = OPENSSL_rdtsc();
1771         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1772         gcm_t = OPENSSL_rdtsc() - start;
1773         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1774         }
1775 #endif
1776         }
1777 #endif
1778
1779         return ret;
1780 }
1781 #endif