a6e2af1b96783d7160981ddf94a1d4a6b38d5777
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #define OPENSSL_FIPSAPI
51
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
54 #include <string.h>
55
56 #ifndef MODES_DEBUG
57 # ifndef NDEBUG
58 #  define NDEBUG
59 # endif
60 #endif
61 #include <assert.h>
62
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
65 #undef  GETU32
66 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
67 #undef  PUTU32
68 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
69 #endif
70
/*
 * PACK(s) places the 16-bit constant s in the 16 most significant bits
 * of a size_t, whatever the width of size_t is.
 */
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
/*
 * REDUCE1BIT(V) shifts the 128-bit value V (an lvalue with 64-bit
 * members .hi and .lo) right by one bit. If the bit shifted out of
 * V.lo was set, 0xe1 is XORed into the most significant byte of V.hi.
 * The mask T is derived arithmetically from the dropped bit, so no
 * data-dependent branch is taken. The two arms differ only in the
 * width of the temporary and produce the same result.
 *
 * V is expanded several times, so it must be free of side effects.
 * It is parenthesized at every use so that expressions such as *p can
 * be passed safely.
 */
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^T; \
        } \
        else { \
                u32 T = 0xe1000000U & (0-(u32)((V).lo&1)); \
                (V).lo  = ((V).hi<<63)|((V).lo>>1); \
                (V).hi  = ((V).hi>>1 )^((u64)T<<32); \
        } \
} while(0)
84
85 /*
86  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87  * never be set to 8. 8 is effectively reserved for testing purposes.
88  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90  * whole spectrum of possible table driven implementations. Why? In
91  * non-"Shoup's" case memory access pattern is segmented in such manner,
92  * that it's trivial to see that cache timing information can reveal
93  * fair portion of intermediate hash value. Given that ciphertext is
94  * always available to attacker, it's possible for him to attempt to
95  * deduce secret parameter H and if successful, tamper with messages
96  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97  * not as trivial, but there is no reason to believe that it's resistant
98  * to cache-timing attack. And the thing about "8-bit" implementation is
99  * that it consumes 16 (sixteen) times more memory, 4KB per individual
100  * key + 1KB shared. Well, on pros side it should be twice as fast as
101  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102  * was observed to run ~75% faster, closer to 100% for commercial
103  * compilers... Yet "4-bit" procedure is preferred, because it's
104  * believed to provide better security-performance balance and adequate
105  * all-round performance. "All-round" refers to things like:
106  *
107  * - shorter setup time effectively improves overall timing for
108  *   handling short messages;
109  * - larger table allocation can become unbearable because of VM
110  *   subsystem penalties (for example on Windows large enough free
111  *   results in VM working set trimming, meaning that consequent
112  *   malloc would immediately incur working set expansion);
113  * - larger table has larger cache footprint, which can affect
114  *   performance of other code paths (not necessarily even from same
115  *   thread in Hyper-Threading world);
116  *
117  * Value of 1 is not appropriate for performance reasons.
118  */
119 #if     TABLE_BITS==8
120
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
122 {
123         int  i, j;
124         u128 V;
125
126         Htable[0].hi = 0;
127         Htable[0].lo = 0;
128         V.hi = H[0];
129         V.lo = H[1];
130
131         for (Htable[128]=V, i=64; i>0; i>>=1) {
132                 REDUCE1BIT(V);
133                 Htable[i] = V;
134         }
135
136         for (i=2; i<256; i<<=1) {
137                 u128 *Hi = Htable+i, H0 = *Hi;
138                 for (j=1; j<i; ++j) {
139                         Hi[j].hi = H0.hi^Htable[j].hi;
140                         Hi[j].lo = H0.lo^Htable[j].lo;
141                 }
142         }
143 }
144
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
146 {
147         u128 Z = { 0, 0};
148         const u8 *xi = (const u8 *)Xi+15;
149         size_t rem, n = *xi;
150         const union { long one; char little; } is_endian = {1};
151         __fips_constseg
152         static const size_t rem_8bit[256] = {
153                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
154                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
155                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
156                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
157                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
158                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
159                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
160                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
161                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
162                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
163                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
164                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
165                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
166                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
167                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
168                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
169                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
170                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
171                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
172                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
173                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
174                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
175                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
176                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
177                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
178                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
179                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
180                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
181                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
182                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
183                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
184                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
185                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
186                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
187                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
188                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
189                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
190                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
191                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
192                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
193                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
194                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
195                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
196                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
197                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
198                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
199                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
200                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
201                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
202                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
203                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
204                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
205                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
206                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
207                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
208                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
209                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
210                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
211                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
212                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
213                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
214                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
215                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
216                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
217
218         while (1) {
219                 Z.hi ^= Htable[n].hi;
220                 Z.lo ^= Htable[n].lo;
221
222                 if ((u8 *)Xi==xi)       break;
223
224                 n = *(--xi);
225
226                 rem  = (size_t)Z.lo&0xff;
227                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
228                 Z.hi = (Z.hi>>8);
229                 if (sizeof(size_t)==8)
230                         Z.hi ^= rem_8bit[rem];
231                 else
232                         Z.hi ^= (u64)rem_8bit[rem]<<32;
233         }
234
235         if (is_endian.little) {
236 #ifdef BSWAP8
237                 Xi[0] = BSWAP8(Z.hi);
238                 Xi[1] = BSWAP8(Z.lo);
239 #else
240                 u8 *p = (u8 *)Xi;
241                 u32 v;
242                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
243                 v = (u32)(Z.hi);        PUTU32(p+4,v);
244                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
245                 v = (u32)(Z.lo);        PUTU32(p+12,v);
246 #endif
247         }
248         else {
249                 Xi[0] = Z.hi;
250                 Xi[1] = Z.lo;
251         }
252 }
253 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
254
255 #elif   TABLE_BITS==4
256
257 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
258 {
259         u128 V;
260 #if defined(OPENSSL_SMALL_FOOTPRINT)
261         int  i;
262 #endif
263
264         Htable[0].hi = 0;
265         Htable[0].lo = 0;
266         V.hi = H[0];
267         V.lo = H[1];
268
269 #if defined(OPENSSL_SMALL_FOOTPRINT)
270         for (Htable[8]=V, i=4; i>0; i>>=1) {
271                 REDUCE1BIT(V);
272                 Htable[i] = V;
273         }
274
275         for (i=2; i<16; i<<=1) {
276                 u128 *Hi = Htable+i;
277                 int   j;
278                 for (V=*Hi, j=1; j<i; ++j) {
279                         Hi[j].hi = V.hi^Htable[j].hi;
280                         Hi[j].lo = V.lo^Htable[j].lo;
281                 }
282         }
283 #else
284         Htable[8] = V;
285         REDUCE1BIT(V);
286         Htable[4] = V;
287         REDUCE1BIT(V);
288         Htable[2] = V;
289         REDUCE1BIT(V);
290         Htable[1] = V;
291         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
292         V=Htable[4];
293         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
294         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
295         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
296         V=Htable[8];
297         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
298         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
299         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
300         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
301         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
302         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
303         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
304 #endif
305 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
306         /*
307          * ARM assembler expects specific dword order in Htable.
308          */
309         {
310         int j;
311         const union { long one; char little; } is_endian = {1};
312
313         if (is_endian.little)
314                 for (j=0;j<16;++j) {
315                         V = Htable[j];
316                         Htable[j].hi = V.lo;
317                         Htable[j].lo = V.hi;
318                 }
319         else
320                 for (j=0;j<16;++j) {
321                         V = Htable[j];
322                         Htable[j].hi = V.lo<<32|V.lo>>32;
323                         Htable[j].lo = V.hi<<32|V.hi>>32;
324                 }
325         }
326 #endif
327 }
328
329 #ifndef GHASH_ASM
330 __fips_constseg
331 static const size_t rem_4bit[16] = {
332         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
333         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
334         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
335         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
336
337 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
338 {
339         u128 Z;
340         int cnt = 15;
341         size_t rem, nlo, nhi;
342         const union { long one; char little; } is_endian = {1};
343
344         nlo  = ((const u8 *)Xi)[15];
345         nhi  = nlo>>4;
346         nlo &= 0xf;
347
348         Z.hi = Htable[nlo].hi;
349         Z.lo = Htable[nlo].lo;
350
351         while (1) {
352                 rem  = (size_t)Z.lo&0xf;
353                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
354                 Z.hi = (Z.hi>>4);
355                 if (sizeof(size_t)==8)
356                         Z.hi ^= rem_4bit[rem];
357                 else
358                         Z.hi ^= (u64)rem_4bit[rem]<<32;
359
360                 Z.hi ^= Htable[nhi].hi;
361                 Z.lo ^= Htable[nhi].lo;
362
363                 if (--cnt<0)            break;
364
365                 nlo  = ((const u8 *)Xi)[cnt];
366                 nhi  = nlo>>4;
367                 nlo &= 0xf;
368
369                 rem  = (size_t)Z.lo&0xf;
370                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
371                 Z.hi = (Z.hi>>4);
372                 if (sizeof(size_t)==8)
373                         Z.hi ^= rem_4bit[rem];
374                 else
375                         Z.hi ^= (u64)rem_4bit[rem]<<32;
376
377                 Z.hi ^= Htable[nlo].hi;
378                 Z.lo ^= Htable[nlo].lo;
379         }
380
381         if (is_endian.little) {
382 #ifdef BSWAP8
383                 Xi[0] = BSWAP8(Z.hi);
384                 Xi[1] = BSWAP8(Z.lo);
385 #else
386                 u8 *p = (u8 *)Xi;
387                 u32 v;
388                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
389                 v = (u32)(Z.hi);        PUTU32(p+4,v);
390                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
391                 v = (u32)(Z.lo);        PUTU32(p+12,v);
392 #endif
393         }
394         else {
395                 Xi[0] = Z.hi;
396                 Xi[1] = Z.lo;
397         }
398 }
399
400 #if !defined(OPENSSL_SMALL_FOOTPRINT)
401 /*
402  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
403  * details... Compiler-generated code doesn't seem to give any
404  * performance improvement, at least not on x86[_64]. It's here
405  * mostly as reference and a placeholder for possible future
406  * non-trivial optimization[s]...
407  */
408 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
409                                 const u8 *inp,size_t len)
410 {
411     u128 Z;
412     int cnt;
413     size_t rem, nlo, nhi;
414     const union { long one; char little; } is_endian = {1};
415
416 #if 1
417     do {
418         cnt  = 15;
419         nlo  = ((const u8 *)Xi)[15];
420         nlo ^= inp[15];
421         nhi  = nlo>>4;
422         nlo &= 0xf;
423
424         Z.hi = Htable[nlo].hi;
425         Z.lo = Htable[nlo].lo;
426
427         while (1) {
428                 rem  = (size_t)Z.lo&0xf;
429                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
430                 Z.hi = (Z.hi>>4);
431                 if (sizeof(size_t)==8)
432                         Z.hi ^= rem_4bit[rem];
433                 else
434                         Z.hi ^= (u64)rem_4bit[rem]<<32;
435
436                 Z.hi ^= Htable[nhi].hi;
437                 Z.lo ^= Htable[nhi].lo;
438
439                 if (--cnt<0)            break;
440
441                 nlo  = ((const u8 *)Xi)[cnt];
442                 nlo ^= inp[cnt];
443                 nhi  = nlo>>4;
444                 nlo &= 0xf;
445
446                 rem  = (size_t)Z.lo&0xf;
447                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
448                 Z.hi = (Z.hi>>4);
449                 if (sizeof(size_t)==8)
450                         Z.hi ^= rem_4bit[rem];
451                 else
452                         Z.hi ^= (u64)rem_4bit[rem]<<32;
453
454                 Z.hi ^= Htable[nlo].hi;
455                 Z.lo ^= Htable[nlo].lo;
456         }
457 #else
458     /*
459      * Extra 256+16 bytes per-key plus 512 bytes shared tables
460      * [should] give ~50% improvement... One could have PACK()-ed
461      * the rem_8bit even here, but the priority is to minimize
462      * cache footprint...
463      */ 
464     u128 Hshr4[16];     /* Htable shifted right by 4 bits */
465     u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
466     __fips_constseg
467     static const unsigned short rem_8bit[256] = {
468         0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469         0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470         0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471         0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472         0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473         0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474         0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475         0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476         0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477         0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478         0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479         0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480         0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481         0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482         0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483         0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484         0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485         0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486         0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487         0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488         0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489         0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490         0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491         0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492         0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493         0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494         0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495         0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496         0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497         0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498         0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499         0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
500     /*
501      * This pre-processing phase slows down procedure by approximately
502      * same time as it makes each loop spin faster. In other words
503      * single block performance is approximately same as straightforward
504      * "4-bit" implementation, and then it goes only faster...
505      */
506     for (cnt=0; cnt<16; ++cnt) {
507         Z.hi = Htable[cnt].hi;
508         Z.lo = Htable[cnt].lo;
509         Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510         Hshr4[cnt].hi = (Z.hi>>4);
511         Hshl4[cnt]    = (u8)(Z.lo<<4);
512     }
513
514     do {
515         for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516                 nlo  = ((const u8 *)Xi)[cnt];
517                 nlo ^= inp[cnt];
518                 nhi  = nlo>>4;
519                 nlo &= 0xf;
520
521                 Z.hi ^= Htable[nlo].hi;
522                 Z.lo ^= Htable[nlo].lo;
523
524                 rem = (size_t)Z.lo&0xff;
525
526                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
527                 Z.hi = (Z.hi>>8);
528
529                 Z.hi ^= Hshr4[nhi].hi;
530                 Z.lo ^= Hshr4[nhi].lo;
531                 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
532         }
533
534         nlo  = ((const u8 *)Xi)[0];
535         nlo ^= inp[0];
536         nhi  = nlo>>4;
537         nlo &= 0xf;
538
539         Z.hi ^= Htable[nlo].hi;
540         Z.lo ^= Htable[nlo].lo;
541
542         rem = (size_t)Z.lo&0xf;
543
544         Z.lo = (Z.hi<<60)|(Z.lo>>4);
545         Z.hi = (Z.hi>>4);
546
547         Z.hi ^= Htable[nhi].hi;
548         Z.lo ^= Htable[nhi].lo;
549         Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
550 #endif
551
552         if (is_endian.little) {
553 #ifdef BSWAP8
554                 Xi[0] = BSWAP8(Z.hi);
555                 Xi[1] = BSWAP8(Z.lo);
556 #else
557                 u8 *p = (u8 *)Xi;
558                 u32 v;
559                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
560                 v = (u32)(Z.hi);        PUTU32(p+4,v);
561                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
562                 v = (u32)(Z.lo);        PUTU32(p+12,v);
563 #endif
564         }
565         else {
566                 Xi[0] = Z.hi;
567                 Xi[1] = Z.lo;
568         }
569     } while (inp+=16, len-=16);
570 }
571 #endif
572 #else
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
575 #endif
576
577 #define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it
 * is still in L1 cache after the encryption pass... */
583 #define GHASH_CHUNK       (3*1024)
584 #endif
585
586 #else   /* TABLE_BITS */
587
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
589 {
590         u128 V,Z = { 0,0 };
591         long X;
592         int  i,j;
593         const long *xi = (const long *)Xi;
594         const union { long one; char little; } is_endian = {1};
595
596         V.hi = H[0];    /* H is in host byte order, no byte swapping */
597         V.lo = H[1];
598
599         for (j=0; j<16/sizeof(long); ++j) {
600                 if (is_endian.little) {
601                         if (sizeof(long)==8) {
602 #ifdef BSWAP8
603                                 X = (long)(BSWAP8(xi[j]));
604 #else
605                                 const u8 *p = (const u8 *)(xi+j);
606                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 #endif
608                         }
609                         else {
610                                 const u8 *p = (const u8 *)(xi+j);
611                                 X = (long)GETU32(p);
612                         }
613                 }
614                 else
615                         X = xi[j];
616
617                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618                         u64 M = (u64)(X>>(8*sizeof(long)-1));
619                         Z.hi ^= V.hi&M;
620                         Z.lo ^= V.lo&M;
621
622                         REDUCE1BIT(V);
623                 }
624         }
625
626         if (is_endian.little) {
627 #ifdef BSWAP8
628                 Xi[0] = BSWAP8(Z.hi);
629                 Xi[1] = BSWAP8(Z.lo);
630 #else
631                 u8 *p = (u8 *)Xi;
632                 u32 v;
633                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
634                 v = (u32)(Z.hi);        PUTU32(p+4,v);
635                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
636                 v = (u32)(Z.lo);        PUTU32(p+12,v);
637 #endif
638         }
639         else {
640                 Xi[0] = Z.hi;
641                 Xi[1] = Z.lo;
642         }
643 }
644 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645
646 #endif
647
648 #if     TABLE_BITS==4 && defined(GHASH_ASM)
649 # if    !defined(I386_ONLY) && \
650         (defined(__i386)        || defined(__i386__)    || \
651          defined(__x86_64)      || defined(__x86_64__)  || \
652          defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
653 #  define GHASH_ASM_X86_OR_64
654 #  define GCM_FUNCREF_4BIT
655 extern unsigned int OPENSSL_ia32cap_P[2];
656
657 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
658 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660
661 #  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
662 #   define GHASH_ASM_X86
663 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
664 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665
666 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
667 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
668 #  endif
669 # elif defined(__arm__) || defined(__arm)
670 #  include "arm_arch.h"
671 #  if __ARM_ARCH__>=7
672 #   define GHASH_ASM_ARM
673 #   define GCM_FUNCREF_4BIT
674 void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
675 void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 #  endif
677 # elif defined(__sparc__) || defined(__sparc)
678 #  include "sparc_arch.h"
679 #  define GHASH_ASM_SPARC
680 #  define GCM_FUNCREF_4BIT
681 extern unsigned int OPENSSL_sparcv9cap_P[];
682 void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
683 void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
684 void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
685 # endif
686 #endif
687
688 #ifdef GCM_FUNCREF_4BIT
689 # undef  GCM_MUL
690 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
691 # ifdef GHASH
692 #  undef  GHASH
693 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
694 # endif
695 #endif
696
697 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
698 {
699         const union { long one; char little; } is_endian = {1};
700
701         memset(ctx,0,sizeof(*ctx));
702         ctx->block = block;
703         ctx->key   = key;
704
705         (*block)(ctx->H.c,ctx->H.c,key);
706
707         if (is_endian.little) {
708                 /* H is stored in host byte order */
709 #ifdef BSWAP8
710                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
711                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
712 #else
713                 u8 *p = ctx->H.c;
714                 u64 hi,lo;
715                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
716                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
717                 ctx->H.u[0] = hi;
718                 ctx->H.u[1] = lo;
719 #endif
720         }
721
722 #if     TABLE_BITS==8
723         gcm_init_8bit(ctx->Htable,ctx->H.u);
724 #elif   TABLE_BITS==4
725 # if    defined(GHASH_ASM_X86_OR_64)
726 #  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
727         if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
728             OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
729                 gcm_init_clmul(ctx->Htable,ctx->H.u);
730                 ctx->gmult = gcm_gmult_clmul;
731                 ctx->ghash = gcm_ghash_clmul;
732                 return;
733         }
734 #  endif
735         gcm_init_4bit(ctx->Htable,ctx->H.u);
736 #  if   defined(GHASH_ASM_X86)                  /* x86 only */
737 #   if  defined(OPENSSL_IA32_SSE2)
738         if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
739 #   else
740         if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
741 #   endif
742                 ctx->gmult = gcm_gmult_4bit_mmx;
743                 ctx->ghash = gcm_ghash_4bit_mmx;
744         } else {
745                 ctx->gmult = gcm_gmult_4bit_x86;
746                 ctx->ghash = gcm_ghash_4bit_x86;
747         }
748 #  else
749         ctx->gmult = gcm_gmult_4bit;
750         ctx->ghash = gcm_ghash_4bit;
751 #  endif
752 # elif  defined(GHASH_ASM_ARM)
753         if (OPENSSL_armcap_P & ARMV7_NEON) {
754                 ctx->gmult = gcm_gmult_neon;
755                 ctx->ghash = gcm_ghash_neon;
756         } else {
757                 gcm_init_4bit(ctx->Htable,ctx->H.u);
758                 ctx->gmult = gcm_gmult_4bit;
759                 ctx->ghash = gcm_ghash_4bit;
760         }
761 # elif  defined(GHASH_ASM_SPARC)
762         if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
763                 gcm_init_vis3(ctx->Htable,ctx->H.u);
764                 ctx->gmult = gcm_gmult_vis3;
765                 ctx->ghash = gcm_ghash_vis3;
766         } else {
767                 gcm_init_4bit(ctx->Htable,ctx->H.u);
768                 ctx->gmult = gcm_gmult_4bit;
769                 ctx->ghash = gcm_ghash_4bit;
770         }
771 # else
772         gcm_init_4bit(ctx->Htable,ctx->H.u);
773 # endif
774 #endif
775 }
776
777 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
778 {
779         const union { long one; char little; } is_endian = {1};
780         unsigned int ctr;
781 #ifdef GCM_FUNCREF_4BIT
782         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
783 #endif
784
785         ctx->Yi.u[0]  = 0;
786         ctx->Yi.u[1]  = 0;
787         ctx->Xi.u[0]  = 0;
788         ctx->Xi.u[1]  = 0;
789         ctx->len.u[0] = 0;      /* AAD length */
790         ctx->len.u[1] = 0;      /* message length */
791         ctx->ares = 0;
792         ctx->mres = 0;
793
794         if (len==12) {
795                 memcpy(ctx->Yi.c,iv,12);
796                 ctx->Yi.c[15]=1;
797                 ctr=1;
798         }
799         else {
800                 size_t i;
801                 u64 len0 = len;
802
803                 while (len>=16) {
804                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
805                         GCM_MUL(ctx,Yi);
806                         iv += 16;
807                         len -= 16;
808                 }
809                 if (len) {
810                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
811                         GCM_MUL(ctx,Yi);
812                 }
813                 len0 <<= 3;
814                 if (is_endian.little) {
815 #ifdef BSWAP8
816                         ctx->Yi.u[1]  ^= BSWAP8(len0);
817 #else
818                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
819                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
820                         ctx->Yi.c[10] ^= (u8)(len0>>40);
821                         ctx->Yi.c[11] ^= (u8)(len0>>32);
822                         ctx->Yi.c[12] ^= (u8)(len0>>24);
823                         ctx->Yi.c[13] ^= (u8)(len0>>16);
824                         ctx->Yi.c[14] ^= (u8)(len0>>8);
825                         ctx->Yi.c[15] ^= (u8)(len0);
826 #endif
827                 }
828                 else
829                         ctx->Yi.u[1]  ^= len0;
830
831                 GCM_MUL(ctx,Yi);
832
833                 if (is_endian.little)
834                         ctr = GETU32(ctx->Yi.c+12);
835                 else
836                         ctr = ctx->Yi.d[3];
837         }
838
839         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
840         ++ctr;
841         if (is_endian.little)
842                 PUTU32(ctx->Yi.c+12,ctr);
843         else
844                 ctx->Yi.d[3] = ctr;
845 }
846
847 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
848 {
849         size_t i;
850         unsigned int n;
851         u64 alen = ctx->len.u[0];
852 #ifdef GCM_FUNCREF_4BIT
853         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
854 # ifdef GHASH
855         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
856                                 const u8 *inp,size_t len)       = ctx->ghash;
857 # endif
858 #endif
859
860         if (ctx->len.u[1]) return -2;
861
862         alen += len;
863         if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
864                 return -1;
865         ctx->len.u[0] = alen;
866
867         n = ctx->ares;
868         if (n) {
869                 while (n && len) {
870                         ctx->Xi.c[n] ^= *(aad++);
871                         --len;
872                         n = (n+1)%16;
873                 }
874                 if (n==0) GCM_MUL(ctx,Xi);
875                 else {
876                         ctx->ares = n;
877                         return 0;
878                 }
879         }
880
881 #ifdef GHASH
882         if ((i = (len&(size_t)-16))) {
883                 GHASH(ctx,aad,i);
884                 aad += i;
885                 len -= i;
886         }
887 #else
888         while (len>=16) {
889                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
890                 GCM_MUL(ctx,Xi);
891                 aad += 16;
892                 len -= 16;
893         }
894 #endif
895         if (len) {
896                 n = (unsigned int)len;
897                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
898         }
899
900         ctx->ares = n;
901         return 0;
902 }
903
904 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
905                 const unsigned char *in, unsigned char *out,
906                 size_t len)
907 {
908         const union { long one; char little; } is_endian = {1};
909         unsigned int n, ctr;
910         size_t i;
911         u64        mlen  = ctx->len.u[1];
912         block128_f block = ctx->block;
913         void      *key   = ctx->key;
914 #ifdef GCM_FUNCREF_4BIT
915         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
916 # ifdef GHASH
917         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
918                                 const u8 *inp,size_t len)       = ctx->ghash;
919 # endif
920 #endif
921
922 #if 0
923         n = (unsigned int)mlen%16; /* alternative to ctx->mres */
924 #endif
925         mlen += len;
926         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
927                 return -1;
928         ctx->len.u[1] = mlen;
929
930         if (ctx->ares) {
931                 /* First call to encrypt finalizes GHASH(AAD) */
932                 GCM_MUL(ctx,Xi);
933                 ctx->ares = 0;
934         }
935
936         if (is_endian.little)
937                 ctr = GETU32(ctx->Yi.c+12);
938         else
939                 ctr = ctx->Yi.d[3];
940
941         n = ctx->mres;
942 #if !defined(OPENSSL_SMALL_FOOTPRINT)
943         if (16%sizeof(size_t) == 0) do {        /* always true actually */
944                 if (n) {
945                         while (n && len) {
946                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
947                                 --len;
948                                 n = (n+1)%16;
949                         }
950                         if (n==0) GCM_MUL(ctx,Xi);
951                         else {
952                                 ctx->mres = n;
953                                 return 0;
954                         }
955                 }
956 #if defined(STRICT_ALIGNMENT)
957                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
958                         break;
959 #endif
960 #if defined(GHASH) && defined(GHASH_CHUNK)
961                 while (len>=GHASH_CHUNK) {
962                     size_t j=GHASH_CHUNK;
963
964                     while (j) {
965                         size_t *out_t=(size_t *)out;
966                         const size_t *in_t=(const size_t *)in;
967
968                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
969                         ++ctr;
970                         if (is_endian.little)
971                                 PUTU32(ctx->Yi.c+12,ctr);
972                         else
973                                 ctx->Yi.d[3] = ctr;
974                         for (i=0; i<16/sizeof(size_t); ++i)
975                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
976                         out += 16;
977                         in  += 16;
978                         j   -= 16;
979                     }
980                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
981                     len -= GHASH_CHUNK;
982                 }
983                 if ((i = (len&(size_t)-16))) {
984                     size_t j=i;
985
986                     while (len>=16) {
987                         size_t *out_t=(size_t *)out;
988                         const size_t *in_t=(const size_t *)in;
989
990                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
991                         ++ctr;
992                         if (is_endian.little)
993                                 PUTU32(ctx->Yi.c+12,ctr);
994                         else
995                                 ctx->Yi.d[3] = ctr;
996                         for (i=0; i<16/sizeof(size_t); ++i)
997                                 out_t[i] = in_t[i] ^ ctx->EKi.t[i];
998                         out += 16;
999                         in  += 16;
1000                         len -= 16;
1001                     }
1002                     GHASH(ctx,out-j,j);
1003                 }
1004 #else
1005                 while (len>=16) {
1006                         size_t *out_t=(size_t *)out;
1007                         const size_t *in_t=(const size_t *)in;
1008
1009                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1010                         ++ctr;
1011                         if (is_endian.little)
1012                                 PUTU32(ctx->Yi.c+12,ctr);
1013                         else
1014                                 ctx->Yi.d[3] = ctr;
1015                         for (i=0; i<16/sizeof(size_t); ++i)
1016                                 ctx->Xi.t[i] ^=
1017                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1018                         GCM_MUL(ctx,Xi);
1019                         out += 16;
1020                         in  += 16;
1021                         len -= 16;
1022                 }
1023 #endif
1024                 if (len) {
1025                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1026                         ++ctr;
1027                         if (is_endian.little)
1028                                 PUTU32(ctx->Yi.c+12,ctr);
1029                         else
1030                                 ctx->Yi.d[3] = ctr;
1031                         while (len--) {
1032                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1033                                 ++n;
1034                         }
1035                 }
1036
1037                 ctx->mres = n;
1038                 return 0;
1039         } while(0);
1040 #endif
1041         for (i=0;i<len;++i) {
1042                 if (n==0) {
1043                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1044                         ++ctr;
1045                         if (is_endian.little)
1046                                 PUTU32(ctx->Yi.c+12,ctr);
1047                         else
1048                                 ctx->Yi.d[3] = ctr;
1049                 }
1050                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
1051                 n = (n+1)%16;
1052                 if (n==0)
1053                         GCM_MUL(ctx,Xi);
1054         }
1055
1056         ctx->mres = n;
1057         return 0;
1058 }
1059
1060 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1061                 const unsigned char *in, unsigned char *out,
1062                 size_t len)
1063 {
1064         const union { long one; char little; } is_endian = {1};
1065         unsigned int n, ctr;
1066         size_t i;
1067         u64        mlen  = ctx->len.u[1];
1068         block128_f block = ctx->block;
1069         void      *key   = ctx->key;
1070 #ifdef GCM_FUNCREF_4BIT
1071         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1072 # ifdef GHASH
1073         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1074                                 const u8 *inp,size_t len)       = ctx->ghash;
1075 # endif
1076 #endif
1077
1078         mlen += len;
1079         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1080                 return -1;
1081         ctx->len.u[1] = mlen;
1082
1083         if (ctx->ares) {
1084                 /* First call to decrypt finalizes GHASH(AAD) */
1085                 GCM_MUL(ctx,Xi);
1086                 ctx->ares = 0;
1087         }
1088
1089         if (is_endian.little)
1090                 ctr = GETU32(ctx->Yi.c+12);
1091         else
1092                 ctr = ctx->Yi.d[3];
1093
1094         n = ctx->mres;
1095 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1096         if (16%sizeof(size_t) == 0) do {        /* always true actually */
1097                 if (n) {
1098                         while (n && len) {
1099                                 u8 c = *(in++);
1100                                 *(out++) = c^ctx->EKi.c[n];
1101                                 ctx->Xi.c[n] ^= c;
1102                                 --len;
1103                                 n = (n+1)%16;
1104                         }
1105                         if (n==0) GCM_MUL (ctx,Xi);
1106                         else {
1107                                 ctx->mres = n;
1108                                 return 0;
1109                         }
1110                 }
1111 #if defined(STRICT_ALIGNMENT)
1112                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1113                         break;
1114 #endif
1115 #if defined(GHASH) && defined(GHASH_CHUNK)
1116                 while (len>=GHASH_CHUNK) {
1117                     size_t j=GHASH_CHUNK;
1118
1119                     GHASH(ctx,in,GHASH_CHUNK);
1120                     while (j) {
1121                         size_t *out_t=(size_t *)out;
1122                         const size_t *in_t=(const size_t *)in;
1123
1124                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1125                         ++ctr;
1126                         if (is_endian.little)
1127                                 PUTU32(ctx->Yi.c+12,ctr);
1128                         else
1129                                 ctx->Yi.d[3] = ctr;
1130                         for (i=0; i<16/sizeof(size_t); ++i)
1131                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1132                         out += 16;
1133                         in  += 16;
1134                         j   -= 16;
1135                     }
1136                     len -= GHASH_CHUNK;
1137                 }
1138                 if ((i = (len&(size_t)-16))) {
1139                     GHASH(ctx,in,i);
1140                     while (len>=16) {
1141                         size_t *out_t=(size_t *)out;
1142                         const size_t *in_t=(const size_t *)in;
1143
1144                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1145                         ++ctr;
1146                         if (is_endian.little)
1147                                 PUTU32(ctx->Yi.c+12,ctr);
1148                         else
1149                                 ctx->Yi.d[3] = ctr;
1150                         for (i=0; i<16/sizeof(size_t); ++i)
1151                                 out_t[i] = in_t[i]^ctx->EKi.t[i];
1152                         out += 16;
1153                         in  += 16;
1154                         len -= 16;
1155                     }
1156                 }
1157 #else
1158                 while (len>=16) {
1159                         size_t *out_t=(size_t *)out;
1160                         const size_t *in_t=(const size_t *)in;
1161
1162                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1163                         ++ctr;
1164                         if (is_endian.little)
1165                                 PUTU32(ctx->Yi.c+12,ctr);
1166                         else
1167                                 ctx->Yi.d[3] = ctr;
1168                         for (i=0; i<16/sizeof(size_t); ++i) {
1169                                 size_t c = in[i];
1170                                 out[i] = c^ctx->EKi.t[i];
1171                                 ctx->Xi.t[i] ^= c;
1172                         }
1173                         GCM_MUL(ctx,Xi);
1174                         out += 16;
1175                         in  += 16;
1176                         len -= 16;
1177                 }
1178 #endif
1179                 if (len) {
1180                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1181                         ++ctr;
1182                         if (is_endian.little)
1183                                 PUTU32(ctx->Yi.c+12,ctr);
1184                         else
1185                                 ctx->Yi.d[3] = ctr;
1186                         while (len--) {
1187                                 u8 c = in[n];
1188                                 ctx->Xi.c[n] ^= c;
1189                                 out[n] = c^ctx->EKi.c[n];
1190                                 ++n;
1191                         }
1192                 }
1193
1194                 ctx->mres = n;
1195                 return 0;
1196         } while(0);
1197 #endif
1198         for (i=0;i<len;++i) {
1199                 u8 c;
1200                 if (n==0) {
1201                         (*block)(ctx->Yi.c,ctx->EKi.c,key);
1202                         ++ctr;
1203                         if (is_endian.little)
1204                                 PUTU32(ctx->Yi.c+12,ctr);
1205                         else
1206                                 ctx->Yi.d[3] = ctr;
1207                 }
1208                 c = in[i];
1209                 out[i] = c^ctx->EKi.c[n];
1210                 ctx->Xi.c[n] ^= c;
1211                 n = (n+1)%16;
1212                 if (n==0)
1213                         GCM_MUL(ctx,Xi);
1214         }
1215
1216         ctx->mres = n;
1217         return 0;
1218 }
1219
1220 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1221                 const unsigned char *in, unsigned char *out,
1222                 size_t len, ctr128_f stream)
1223 {
1224         const union { long one; char little; } is_endian = {1};
1225         unsigned int n, ctr;
1226         size_t i;
1227         u64   mlen = ctx->len.u[1];
1228         void *key  = ctx->key;
1229 #ifdef GCM_FUNCREF_4BIT
1230         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1231 # ifdef GHASH
1232         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1233                                 const u8 *inp,size_t len)       = ctx->ghash;
1234 # endif
1235 #endif
1236
1237         mlen += len;
1238         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1239                 return -1;
1240         ctx->len.u[1] = mlen;
1241
1242         if (ctx->ares) {
1243                 /* First call to encrypt finalizes GHASH(AAD) */
1244                 GCM_MUL(ctx,Xi);
1245                 ctx->ares = 0;
1246         }
1247
1248         if (is_endian.little)
1249                 ctr = GETU32(ctx->Yi.c+12);
1250         else
1251                 ctr = ctx->Yi.d[3];
1252
1253         n = ctx->mres;
1254         if (n) {
1255                 while (n && len) {
1256                         ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1257                         --len;
1258                         n = (n+1)%16;
1259                 }
1260                 if (n==0) GCM_MUL(ctx,Xi);
1261                 else {
1262                         ctx->mres = n;
1263                         return 0;
1264                 }
1265         }
1266 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1267         while (len>=GHASH_CHUNK) {
1268                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1269                 ctr += GHASH_CHUNK/16;
1270                 if (is_endian.little)
1271                         PUTU32(ctx->Yi.c+12,ctr);
1272                 else
1273                         ctx->Yi.d[3] = ctr;
1274                 GHASH(ctx,out,GHASH_CHUNK);
1275                 out += GHASH_CHUNK;
1276                 in  += GHASH_CHUNK;
1277                 len -= GHASH_CHUNK;
1278         }
1279 #endif
1280         if ((i = (len&(size_t)-16))) {
1281                 size_t j=i/16;
1282
1283                 (*stream)(in,out,j,key,ctx->Yi.c);
1284                 ctr += (unsigned int)j;
1285                 if (is_endian.little)
1286                         PUTU32(ctx->Yi.c+12,ctr);
1287                 else
1288                         ctx->Yi.d[3] = ctr;
1289                 in  += i;
1290                 len -= i;
1291 #if defined(GHASH)
1292                 GHASH(ctx,out,i);
1293                 out += i;
1294 #else
1295                 while (j--) {
1296                         for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1297                         GCM_MUL(ctx,Xi);
1298                         out += 16;
1299                 }
1300 #endif
1301         }
1302         if (len) {
1303                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1304                 ++ctr;
1305                 if (is_endian.little)
1306                         PUTU32(ctx->Yi.c+12,ctr);
1307                 else
1308                         ctx->Yi.d[3] = ctr;
1309                 while (len--) {
1310                         ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1311                         ++n;
1312                 }
1313         }
1314
1315         ctx->mres = n;
1316         return 0;
1317 }
1318
1319 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1320                 const unsigned char *in, unsigned char *out,
1321                 size_t len,ctr128_f stream)
1322 {
1323         const union { long one; char little; } is_endian = {1};
1324         unsigned int n, ctr;
1325         size_t i;
1326         u64   mlen = ctx->len.u[1];
1327         void *key  = ctx->key;
1328 #ifdef GCM_FUNCREF_4BIT
1329         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1330 # ifdef GHASH
1331         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1332                                 const u8 *inp,size_t len)       = ctx->ghash;
1333 # endif
1334 #endif
1335
1336         mlen += len;
1337         if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1338                 return -1;
1339         ctx->len.u[1] = mlen;
1340
1341         if (ctx->ares) {
1342                 /* First call to decrypt finalizes GHASH(AAD) */
1343                 GCM_MUL(ctx,Xi);
1344                 ctx->ares = 0;
1345         }
1346
1347         if (is_endian.little)
1348                 ctr = GETU32(ctx->Yi.c+12);
1349         else
1350                 ctr = ctx->Yi.d[3];
1351
1352         n = ctx->mres;
1353         if (n) {
1354                 while (n && len) {
1355                         u8 c = *(in++);
1356                         *(out++) = c^ctx->EKi.c[n];
1357                         ctx->Xi.c[n] ^= c;
1358                         --len;
1359                         n = (n+1)%16;
1360                 }
1361                 if (n==0) GCM_MUL (ctx,Xi);
1362                 else {
1363                         ctx->mres = n;
1364                         return 0;
1365                 }
1366         }
1367 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1368         while (len>=GHASH_CHUNK) {
1369                 GHASH(ctx,in,GHASH_CHUNK);
1370                 (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
1371                 ctr += GHASH_CHUNK/16;
1372                 if (is_endian.little)
1373                         PUTU32(ctx->Yi.c+12,ctr);
1374                 else
1375                         ctx->Yi.d[3] = ctr;
1376                 out += GHASH_CHUNK;
1377                 in  += GHASH_CHUNK;
1378                 len -= GHASH_CHUNK;
1379         }
1380 #endif
1381         if ((i = (len&(size_t)-16))) {
1382                 size_t j=i/16;
1383
1384 #if defined(GHASH)
1385                 GHASH(ctx,in,i);
1386 #else
1387                 while (j--) {
1388                         size_t k;
1389                         for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1390                         GCM_MUL(ctx,Xi);
1391                         in += 16;
1392                 }
1393                 j   = i/16;
1394                 in -= i;
1395 #endif
1396                 (*stream)(in,out,j,key,ctx->Yi.c);
1397                 ctr += (unsigned int)j;
1398                 if (is_endian.little)
1399                         PUTU32(ctx->Yi.c+12,ctr);
1400                 else
1401                         ctx->Yi.d[3] = ctr;
1402                 out += i;
1403                 in  += i;
1404                 len -= i;
1405         }
1406         if (len) {
1407                 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
1408                 ++ctr;
1409                 if (is_endian.little)
1410                         PUTU32(ctx->Yi.c+12,ctr);
1411                 else
1412                         ctx->Yi.d[3] = ctr;
1413                 while (len--) {
1414                         u8 c = in[n];
1415                         ctx->Xi.c[n] ^= c;
1416                         out[n] = c^ctx->EKi.c[n];
1417                         ++n;
1418                 }
1419         }
1420
1421         ctx->mres = n;
1422         return 0;
1423 }
1424
1425 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1426                         size_t len)
1427 {
1428         const union { long one; char little; } is_endian = {1};
1429         u64 alen = ctx->len.u[0]<<3;
1430         u64 clen = ctx->len.u[1]<<3;
1431 #ifdef GCM_FUNCREF_4BIT
1432         void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
1433 #endif
1434
1435         if (ctx->mres || ctx->ares)
1436                 GCM_MUL(ctx,Xi);
1437
1438         if (is_endian.little) {
1439 #ifdef BSWAP8
1440                 alen = BSWAP8(alen);
1441                 clen = BSWAP8(clen);
1442 #else
1443                 u8 *p = ctx->len.c;
1444
1445                 ctx->len.u[0] = alen;
1446                 ctx->len.u[1] = clen;
1447
1448                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1449                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1450 #endif
1451         }
1452
1453         ctx->Xi.u[0] ^= alen;
1454         ctx->Xi.u[1] ^= clen;
1455         GCM_MUL(ctx,Xi);
1456
1457         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1458         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1459
1460         if (tag && len<=sizeof(ctx->Xi))
1461                 return memcmp(ctx->Xi.c,tag,len);
1462         else
1463                 return -1;
1464 }
1465
1466 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1467 {
1468         CRYPTO_gcm128_finish(ctx, NULL, 0);
1469         memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1470 }
1471
1472 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1473 {
1474         GCM128_CONTEXT *ret;
1475
1476         if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1477                 CRYPTO_gcm128_init(ret,key,block);
1478
1479         return ret;
1480 }
1481
1482 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1483 {
1484         if (ctx) {
1485                 OPENSSL_cleanse(ctx,sizeof(*ctx));
1486                 OPENSSL_free(ctx);
1487         }
1488 }
1489
1490 #if defined(SELFTEST)
1491 #include <stdio.h>
1492 #include <openssl/aes.h>
1493
1494 /* Test Case 1 */
1495 static const u8 K1[16],
1496                 *P1=NULL,
1497                 *A1=NULL,
1498                 IV1[12],
1499                 *C1=NULL,
1500                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1501
1502 /* Test Case 2 */
1503 #define K2 K1
1504 #define A2 A1
1505 #define IV2 IV1
1506 static const u8 P2[16],
1507                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1508                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1509
1510 /* Test Case 3 */
1511 #define A3 A2
1512 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1513                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1514                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1515                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1516                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1517                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1518                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1519                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1520                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1521                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1522                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1523
1524 /* Test Case 4 */
1525 #define K4 K3
1526 #define IV4 IV3
1527 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1528                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1529                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1530                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1531                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1532                         0xab,0xad,0xda,0xd2},
1533                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1534                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1535                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1536                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1537                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1538
1539 /* Test Case 5 */
1540 #define K5 K4
1541 #define P5 P4
1542 #define A5 A4
1543 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1544                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1545                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1546                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1547                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1548                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1549
1550 /* Test Case 6 */
1551 #define K6 K5
1552 #define P6 P5
1553 #define A6 A5
1554 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1555                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1556                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1557                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1558                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1559                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1560                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1561                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1562                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1563
1564 /* Test Case 7 */
/* 24-byte key and 12-byte IV, both left without initializers and therefore
 * all-zero (static storage).  P7, A7 and C7 are NULL pointers rather than
 * arrays: there is no plaintext, additional data or ciphertext, so only the
 * tag T7 is checked by TEST_CASE(). */
1565 static const u8 K7[24],
1566                 *P7=NULL,
1567                 *A7=NULL,
1568                 IV7[12],
1569                 *C7=NULL,
1570                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1571
1572 /* Test Case 8 */
/* Reuses the all-zero 24-byte key, all-zero 12-byte IV and NULL additional
 * data of test case 7, and adds one all-zero 16-byte plaintext block (P8 has
 * no initializer) with its expected ciphertext and tag. */
1573 #define K8 K7
1574 #define IV8 IV7
1575 #define A8 A7
1576 static const u8 P8[16],
1577                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1578                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1579
1580 /* Test Case 9 */
/* 24-byte key, 64-byte plaintext, 12-byte IV and no additional data (A9
 * resolves through A8 to the NULL pointer A7).  C9 and T9 are the expected
 * 64-byte ciphertext and 16-byte tag. */
1581 #define A9 A8
1582 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1583                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1584                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1585                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1586                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1587                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1588                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1589                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1590                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1591                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1592                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1593                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1594
1595 /* Test Case 10 */
/* Same key and IV as test case 9.  The plaintext is 60 bytes (not a
 * multiple of the 16-byte block) and matches the first 60 bytes of P9;
 * 20 bytes of additional data are supplied.  C10 matches the first 60 bytes
 * of C9, while the tag T10 differs. */
1596 #define K10 K9
1597 #define IV10 IV9
1598 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1599                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1600                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1601                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1602                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1603                         0xab,0xad,0xda,0xd2},
1604                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1605                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1606                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1607                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1608                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1609
1610 /* Test Case 11 */
/* Key, plaintext and additional data are aliases of test case 10's.  This
 * vector supplies an 8-byte IV together with the expected 60-byte
 * ciphertext (C11) and 16-byte tag (T11). */
1611 #define K11 K10
1612 #define P11 P10
1613 #define A11 A10
1614 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1615                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1616                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1617                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1618                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1619                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1620
1621 /* Test Case 12 */
/* Key, plaintext and additional data are aliases of test case 11's.  This
 * vector supplies a 60-byte IV together with the expected 60-byte
 * ciphertext (C12) and 16-byte tag (T12). */
1622 #define K12 K11
1623 #define P12 P11
1624 #define A12 A11
1625 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1626                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1627                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1628                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1629                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1630                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1631                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1632                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1633                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1634
1635 /* Test Case 13 */
/* 32-byte key and 12-byte IV, both left without initializers and therefore
 * all-zero (static storage).  P13, A13 and C13 are NULL pointers rather
 * than arrays: there is no plaintext, additional data or ciphertext, so
 * only the tag T13 is checked by TEST_CASE(). */
1636 static const u8 K13[32],
1637                 *P13=NULL,
1638                 *A13=NULL,
1639                 IV13[12],
1640                 *C13=NULL,
1641                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1642
1643 /* Test Case 14 */
/* Reuses the all-zero 32-byte key and NULL additional data of test case 13.
 * P14 (16 bytes) and IV14 (12 bytes) have no initializers and are therefore
 * all-zero; C14 and T14 are the expected ciphertext block and tag. */
1644 #define K14 K13
1645 #define A14 A13
1646 static const u8 P14[16],
1647                 IV14[12],
1648                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1649                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1650
1651 /* Test Case 15 */
/* 32-byte key, 64-byte plaintext, 12-byte IV and no additional data (A15
 * resolves through A14 to the NULL pointer A13).  C15 and T15 are the
 * expected 64-byte ciphertext and 16-byte tag. */
1652 #define A15 A14
1653 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1654                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1655                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1656                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1657                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1658                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1659                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1660                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1661                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1662                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1663                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1664                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1665
1666 /* Test Case 16 */
/* Same key and IV as test case 15.  The plaintext is 60 bytes (not a
 * multiple of the 16-byte block) and matches the first 60 bytes of P15;
 * 20 bytes of additional data are supplied.  C16 matches the first 60 bytes
 * of C15, while the tag T16 differs. */
1667 #define K16 K15
1668 #define IV16 IV15
1669 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1670                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1671                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1672                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1673                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1674                         0xab,0xad,0xda,0xd2},
1675                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1676                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1677                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1678                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1679                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1680
1681 /* Test Case 17 */
/* Key, plaintext and additional data are aliases of test case 16's.  This
 * vector supplies an 8-byte IV together with the expected 60-byte
 * ciphertext (C17) and 16-byte tag (T17). */
1682 #define K17 K16
1683 #define P17 P16
1684 #define A17 A16
1685 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1686                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1687                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1688                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1689                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1690                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1691
1692 /* Test Case 18 */
/* Key, plaintext and additional data are aliases of test case 17's.  This
 * vector supplies a 60-byte IV together with the expected 60-byte
 * ciphertext (C18) and 16-byte tag (T18). */
1693 #define K18 K17
1694 #define P18 P17
1695 #define A18 A17
1696 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1697                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1698                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1699                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1700                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1701                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1702                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1703                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1704                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1705
1706 /* Test Case 19 */
/* Key, plaintext, IV and expected ciphertext are aliases of test case 1's
 * (defined earlier in the file).  What is new is 128 bytes of additional
 * data, i.e. eight full 16-byte blocks: the first 64 bytes are identical to
 * P15 and the last 64 bytes to C15.  T19 is the expected tag. */
1707 #define K19 K1
1708 #define P19 P1
1709 #define IV19 IV1
1710 #define C19 C1
1711 static const u8 A19[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1712                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1713                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1714                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
1715                         0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1716                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1717                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1718                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1719                 T19[]= {0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92};
1720
/*
 * TEST_CASE(n): run known-answer vector n through GCM in both directions.
 *
 * Expects `ctx' (GCM128_CONTEXT), `key' (AES_KEY) and `ret' (int failure
 * counter) to be declared in the invoking scope.  The argument is pasted
 * onto K/IV/A/P/C/T to select the key, IV, additional data, plaintext,
 * expected ciphertext and expected tag of the vector.
 *
 * Encrypt pass: the AES key is scheduled with sizeof(K##n)*8 bits, the
 * context is initialised, the IV is set, A##n and P##n are fed in when they
 * are non-NULL, and the pass fails if CRYPTO_gcm128_finish() returns
 * non-zero for T##n (presumably a tag mismatch) or if the output differs
 * from C##n.
 * Decrypt pass: the same context and key schedule are reused; only the IV
 * is set again.  C##n is decrypted and the result compared with P##n.
 *
 * Every failing pass increments `ret' and prints a message.  For vectors
 * whose P/A/C are NULL pointers rather than arrays, sizeof() yields the
 * pointer size; the NULL checks keep those sizes away from the GCM calls
 * and from memcmp().
 */
1721 #define TEST_CASE(n)    do {                                    \
1722         u8 out[sizeof(P##n)];                                   \
1723         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1724         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1725         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1726         memset(out,0,sizeof(out));                              \
1727         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1728         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1729         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1730             (C##n && memcmp(out,C##n,sizeof(out))))             \
1731                 ret++, printf ("encrypt test#%d failed.\n",n);  \
1732         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1733         memset(out,0,sizeof(out));                              \
1734         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1735         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1736         if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
1737             (P##n && memcmp(out,P##n,sizeof(out))))             \
1738                 ret++, printf ("decrypt test#%d failed.\n",n);  \
1739         } while(0)
1740
1741 int main()
1742 {
1743         GCM128_CONTEXT ctx;
1744         AES_KEY key;
1745         int ret=0;
1746
1747         TEST_CASE(1);
1748         TEST_CASE(2);
1749         TEST_CASE(3);
1750         TEST_CASE(4);
1751         TEST_CASE(5);
1752         TEST_CASE(6);
1753         TEST_CASE(7);
1754         TEST_CASE(8);
1755         TEST_CASE(9);
1756         TEST_CASE(10);
1757         TEST_CASE(11);
1758         TEST_CASE(12);
1759         TEST_CASE(13);
1760         TEST_CASE(14);
1761         TEST_CASE(15);
1762         TEST_CASE(16);
1763         TEST_CASE(17);
1764         TEST_CASE(18);
1765         TEST_CASE(19);
1766
1767 #ifdef OPENSSL_CPUID_OBJ
1768         {
1769         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1770         union { u64 u; u8 c[1024]; } buf;
1771         int i;
1772
1773         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1774         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1775         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1776
1777         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1778         start = OPENSSL_rdtsc();
1779         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1780         gcm_t = OPENSSL_rdtsc() - start;
1781
1782         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1783                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1784                         (block128_f)AES_encrypt);
1785         start = OPENSSL_rdtsc();
1786         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1787                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1788                         (block128_f)AES_encrypt);
1789         ctr_t = OPENSSL_rdtsc() - start;
1790
1791         printf("%.2f-%.2f=%.2f\n",
1792                         gcm_t/(double)sizeof(buf),
1793                         ctr_t/(double)sizeof(buf),
1794                         (gcm_t-ctr_t)/(double)sizeof(buf));
1795 #ifdef GHASH
1796         {
1797         void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
1798                                 const u8 *inp,size_t len)       = ctx.ghash;
1799
1800         GHASH((&ctx),buf.c,sizeof(buf));
1801         start = OPENSSL_rdtsc();
1802         for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
1803         gcm_t = OPENSSL_rdtsc() - start;
1804         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1805         }
1806 #endif
1807         }
1808 #endif
1809
1810         return ret;
1811 }
1812 #endif