PR: 2254
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes_lcl.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
60 typedef struct { u64 hi,lo; } u128;
61
62 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
63 /* redefine, because alignment is ensured */
64 #undef  GETU32
65 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
66 #undef  PUTU32
67 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
68 #endif
69
70 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
71 #define REDUCE1BIT(V)   do { \
72         if (sizeof(size_t)==8) { \
73                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
74                 V.lo  = (V.hi<<63)|(V.lo>>1); \
75                 V.hi  = (V.hi>>1 )^T; \
76         } \
77         else { \
78                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
79                 V.lo  = (V.hi<<63)|(V.lo>>1); \
80                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
81         } \
82 } while(0)
83
84 #ifdef  TABLE_BITS
85 #undef  TABLE_BITS
86 #endif
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * Under ideal conditions the "8-bit" version should be twice as fast as
 * the "4-bit" one. For gcc-generated x86[_64] code, "8-bit" was observed
 * to run ~75% faster, closer to 100% for commercial compilers... But the
 * catch is that the "8-bit" procedure consumes 16 times more memory, 4KB
 * per individual key + 1KB shared, and as access to these tables ends up
 * on the critical path, real-life execution time would be sensitive to
 * cache timing. It's not actually proven, but the "4-bit" procedure is
 * believed to provide adequate all-round performance...
 */
99 #define TABLE_BITS 4
100
101 #if     TABLE_BITS==8
102
103 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
104 {
105         int  i, j;
106         u128 V;
107
108         Htable[0].hi = 0;
109         Htable[0].lo = 0;
110         V.hi = H[0];
111         V.lo = H[1];
112
113         for (Htable[128]=V, i=64; i>0; i>>=1) {
114                 REDUCE1BIT(V);
115                 Htable[i] = V;
116         }
117
118         for (i=2; i<256; i<<=1) {
119                 u128 *Hi = Htable+i, H0 = *Hi;
120                 for (j=1; j<i; ++j) {
121                         Hi[j].hi = H0.hi^Htable[j].hi;
122                         Hi[j].lo = H0.lo^Htable[j].lo;
123                 }
124         }
125 }
126
127 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
128 {
129         u128 Z = { 0, 0};
130         const u8 *xi = (const u8 *)Xi+15;
131         size_t rem, n = *xi;
132         const union { long one; char little; } is_endian = {1};
133         static const size_t rem_8bit[256] = {
134                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
135                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
136                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
137                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
138                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
139                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
140                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
141                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
142                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
143                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
144                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
145                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
146                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
147                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
148                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
149                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
150                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
151                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
152                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
153                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
154                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
155                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
156                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
157                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
158                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
159                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
160                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
161                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
162                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
163                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
164                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
165                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
166                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
167                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
168                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
169                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
170                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
171                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
172                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
173                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
174                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
175                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
176                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
177                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
178                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
179                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
180                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
181                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
182                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
183                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
184                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
185                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
186                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
187                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
188                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
189                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
190                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
191                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
192                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
193                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
194                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
195                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
196                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
197                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
198
199         while (1) {
200                 Z.hi ^= Htable[n].hi;
201                 Z.lo ^= Htable[n].lo;
202
203                 if ((u8 *)Xi==xi)       break;
204
205                 n = *(--xi);
206
207                 rem  = (size_t)Z.lo&0xff;
208                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
209                 Z.hi = (Z.hi>>8);
210                 if (sizeof(size_t)==8)
211                         Z.hi ^= rem_8bit[rem];
212                 else
213                         Z.hi ^= (u64)rem_8bit[rem]<<32;
214         }
215
216         if (is_endian.little) {
217 #ifdef BSWAP8
218                 Xi[0] = BSWAP8(Z.hi);
219                 Xi[1] = BSWAP8(Z.lo);
220 #else
221                 u8 *p = (u8 *)Xi;
222                 u32 v;
223                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
224                 v = (u32)(Z.hi);        PUTU32(p+4,v);
225                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
226                 v = (u32)(Z.lo);        PUTU32(p+12,v);
227 #endif
228         }
229         else {
230                 Xi[0] = Z.hi;
231                 Xi[1] = Z.lo;
232         }
233 }
234 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
235
236 #elif   TABLE_BITS==4
237
238 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
239 {
240         u128 V;
241 #if defined(OPENSSL_SMALL_FOOTPRINT)
242         int  i;
243 #endif
244
245         Htable[0].hi = 0;
246         Htable[0].lo = 0;
247         V.hi = H[0];
248         V.lo = H[1];
249
250 #if defined(OPENSSL_SMALL_FOOTPRINT)
251         for (Htable[8]=V, i=4; i>0; i>>=1) {
252                 REDUCE1BIT(V);
253                 Htable[i] = V;
254         }
255
256         for (i=2; i<16; i<<=1) {
257                 u128 *Hi = Htable+i;
258                 int   j;
259                 for (V=*Hi, j=1; j<i; ++j) {
260                         Hi[j].hi = V.hi^Htable[j].hi;
261                         Hi[j].lo = V.lo^Htable[j].lo;
262                 }
263         }
264 #else
265         Htable[8] = V;
266         REDUCE1BIT(V);
267         Htable[4] = V;
268         REDUCE1BIT(V);
269         Htable[2] = V;
270         REDUCE1BIT(V);
271         Htable[1] = V;
272         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
273         V=Htable[4];
274         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
275         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
276         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
277         V=Htable[8];
278         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
279         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
280         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
281         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
282         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
283         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
284         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
285 #endif
286 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
287         /*
288          * ARM assembler expects specific dword order in Htable.
289          */
290         {
291         int j;
292         const union { long one; char little; } is_endian = {1};
293
294         if (is_endian.little)
295                 for (j=0;j<16;++j) {
296                         V = Htable[j];
297                         Htable[j].hi = V.lo;
298                         Htable[j].lo = V.hi;
299                 }
300         else
301                 for (j=0;j<16;++j) {
302                         V = Htable[j];
303                         Htable[j].hi = V.lo<<32|V.lo>>32;
304                         Htable[j].lo = V.hi<<32|V.hi>>32;
305                 }
306         }
307 #endif
308 }
309
310 #ifndef GHASH_ASM
311 static const size_t rem_4bit[16] = {
312         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
313         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
314         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
315         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
316
317 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
318 {
319         u128 Z;
320         int cnt = 15;
321         size_t rem, nlo, nhi;
322         const union { long one; char little; } is_endian = {1};
323
324         nlo  = ((const u8 *)Xi)[15];
325         nhi  = nlo>>4;
326         nlo &= 0xf;
327
328         Z.hi = Htable[nlo].hi;
329         Z.lo = Htable[nlo].lo;
330
331         while (1) {
332                 rem  = (size_t)Z.lo&0xf;
333                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
334                 Z.hi = (Z.hi>>4);
335                 if (sizeof(size_t)==8)
336                         Z.hi ^= rem_4bit[rem];
337                 else
338                         Z.hi ^= (u64)rem_4bit[rem]<<32;
339
340                 Z.hi ^= Htable[nhi].hi;
341                 Z.lo ^= Htable[nhi].lo;
342
343                 if (--cnt<0)            break;
344
345                 nlo  = ((const u8 *)Xi)[cnt];
346                 nhi  = nlo>>4;
347                 nlo &= 0xf;
348
349                 rem  = (size_t)Z.lo&0xf;
350                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
351                 Z.hi = (Z.hi>>4);
352                 if (sizeof(size_t)==8)
353                         Z.hi ^= rem_4bit[rem];
354                 else
355                         Z.hi ^= (u64)rem_4bit[rem]<<32;
356
357                 Z.hi ^= Htable[nlo].hi;
358                 Z.lo ^= Htable[nlo].lo;
359         }
360
361         if (is_endian.little) {
362 #ifdef BSWAP8
363                 Xi[0] = BSWAP8(Z.hi);
364                 Xi[1] = BSWAP8(Z.lo);
365 #else
366                 u8 *p = (u8 *)Xi;
367                 u32 v;
368                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
369                 v = (u32)(Z.hi);        PUTU32(p+4,v);
370                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
371                 v = (u32)(Z.lo);        PUTU32(p+12,v);
372 #endif
373         }
374         else {
375                 Xi[0] = Z.hi;
376                 Xi[1] = Z.lo;
377         }
378 }
379
380 #if !defined(OPENSSL_SMALL_FOOTPRINT)
381 /*
382  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
383  * details... Compiler-generated code doesn't seem to give any
384  * performance improvement, at least not on x86[_64]. It's here
385  * mostly as reference and a placeholder for possible future
386  * non-trivial optimization[s]...
387  */
388 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
389                                 const u8 *inp,size_t len)
390 {
391     u128 Z;
392     int cnt;
393     size_t rem, nlo, nhi;
394     const union { long one; char little; } is_endian = {1};
395
396     do {
397         cnt  = 15;
398         nlo  = ((const u8 *)Xi)[15];
399         nlo ^= inp[15];
400         nhi  = nlo>>4;
401         nlo &= 0xf;
402
403         Z.hi = Htable[nlo].hi;
404         Z.lo = Htable[nlo].lo;
405
406         while (1) {
407                 rem  = (size_t)Z.lo&0xf;
408                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
409                 Z.hi = (Z.hi>>4);
410                 if (sizeof(size_t)==8)
411                         Z.hi ^= rem_4bit[rem];
412                 else
413                         Z.hi ^= (u64)rem_4bit[rem]<<32;
414
415                 Z.hi ^= Htable[nhi].hi;
416                 Z.lo ^= Htable[nhi].lo;
417
418                 if (--cnt<0)            break;
419
420                 nlo  = ((const u8 *)Xi)[cnt];
421                 nlo ^= inp[cnt];
422                 nhi  = nlo>>4;
423                 nlo &= 0xf;
424
425                 rem  = (size_t)Z.lo&0xf;
426                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
427                 Z.hi = (Z.hi>>4);
428                 if (sizeof(size_t)==8)
429                         Z.hi ^= rem_4bit[rem];
430                 else
431                         Z.hi ^= (u64)rem_4bit[rem]<<32;
432
433                 Z.hi ^= Htable[nlo].hi;
434                 Z.lo ^= Htable[nlo].lo;
435         }
436
437         if (is_endian.little) {
438 #ifdef BSWAP8
439                 Xi[0] = BSWAP8(Z.hi);
440                 Xi[1] = BSWAP8(Z.lo);
441 #else
442                 u8 *p = (u8 *)Xi;
443                 u32 v;
444                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
445                 v = (u32)(Z.hi);        PUTU32(p+4,v);
446                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
447                 v = (u32)(Z.lo);        PUTU32(p+12,v);
448 #endif
449         }
450         else {
451                 Xi[0] = Z.hi;
452                 Xi[1] = Z.lo;
453         }
454     } while (inp+=16, len-=16);
455 }
456 #endif
457 #else
458 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
459 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
460 #endif
461
/*
 * GCM_MUL(ctx,Xi) multiplies one 16-byte member of *ctx by the hash key.
 * The second argument is substituted as the member name, so
 * GCM_MUL(ctx,Yi) operates on ctx->Yi and GCM_MUL(ctx,Xi) on ctx->Xi.
 */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
/* GHASH(ctx,in,len) hashes len bytes at in into ctx->Xi. */
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,(in),(len))
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while it's
 * still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       1024
#endif
470
471 #else   /* TABLE_BITS */
472
473 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
474 {
475         u128 V,Z = { 0,0 };
476         long X;
477         int  i,j;
478         const long *xi = (const long *)Xi;
479         const union { long one; char little; } is_endian = {1};
480
481         V.hi = H[0];    /* H is in host byte order, no byte swapping */
482         V.lo = H[1];
483
484         for (j=0; j<16/sizeof(long); ++j) {
485                 if (is_endian.little) {
486                         if (sizeof(long)==8) {
487 #ifdef BSWAP8
488                                 X = (long)(BSWAP8(xi[j]));
489 #else
490                                 const u8 *p = (const u8 *)(xi+j);
491                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
492 #endif
493                         }
494                         else {
495                                 const u8 *p = (const u8 *)(xi+j);
496                                 X = (long)GETU32(p);
497                         }
498                 }
499                 else
500                         X = xi[j];
501
502                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
503                         u64 M = (u64)(X>>(8*sizeof(long)-1));
504                         Z.hi ^= V.hi&M;
505                         Z.lo ^= V.lo&M;
506
507                         REDUCE1BIT(V);
508                 }
509         }
510
511         if (is_endian.little) {
512 #ifdef BSWAP8
513                 Xi[0] = BSWAP8(Z.hi);
514                 Xi[1] = BSWAP8(Z.lo);
515 #else
516                 u8 *p = (u8 *)Xi;
517                 u32 v;
518                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
519                 v = (u32)(Z.hi);        PUTU32(p+4,v);
520                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
521                 v = (u32)(Z.lo);        PUTU32(p+12,v);
522 #endif
523         }
524         else {
525                 Xi[0] = Z.hi;
526                 Xi[1] = Z.lo;
527         }
528 }
529 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
530
531 #endif
532
/*
 * State of one GCM computation. CRYPTO_gcm128_init() fills in the
 * key-dependent part (H, Htable, block, key and, where used, gmult/ghash);
 * CRYPTO_gcm128_setiv() resets the per-message part (Yi, Xi, len, res).
 */
struct gcm128_context {
        /* Following 6 names follow names in GCM specification */
        /*
         * Yi   counter block; its last 32 bits hold the block counter
         * EKi  encryption of Yi, the key stream for the current block
         * EK0  encryption of the initial counter block (set in setiv)
         * Xi   running hash value
         * H    hash key: encryption of the all-zero block, kept in host
         *      byte order
         * len  len.u[0] accumulates AAD bytes, len.u[1] message bytes
         */
        union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
                                                Xi,H,len;
        /* Pre-computed table used by gcm_gmult_* */
#if TABLE_BITS==8
        u128 Htable[256];
#else
        u128 Htable[16];
        /* Multiply/hash routines chosen at init time; only assigned and
         * called when GHASH_ASM_IAX is defined. */
        void (*gmult)(u64 Xi[2],const u128 Htable[16]);
        void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif
        /* res: number of EKi bytes already consumed (0..15), carried
         * between encrypt calls. pad: not referenced in the code shown. */
        unsigned int res, pad;
        /* Block cipher callback, invoked as (*block)(in,out,key). */
        block128_f block;
        /* Opaque key handed to block on every call. */
        void *key;
};
549
/*
 * x86/x86_64 assembler back ends. When available, the multiply and hash
 * routines are selected at run time in CRYPTO_gcm128_init() and reached
 * through the function pointers stored in the context.
 */
#if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
# define GHASH_ASM_IAX
extern unsigned int OPENSSL_ia32cap_P[2];

/* The second argument is the hash key (ctx->H.u at the call site). */
void gcm_init_clmul(u128 Htable[16],const u64 H[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

# if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif

/* Route GCM_MUL/GHASH through the per-context function pointers. As in
 * the direct-call GCM_MUL, the second GCM_MUL argument is substituted
 * as the member name. */
# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))((ctx)->Xi.u,(ctx)->Htable)
# undef  GHASH
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,(in),(len))
#endif
575
576 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
577 {
578         const union { long one; char little; } is_endian = {1};
579
580         memset(ctx,0,sizeof(*ctx));
581         ctx->block = block;
582         ctx->key   = key;
583
584         (*block)(ctx->H.c,ctx->H.c,key);
585
586         if (is_endian.little) {
587                 /* H is stored in host byte order */
588 #ifdef BSWAP8
589                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
590                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
591 #else
592                 u8 *p = ctx->H.c;
593                 u64 hi,lo;
594                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
595                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
596                 ctx->H.u[0] = hi;
597                 ctx->H.u[1] = lo;
598 #endif
599         }
600
601 #if     TABLE_BITS==8
602         gcm_init_8bit(ctx->Htable,ctx->H.u);
603 #elif   TABLE_BITS==4
604 # if    defined(GHASH_ASM_IAX)
605         if (OPENSSL_ia32cap_P[1]&(1<<1)) {
606                 gcm_init_clmul(ctx->Htable,ctx->H.u);
607                 ctx->gmult = gcm_gmult_clmul;
608                 ctx->ghash = gcm_ghash_clmul;
609                 return;
610         }
611         gcm_init_4bit(ctx->Htable,ctx->H.u);
612 #  if   defined(GHASH_ASM_X86)
613         if (OPENSSL_ia32cap_P[0]&(1<<23)) {
614                 ctx->gmult = gcm_gmult_4bit_mmx;
615                 ctx->ghash = gcm_ghash_4bit_mmx;
616         } else {
617                 ctx->gmult = gcm_gmult_4bit_x86;
618                 ctx->ghash = gcm_ghash_4bit_x86;
619         }
620 #  else
621         ctx->gmult = gcm_gmult_4bit;
622         ctx->ghash = gcm_ghash_4bit;
623 #  endif
624 # else
625         gcm_init_4bit(ctx->Htable,ctx->H.u);
626 # endif
627 #endif
628 }
629
630 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
631 {
632         const union { long one; char little; } is_endian = {1};
633         unsigned int ctr;
634
635         ctx->Yi.u[0]  = 0;
636         ctx->Yi.u[1]  = 0;
637         ctx->Xi.u[0]  = 0;
638         ctx->Xi.u[1]  = 0;
639         ctx->len.u[0] = 0;
640         ctx->len.u[1] = 0;
641         ctx->res = 0;
642
643         if (len==12) {
644                 memcpy(ctx->Yi.c,iv,12);
645                 ctx->Yi.c[15]=1;
646                 ctr=1;
647         }
648         else {
649                 size_t i;
650                 u64 len0 = len;
651
652                 while (len>=16) {
653                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
654                         GCM_MUL(ctx,Yi);
655                         iv += 16;
656                         len -= 16;
657                 }
658                 if (len) {
659                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
660                         GCM_MUL(ctx,Yi);
661                 }
662                 len0 <<= 3;
663                 if (is_endian.little) {
664 #ifdef BSWAP8
665                         ctx->Yi.u[1]  ^= BSWAP8(len0);
666 #else
667                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
668                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
669                         ctx->Yi.c[10] ^= (u8)(len0>>40);
670                         ctx->Yi.c[11] ^= (u8)(len0>>32);
671                         ctx->Yi.c[12] ^= (u8)(len0>>24);
672                         ctx->Yi.c[13] ^= (u8)(len0>>16);
673                         ctx->Yi.c[14] ^= (u8)(len0>>8);
674                         ctx->Yi.c[15] ^= (u8)(len0);
675 #endif
676                 }
677                 else
678                         ctx->Yi.u[1]  ^= len0;
679
680                 GCM_MUL(ctx,Yi);
681
682                 if (is_endian.little)
683                         ctr = GETU32(ctx->Yi.c+12);
684                 else
685                         ctr = ctx->Yi.d[3];
686         }
687
688         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
689         ++ctr;
690         if (is_endian.little)
691                 PUTU32(ctx->Yi.c+12,ctr);
692         else
693                 ctx->Yi.d[3] = ctr;
694 }
695
696 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
697 {
698         size_t i;
699
700         ctx->len.u[0] += len;
701
702 #ifdef GHASH
703         if ((i = (len&(size_t)-16))) {
704                 GHASH(ctx,aad,i);
705                 aad += i;
706                 len -= i;
707         }
708 #else
709         while (len>=16) {
710                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
711                 GCM_MUL(ctx,Xi);
712                 aad += 16;
713                 len -= 16;
714         }
715 #endif
716         if (len) {
717                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
718                 GCM_MUL(ctx,Xi);
719         }
720 }
721
722 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
723                 const unsigned char *in, unsigned char *out,
724                 size_t len)
725 {
726         const union { long one; char little; } is_endian = {1};
727         unsigned int n, ctr;
728         size_t i;
729
730         ctx->len.u[1] += len;
731         n   = ctx->res;
732         if (is_endian.little)
733                 ctr = GETU32(ctx->Yi.c+12);
734         else
735                 ctr = ctx->Yi.d[3];
736
737 #if !defined(OPENSSL_SMALL_FOOTPRINT)
738         if (16%sizeof(size_t) == 0) do {        /* always true actually */
739                 if (n) {
740                         while (n && len) {
741                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
742                                 --len;
743                                 n = (n+1)%16;
744                         }
745                         if (n==0) GCM_MUL(ctx,Xi);
746                         else {
747                                 ctx->res = n;
748                                 return;
749                         }
750                 }
751 #if defined(STRICT_ALIGNMENT)
752                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
753                         break;
754 #endif
755 #if defined(GHASH) && defined(GHASH_CHUNK)
756                 while (len>=GHASH_CHUNK) {
757                     size_t j=GHASH_CHUNK;
758
759                     while (j) {
760                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
761                         ++ctr;
762                         if (is_endian.little)
763                                 PUTU32(ctx->Yi.c+12,ctr);
764                         else
765                                 ctx->Yi.d[3] = ctr;
766                         for (i=0; i<16; i+=sizeof(size_t))
767                                 *(size_t *)(out+i) =
768                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
769                         out += 16;
770                         in  += 16;
771                         j   -= 16;
772                     }
773                     GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
774                     len -= GHASH_CHUNK;
775                 }
776                 if ((i = (len&(size_t)-16))) {
777                     size_t j=i;
778
779                     while (len>=16) {
780                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
781                         ++ctr;
782                         if (is_endian.little)
783                                 PUTU32(ctx->Yi.c+12,ctr);
784                         else
785                                 ctx->Yi.d[3] = ctr;
786                         for (i=0; i<16; i+=sizeof(size_t))
787                                 *(size_t *)(out+i) =
788                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
789                         out += 16;
790                         in  += 16;
791                         len -= 16;
792                     }
793                     GHASH(ctx,out-j,j);
794                 }
795 #else
796                 while (len>=16) {
797                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
798                         ++ctr;
799                         if (is_endian.little)
800                                 PUTU32(ctx->Yi.c+12,ctr);
801                         else
802                                 ctx->Yi.d[3] = ctr;
803                         for (i=0; i<16; i+=sizeof(size_t))
804                                 *(size_t *)(ctx->Xi.c+i) ^=
805                                 *(size_t *)(out+i) =
806                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
807                         GCM_MUL(ctx,Xi);
808                         out += 16;
809                         in  += 16;
810                         len -= 16;
811                 }
812 #endif
813                 if (len) {
814                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
815                         ++ctr;
816                         if (is_endian.little)
817                                 PUTU32(ctx->Yi.c+12,ctr);
818                         else
819                                 ctx->Yi.d[3] = ctr;
820                         while (len--) {
821                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
822                                 ++n;
823                         }
824                 }
825
826                 ctx->res = n;
827                 return;
828         } while(0);
829 #endif
830         for (i=0;i<len;++i) {
831                 if (n==0) {
832                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
833                         ++ctr;
834                         if (is_endian.little)
835                                 PUTU32(ctx->Yi.c+12,ctr);
836                         else
837                                 ctx->Yi.d[3] = ctr;
838                 }
839                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
840                 n = (n+1)%16;
841                 if (n==0)
842                         GCM_MUL(ctx,Xi);
843         }
844
845         ctx->res = n;
846 }
847
848 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
849                 const unsigned char *in, unsigned char *out,
850                 size_t len)
851 {
852         const union { long one; char little; } is_endian = {1};
853         unsigned int n, ctr;
854         size_t i;
855
856         ctx->len.u[1] += len;
857         n   = ctx->res;
858         if (is_endian.little)
859                 ctr = GETU32(ctx->Yi.c+12);
860         else
861                 ctr = ctx->Yi.d[3];
862
863 #if !defined(OPENSSL_SMALL_FOOTPRINT)
864         if (16%sizeof(size_t) == 0) do {        /* always true actually */
865                 if (n) {
866                         while (n && len) {
867                                 u8 c = *(in++);
868                                 *(out++) = c^ctx->EKi.c[n];
869                                 ctx->Xi.c[n] ^= c;
870                                 --len;
871                                 n = (n+1)%16;
872                         }
873                         if (n==0) GCM_MUL (ctx,Xi);
874                         else {
875                                 ctx->res = n;
876                                 return;
877                         }
878                 }
879 #if defined(STRICT_ALIGNMENT)
880                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
881                         break;
882 #endif
883 #if defined(GHASH) && defined(GHASH_CHUNK)
884                 while (len>=GHASH_CHUNK) {
885                     size_t j=GHASH_CHUNK;
886
887                     GHASH(ctx,in,GHASH_CHUNK);
888                     while (j) {
889                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
890                         ++ctr;
891                         if (is_endian.little)
892                                 PUTU32(ctx->Yi.c+12,ctr);
893                         else
894                                 ctx->Yi.d[3] = ctr;
895                         for (i=0; i<16; i+=sizeof(size_t))
896                                 *(size_t *)(out+i) =
897                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
898                         out += 16;
899                         in  += 16;
900                         j   -= 16;
901                     }
902                     len -= GHASH_CHUNK;
903                 }
904                 if ((i = (len&(size_t)-16))) {
905                     GHASH(ctx,in,i);
906                     while (len>=16) {
907                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
908                         ++ctr;
909                         if (is_endian.little)
910                                 PUTU32(ctx->Yi.c+12,ctr);
911                         else
912                                 ctx->Yi.d[3] = ctr;
913                         for (i=0; i<16; i+=sizeof(size_t))
914                                 *(size_t *)(out+i) =
915                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
916                         out += 16;
917                         in  += 16;
918                         len -= 16;
919                     }
920                 }
921 #else
922                 while (len>=16) {
923                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
924                         ++ctr;
925                         if (is_endian.little)
926                                 PUTU32(ctx->Yi.c+12,ctr);
927                         else
928                                 ctx->Yi.d[3] = ctr;
929                         for (i=0; i<16; i+=sizeof(size_t)) {
930                                 size_t c = *(size_t *)(in+i);
931                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
932                                 *(size_t *)(ctx->Xi.c+i) ^= c;
933                         }
934                         GCM_MUL(ctx,Xi);
935                         out += 16;
936                         in  += 16;
937                         len -= 16;
938                 }
939 #endif
940                 if (len) {
941                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
942                         ++ctr;
943                         if (is_endian.little)
944                                 PUTU32(ctx->Yi.c+12,ctr);
945                         else
946                                 ctx->Yi.d[3] = ctr;
947                         while (len--) {
948                                 u8 c = in[n];
949                                 ctx->Xi.c[n] ^= c;
950                                 out[n] = c^ctx->EKi.c[n];
951                                 ++n;
952                         }
953                 }
954
955                 ctx->res = n;
956                 return;
957         } while(0);
958 #endif
959         for (i=0;i<len;++i) {
960                 u8 c;
961                 if (n==0) {
962                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
963                         ++ctr;
964                         if (is_endian.little)
965                                 PUTU32(ctx->Yi.c+12,ctr);
966                         else
967                                 ctx->Yi.d[3] = ctr;
968                 }
969                 c = in[i];
970                 out[i] ^= ctx->EKi.c[n];
971                 ctx->Xi.c[n] ^= c;
972                 n = (n+1)%16;
973                 if (n==0)
974                         GCM_MUL(ctx,Xi);
975         }
976
977         ctx->res = n;
978 }
979
980 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
981 {
982         const union { long one; char little; } is_endian = {1};
983         u64 alen = ctx->len.u[0]<<3;
984         u64 clen = ctx->len.u[1]<<3;
985
986         if (ctx->res)
987                 GCM_MUL(ctx,Xi);
988
989         if (is_endian.little) {
990 #ifdef BSWAP8
991                 alen = BSWAP8(alen);
992                 clen = BSWAP8(clen);
993 #else
994                 u8 *p = ctx->len.c;
995
996                 ctx->len.u[0] = alen;
997                 ctx->len.u[1] = clen;
998
999                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
1000                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1001 #endif
1002         }
1003
1004         ctx->Xi.u[0] ^= alen;
1005         ctx->Xi.u[1] ^= clen;
1006         GCM_MUL(ctx,Xi);
1007
1008         ctx->Xi.u[0] ^= ctx->EK0.u[0];
1009         ctx->Xi.u[1] ^= ctx->EK0.u[1];
1010 }
1011
1012 #if defined(SELFTEST)
1013 #include <stdio.h>
1014 #include <openssl/aes.h>
1015
1016 /* Test Case 1 */
1017 static const u8 K1[16],
1018                 *P1=NULL,
1019                 *A1=NULL,
1020                 IV1[12],
1021                 *C1=NULL,
1022                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1023
1024 /* Test Case 2 */
1025 #define K2 K1
1026 #define A2 A1
1027 #define IV2 IV1
1028 static const u8 P2[16],
1029                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1030                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1031
1032 /* Test Case 3 */
1033 #define A3 A2
1034 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1035                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1036                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1037                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1038                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1039                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1040                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1041                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1042                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1043                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1044                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4,};
1045
1046 /* Test Case 4 */
1047 #define K4 K3
1048 #define IV4 IV3
1049 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1050                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1051                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1052                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1053                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1054                         0xab,0xad,0xda,0xd2},
1055                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1056                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1057                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1058                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1059                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1060
1061 /* Test Case 5 */
1062 #define K5 K4
1063 #define P5 P4
1064 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1065                         0xab,0xad,0xda,0xd2},
1066                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1067                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1068                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1069                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1070                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1071                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1072
1073 /* Test Case 6 */
1074 #define K6 K5
1075 #define P6 P5
1076 #define A6 A5
1077 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1078                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1079                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1080                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1081                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1082                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1083                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1084                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1085                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1086
1087 /* Test Case 7 */
1088 static const u8 K7[24],
1089                 *P7=NULL,
1090                 *A7=NULL,
1091                 IV7[12],
1092                 *C7=NULL,
1093                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1094
1095 /* Test Case 8 */
1096 #define K8 K7
1097 #define IV8 IV7
1098 #define A8 A7
1099 static const u8 P8[16],
1100                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1101                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1102
1103 /* Test Case 9 */
1104 #define A9 A8
1105 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1106                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1107                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1108                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1109                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1110                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1111                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1112                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1113                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1114                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1115                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1116                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1117
1118 /* Test Case 10 */
1119 #define K10 K9
1120 #define IV10 IV9
1121 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1122                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1123                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1124                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1125                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1126                         0xab,0xad,0xda,0xd2},
1127                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1128                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1129                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1130                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1131                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1132
1133 /* Test Case 11 */
1134 #define K11 K10
1135 #define P11 P10
1136 #define A11 A10
1137 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1138                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1139                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1140                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1141                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1142                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1143
1144 /* Test Case 12 */
1145 #define K12 K11
1146 #define P12 P11
1147 #define A12 A11
1148 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1149                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1150                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1151                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1152                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1153                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1154                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1155                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1156                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1157
1158 /* Test Case 13 */
1159 static const u8 K13[32],
1160                 *P13=NULL,
1161                 *A13=NULL,
1162                 IV13[12],
1163                 *C13=NULL,
1164                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1165
1166 /* Test Case 14 */
1167 #define K14 K13
1168 #define A14 A13
1169 static const u8 P14[16],
1170                 IV14[12],
1171                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1172                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1173
1174 /* Test Case 15 */
1175 #define A15 A14
1176 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1177                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1178                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1179                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1180                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1181                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1182                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1183                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1184                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1185                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1186                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1187                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1188
1189 /* Test Case 16 */
1190 #define K16 K15
1191 #define IV16 IV15
1192 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1193                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1194                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1195                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1196                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1197                         0xab,0xad,0xda,0xd2},
1198                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1199                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1200                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1201                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1202                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1203
1204 /* Test Case 17 */
1205 #define K17 K16
1206 #define P17 P16
1207 #define A17 A16
1208 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1209                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1210                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1211                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1212                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1213                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1214
1215 /* Test Case 18 */
1216 #define K18 K17
1217 #define P18 P17
1218 #define A18 A17
1219 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1220                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1221                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1222                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1223                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1224                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1225                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1226                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1227                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1228
/*
 * TEST_CASE(n): run known-answer test n in both directions.
 *
 * Uses the vectors K<n>, IV<n>, A<n>, P<n>, C<n> and T<n> and expects
 * |ctx| (GCM128_CONTEXT), |key| (AES_KEY) and |ret| (int failure counter)
 * to exist in the enclosing scope.  A<n>, P<n> and C<n> may be NULL
 * pointers, in which case the corresponding step is skipped.
 *
 * Encrypt pass: the result in ctx.Xi must equal T<n> and the output must
 * equal C<n>.  Decrypt pass: same tag check, output must equal P<n>.
 * Each failed pass increments |ret| and prints a message.
 *
 * The scratch buffer is wiped between the two passes so that decryption
 * is exercised with an output buffer that does NOT already contain the
 * ciphertext; otherwise a decryptor that only works in place would pass.
 */
#define TEST_CASE(n)    do {                                    \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
        CRYPTO_gcm128_finish(&ctx);                             \
        if (memcmp(ctx.Xi.c,T##n,16) || (C##n && memcmp(out,C##n,sizeof(out)))) \
                ret++, printf ("encrypt test#%d failed.\n",n);\
        memset(out,0,sizeof(out));                              \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
        CRYPTO_gcm128_finish(&ctx);                             \
        if (memcmp(ctx.Xi.c,T##n,16) || (P##n && memcmp(out,P##n,sizeof(out)))) \
                ret++, printf ("decrypt test#%d failed.\n",n);\
        } while(0)
1246
1247 int main()
1248 {
1249         GCM128_CONTEXT ctx;
1250         AES_KEY key;
1251         int ret=0;
1252
1253         TEST_CASE(1);
1254         TEST_CASE(2);
1255         TEST_CASE(3);
1256         TEST_CASE(4);
1257         TEST_CASE(5);
1258         TEST_CASE(6);
1259         TEST_CASE(7);
1260         TEST_CASE(8);
1261         TEST_CASE(9);
1262         TEST_CASE(10);
1263         TEST_CASE(11);
1264         TEST_CASE(12);
1265         TEST_CASE(13);
1266         TEST_CASE(14);
1267         TEST_CASE(15);
1268         TEST_CASE(16);
1269         TEST_CASE(17);
1270         TEST_CASE(18);
1271
1272 #ifdef OPENSSL_CPUID_OBJ
1273         {
1274         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1275         union { u64 u; u8 c[1024]; } buf;
1276         int i;
1277
1278         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1279         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1280         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1281
1282         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1283         start = OPENSSL_rdtsc();
1284         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1285         gcm_t = OPENSSL_rdtsc() - start;
1286
1287         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1288                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1289                         (block128_f)AES_encrypt);
1290         start = OPENSSL_rdtsc();
1291         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1292                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1293                         (block128_f)AES_encrypt);
1294         ctr_t = OPENSSL_rdtsc() - start;
1295
1296         printf("%.2f-%.2f=%.2f\n",
1297                         gcm_t/(double)sizeof(buf),
1298                         ctr_t/(double)sizeof(buf),
1299                         (gcm_t-ctr_t)/(double)sizeof(buf));
1300 #ifdef GHASH
1301         GHASH(&ctx,buf.c,sizeof(buf));
1302         start = OPENSSL_rdtsc();
1303         for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1304         gcm_t = OPENSSL_rdtsc() - start;
1305         printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
1306 #endif
1307         }
1308 #endif
1309
1310         return ret;
1311 }
1312 #endif