"Jumbo" update for crypto/modes:
[openssl.git] / crypto / modes / gcm128.c
1 /* ====================================================================
2  * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer. 
10  *
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in
13  *    the documentation and/or other materials provided with the
14  *    distribution.
15  *
16  * 3. All advertising materials mentioning features or use of this
17  *    software must display the following acknowledgment:
18  *    "This product includes software developed by the OpenSSL Project
19  *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
20  *
21  * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22  *    endorse or promote products derived from this software without
23  *    prior written permission. For written permission, please contact
24  *    openssl-core@openssl.org.
25  *
26  * 5. Products derived from this software may not be called "OpenSSL"
27  *    nor may "OpenSSL" appear in their names without prior written
28  *    permission of the OpenSSL Project.
29  *
30  * 6. Redistributions of any form whatsoever must retain the following
31  *    acknowledgment:
32  *    "This product includes software developed by the OpenSSL Project
33  *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
34  *
35  * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36  * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE OpenSSL PROJECT OR
39  * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46  * OF THE POSSIBILITY OF SUCH DAMAGE.
47  * ====================================================================
48  */
49
50 #include "modes_lcl.h"
51 #include <string.h>
52
53 #ifndef MODES_DEBUG
54 # ifndef NDEBUG
55 #  define NDEBUG
56 # endif
57 #endif
58 #include <assert.h>
59
60 typedef struct { u64 hi,lo; } u128;
61
62 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
63 /* redefine, because alignment is ensured */
64 #undef  GETU32
65 #define GETU32(p)       BSWAP4(*(const u32 *)(p))
66 #undef  PUTU32
67 #define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
68 #endif
69
/* Move a 16-bit constant into the topmost 16 bits of a size_t. */
#define PACK(s)		((size_t)(s) << (8*sizeof(size_t) - 16))

/*
 * TABLE_BITS may be 8, 4 or 1, but 8 should never be selected: it is
 * effectively reserved for testing. In an ideal world the "8-bit"
 * code would run twice as fast as the "4-bit" code. In practice, with
 * gcc-generated x86 code it was observed to be only ~50% faster, and
 * on x86_64 ~75% faster. The latter is much closer to optimal, but
 * the shortfall shows that look-ups in the pre-computed tables sit on
 * the critical path. Since those tables are fairly large (4KB per key
 * plus 1KB shared), execution time becomes sensitive to cache timing.
 * It has not actually been proven, but the 4-bit procedure is
 * believed to offer adequate all-round performance.
 */
#ifdef  TABLE_BITS
# undef TABLE_BITS
#endif
#define TABLE_BITS	4
88
89 #if     TABLE_BITS==8
90
91 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
92 {
93         int  i, j;
94         u128 V;
95
96         Htable[0].hi = 0;
97         Htable[0].lo = 0;
98         V.hi = H[0];
99         V.lo = H[1];
100
101         for (Htable[128]=V, i=64; i>0; i>>=1) {
102                 if (sizeof(size_t)==8) {
103                         u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
104                         V.lo  = (V.hi<<63)|(V.lo>>1);
105                         V.hi  = (V.hi>>1 )^T;
106                 }
107                 else {
108                         u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
109                         V.lo  = (V.hi<<63)|(V.lo>>1);
110                         V.hi  = (V.hi>>1 )^((u64)T<<32);
111                 }
112                 Htable[i] = V;
113         }
114
115         for (i=2; i<256; i<<=1) {
116                 u128 *Hi = Htable+i, H0 = *Hi;
117                 for (j=1; j<i; ++j) {
118                         Hi[j].hi = H0.hi^Htable[j].hi;
119                         Hi[j].lo = H0.lo^Htable[j].lo;
120                 }
121         }
122 }
123
124 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
125 {
126         u128 Z = { 0, 0};
127         const u8 *xi = (const u8 *)Xi+15;
128         size_t rem, n = *xi;
129         const union { long one; char little; } is_endian = {1};
130         static const size_t rem_8bit[256] = {
131                 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
132                 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
133                 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
134                 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
135                 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
136                 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
137                 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
138                 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
139                 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
140                 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
141                 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
142                 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
143                 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
144                 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
145                 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
146                 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
147                 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
148                 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
149                 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
150                 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
151                 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
152                 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
153                 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
154                 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
155                 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
156                 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
157                 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
158                 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
159                 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
160                 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
161                 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
162                 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
163                 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
164                 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
165                 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
166                 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
167                 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
168                 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
169                 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
170                 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
171                 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
172                 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
173                 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
174                 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
175                 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
176                 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
177                 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
178                 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
179                 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
180                 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
181                 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
182                 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
183                 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
184                 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
185                 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
186                 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
187                 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
188                 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
189                 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
190                 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
191                 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
192                 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
193                 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
194                 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
195
196         while (1) {
197                 Z.hi ^= Htable[n].hi;
198                 Z.lo ^= Htable[n].lo;
199
200                 if ((u8 *)Xi==xi)       break;
201
202                 n = *(--xi);
203
204                 rem  = (size_t)Z.lo&0xff;
205                 Z.lo = (Z.hi<<56)|(Z.lo>>8);
206                 Z.hi = (Z.hi>>8);
207                 if (sizeof(size_t)==8)
208                         Z.hi ^= rem_8bit[rem];
209                 else
210                         Z.hi ^= (u64)rem_8bit[rem]<<32;
211         }
212
213         if (is_endian.little) {
214 #ifdef BSWAP8
215                 Xi[0] = BSWAP8(Z.hi);
216                 Xi[1] = BSWAP8(Z.lo);
217 #else
218                 u8 *p = (u8 *)Xi;
219                 u32 v;
220                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
221                 v = (u32)(Z.hi);        PUTU32(p+4,v);
222                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
223                 v = (u32)(Z.lo);        PUTU32(p+12,v);
224 #endif
225         }
226         else {
227                 Xi[0] = Z.hi;
228                 Xi[1] = Z.lo;
229         }
230 }
231 #define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
232
233 #elif   TABLE_BITS==4
234
235 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
236 {
237         u128 V;
238 #if defined(OPENSSL_SMALL_FOOTPRINT)
239         int  i;
240 #endif
241 #define REDUCE(V) do { \
242         if (sizeof(size_t)==8) { \
243                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
244                 V.lo  = (V.hi<<63)|(V.lo>>1); \
245                 V.hi  = (V.hi>>1 )^T; \
246         } \
247         else { \
248                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
249                 V.lo  = (V.hi<<63)|(V.lo>>1); \
250                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
251         } \
252 } while(0)
253
254         Htable[0].hi = 0;
255         Htable[0].lo = 0;
256         V.hi = H[0];
257         V.lo = H[1];
258
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
260         for (Htable[8]=V, i=4; i>0; i>>=1) {
261                 REDUCE(V);
262                 Htable[i] = V;
263         }
264
265         for (i=2; i<16; i<<=1) {
266                 u128 *Hi = Htable+i;
267                 int   j;
268                 for (V=*Hi, j=1; j<i; ++j) {
269                         Hi[j].hi = V.hi^Htable[j].hi;
270                         Hi[j].lo = V.lo^Htable[j].lo;
271                 }
272         }
273 #else
274         Htable[8] = V;
275         REDUCE(V);
276         Htable[4] = V;
277         REDUCE(V);
278         Htable[2] = V;
279         REDUCE(V);
280         Htable[1] = V;
281         Htable[3].hi  = V.hi^Htable[2].hi, Htable[3].lo  = V.lo^Htable[2].lo;
282         V=Htable[4];
283         Htable[5].hi  = V.hi^Htable[1].hi, Htable[5].lo  = V.lo^Htable[1].lo;
284         Htable[6].hi  = V.hi^Htable[2].hi, Htable[6].lo  = V.lo^Htable[2].lo;
285         Htable[7].hi  = V.hi^Htable[3].hi, Htable[7].lo  = V.lo^Htable[3].lo;
286         V=Htable[8];
287         Htable[9].hi  = V.hi^Htable[1].hi, Htable[9].lo  = V.lo^Htable[1].lo;
288         Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
289         Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
290         Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
291         Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
292         Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
293         Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
294 #endif
295 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
296         /*
297          * ARM assembler expects specific dword order in Htable.
298          */
299         {
300         int j;
301         const union { long one; char little; } is_endian = {1};
302
303         if (is_endian.little)
304                 for (j=0;j<16;++j) {
305                         V = Htable[j];
306                         Htable[j].hi = V.lo;
307                         Htable[j].lo = V.hi;
308                 }
309         else
310                 for (j=0;j<16;++j) {
311                         V = Htable[j];
312                         Htable[j].hi = V.lo<<32|V.lo>>32;
313                         Htable[j].lo = V.hi<<32|V.hi>>32;
314                 }
315         }
316 #endif
317 #undef  REDUCE
318 }
319
320 #ifndef GHASH_ASM
321 static const size_t rem_4bit[16] = {
322         PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
323         PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
324         PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
325         PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
326
327 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
328 {
329         u128 Z;
330         int cnt = 15;
331         size_t rem, nlo, nhi;
332         const union { long one; char little; } is_endian = {1};
333
334         nlo  = ((const u8 *)Xi)[15];
335         nhi  = nlo>>4;
336         nlo &= 0xf;
337
338         Z.hi = Htable[nlo].hi;
339         Z.lo = Htable[nlo].lo;
340
341         while (1) {
342                 rem  = (size_t)Z.lo&0xf;
343                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
344                 Z.hi = (Z.hi>>4);
345                 if (sizeof(size_t)==8)
346                         Z.hi ^= rem_4bit[rem];
347                 else
348                         Z.hi ^= (u64)rem_4bit[rem]<<32;
349
350                 Z.hi ^= Htable[nhi].hi;
351                 Z.lo ^= Htable[nhi].lo;
352
353                 if (--cnt<0)            break;
354
355                 nlo  = ((const u8 *)Xi)[cnt];
356                 nhi  = nlo>>4;
357                 nlo &= 0xf;
358
359                 rem  = (size_t)Z.lo&0xf;
360                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
361                 Z.hi = (Z.hi>>4);
362                 if (sizeof(size_t)==8)
363                         Z.hi ^= rem_4bit[rem];
364                 else
365                         Z.hi ^= (u64)rem_4bit[rem]<<32;
366
367                 Z.hi ^= Htable[nlo].hi;
368                 Z.lo ^= Htable[nlo].lo;
369         }
370
371         if (is_endian.little) {
372 #ifdef BSWAP8
373                 Xi[0] = BSWAP8(Z.hi);
374                 Xi[1] = BSWAP8(Z.lo);
375 #else
376                 u8 *p = (u8 *)Xi;
377                 u32 v;
378                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
379                 v = (u32)(Z.hi);        PUTU32(p+4,v);
380                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
381                 v = (u32)(Z.lo);        PUTU32(p+12,v);
382 #endif
383         }
384         else {
385                 Xi[0] = Z.hi;
386                 Xi[1] = Z.lo;
387         }
388 }
389
390 #if !defined(OPENSSL_SMALL_FOOTPRINT)
391 /*
392  * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
393  * details... Compiler-generated code doesn't seem to give any
394  * performance improvement, at least not on x86[_64]. It's here
395  * mostly as reference and a placeholder for possible future
396  * non-trivial optimization[s]...
397  */
398 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
399                                 const u8 *inp,size_t len)
400 {
401     u128 Z;
402     int cnt;
403     size_t rem, nlo, nhi;
404     const union { long one; char little; } is_endian = {1};
405
406     do {
407         cnt  = 15;
408         nlo  = ((const u8 *)Xi)[15];
409         nlo ^= inp[15];
410         nhi  = nlo>>4;
411         nlo &= 0xf;
412
413         Z.hi = Htable[nlo].hi;
414         Z.lo = Htable[nlo].lo;
415
416         while (1) {
417                 rem  = (size_t)Z.lo&0xf;
418                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
419                 Z.hi = (Z.hi>>4);
420                 if (sizeof(size_t)==8)
421                         Z.hi ^= rem_4bit[rem];
422                 else
423                         Z.hi ^= (u64)rem_4bit[rem]<<32;
424
425                 Z.hi ^= Htable[nhi].hi;
426                 Z.lo ^= Htable[nhi].lo;
427
428                 if (--cnt<0)            break;
429
430                 nlo  = ((const u8 *)Xi)[cnt];
431                 nlo ^= inp[cnt];
432                 nhi  = nlo>>4;
433                 nlo &= 0xf;
434
435                 rem  = (size_t)Z.lo&0xf;
436                 Z.lo = (Z.hi<<60)|(Z.lo>>4);
437                 Z.hi = (Z.hi>>4);
438                 if (sizeof(size_t)==8)
439                         Z.hi ^= rem_4bit[rem];
440                 else
441                         Z.hi ^= (u64)rem_4bit[rem]<<32;
442
443                 Z.hi ^= Htable[nlo].hi;
444                 Z.lo ^= Htable[nlo].lo;
445         }
446
447         if (is_endian.little) {
448 #ifdef BSWAP8
449                 Xi[0] = BSWAP8(Z.hi);
450                 Xi[1] = BSWAP8(Z.lo);
451 #else
452                 u8 *p = (u8 *)Xi;
453                 u32 v;
454                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
455                 v = (u32)(Z.hi);        PUTU32(p+4,v);
456                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
457                 v = (u32)(Z.lo);        PUTU32(p+12,v);
458 #endif
459         }
460         else {
461                 Xi[0] = Z.hi;
462                 Xi[1] = Z.lo;
463         }
464     } while (inp+=16, len-=16);
465 }
466 #endif
467 #else
468 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
469 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
470 #endif
471
/* The second argument names the context member to multiply. */
#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit((ctx)->Xi.u,(ctx)->Htable)
#if !defined(OPENSSL_SMALL_FOOTPRINT) || defined(GHASH_ASM)
/* Hash len bytes at in into ctx->Xi. */
#define GHASH(in,len,ctx) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,(in),(len))
/*
 * GHASH_CHUNK is a "stride parameter" meant to mitigate the cache
 * thrashing effect. In other words, the idea is to hash data while
 * it's still in L1 cache after the encryption pass...
 */
#define GHASH_CHUNK       1024
#endif
480
481 #else   /* TABLE_BITS */
482
483 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
484 {
485         u128 V,Z = { 0,0 };
486         long X;
487         int  i,j;
488         const long *xi = (const long *)Xi;
489         const union { long one; char little; } is_endian = {1};
490
491         V.hi = H[0];    /* H is in host byte order, no byte swapping */
492         V.lo = H[1];
493
494         for (j=0; j<16/sizeof(long); ++j) {
495                 if (is_endian.little) {
496                         if (sizeof(long)==8) {
497 #ifdef BSWAP8
498                                 X = (long)(BSWAP8(xi[j]));
499 #else
500                                 const u8 *p = (const u8 *)(xi+j);
501                                 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
502 #endif
503                         }
504                         else {
505                                 const u8 *p = (const u8 *)(xi+j);
506                                 X = (long)GETU32(p);
507                         }
508                 }
509                 else
510                         X = xi[j];
511
512                 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
513                         u64 M = (u64)(X>>(8*sizeof(long)-1));
514                         Z.hi ^= V.hi&M;
515                         Z.lo ^= V.lo&M;
516
517                         if (sizeof(size_t)==8) {
518                                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1));
519                                 V.lo  = (V.hi<<63)|(V.lo>>1);
520                                 V.hi  = (V.hi>>1 )^T;
521                         }
522                         else {
523                                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1));
524                                 V.lo  = (V.hi<<63)|(V.lo>>1);
525                                 V.hi  = (V.hi>>1 )^((u64)T<<32);
526                         }
527                                 
528                 }
529         }
530
531         if (is_endian.little) {
532 #ifdef BSWAP8
533                 Xi[0] = BSWAP8(Z.hi);
534                 Xi[1] = BSWAP8(Z.lo);
535 #else
536                 u8 *p = (u8 *)Xi;
537                 u32 v;
538                 v = (u32)(Z.hi>>32);    PUTU32(p,v);
539                 v = (u32)(Z.hi);        PUTU32(p+4,v);
540                 v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
541                 v = (u32)(Z.lo);        PUTU32(p+12,v);
542 #endif
543         }
544         else {
545                 Xi[0] = Z.hi;
546                 Xi[1] = Z.lo;
547         }
548 }
549 #define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
550
551 #endif
552
553 struct gcm128_context {
554         /* Following 6 names follow names in GCM specification */
555         union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
556                                                 Xi,H,len;
557         /* Pre-computed table used by gcm_gmult_* */
558 #if TABLE_BITS==8
559         u128 Htable[256];
560 #else
561         u128 Htable[16];
562 #endif
563         unsigned int res, pad;
564         block128_f block;
565         void *key;
566 };
567
568 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
569 {
570         const union { long one; char little; } is_endian = {1};
571
572         memset(ctx,0,sizeof(*ctx));
573         ctx->block = block;
574         ctx->key   = key;
575
576         (*block)(ctx->H.c,ctx->H.c,key);
577
578         if (is_endian.little) {
579                 /* H is stored in host byte order */
580 #ifdef BSWAP8
581                 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
582                 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
583 #else
584                 u8 *p = ctx->H.c;
585                 u64 hi,lo;
586                 hi = (u64)GETU32(p)  <<32|GETU32(p+4);
587                 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
588                 ctx->H.u[0] = hi;
589                 ctx->H.u[1] = lo;
590 #endif
591         }
592
593 #if     TABLE_BITS==8
594         gcm_init_8bit(ctx->Htable,ctx->H.u);
595 #elif   TABLE_BITS==4
596         gcm_init_4bit(ctx->Htable,ctx->H.u);
597 #endif
598 }
599
600 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
601 {
602         const union { long one; char little; } is_endian = {1};
603         unsigned int ctr;
604
605         ctx->Yi.u[0]  = 0;
606         ctx->Yi.u[1]  = 0;
607         ctx->Xi.u[0]  = 0;
608         ctx->Xi.u[1]  = 0;
609         ctx->len.u[0] = 0;
610         ctx->len.u[1] = 0;
611         ctx->res = 0;
612
613         if (len==12) {
614                 memcpy(ctx->Yi.c,iv,12);
615                 ctx->Yi.c[15]=1;
616                 ctr=1;
617         }
618         else {
619                 size_t i;
620                 u64 len0 = len;
621
622                 while (len>=16) {
623                         for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
624                         GCM_MUL(ctx,Yi);
625                         iv += 16;
626                         len -= 16;
627                 }
628                 if (len) {
629                         for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
630                         GCM_MUL(ctx,Yi);
631                 }
632                 len0 <<= 3;
633                 if (is_endian.little) {
634 #ifdef BSWAP8
635                         ctx->Yi.u[1]  ^= BSWAP8(len0);
636 #else
637                         ctx->Yi.c[8]  ^= (u8)(len0>>56);
638                         ctx->Yi.c[9]  ^= (u8)(len0>>48);
639                         ctx->Yi.c[10] ^= (u8)(len0>>40);
640                         ctx->Yi.c[11] ^= (u8)(len0>>32);
641                         ctx->Yi.c[12] ^= (u8)(len0>>24);
642                         ctx->Yi.c[13] ^= (u8)(len0>>16);
643                         ctx->Yi.c[14] ^= (u8)(len0>>8);
644                         ctx->Yi.c[15] ^= (u8)(len0);
645 #endif
646                 }
647                 else
648                         ctx->Yi.u[1]  ^= len0;
649
650                 GCM_MUL(ctx,Yi);
651
652                 if (is_endian.little)
653                         ctr = GETU32(ctx->Yi.c+12);
654                 else
655                         ctr = ctx->Yi.d[3];
656         }
657
658         (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
659         ++ctr;
660         if (is_endian.little)
661                 PUTU32(ctx->Yi.c+12,ctr);
662         else
663                 ctx->Yi.d[3] = ctr;
664 }
665
666 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
667 {
668         size_t i;
669
670         ctx->len.u[0] += len;
671
672 #ifdef GHASH
673         if ((i = (len&(size_t)-16))) {
674                 GHASH(aad,i,ctx);
675                 aad += i;
676                 len -= i;
677         }
678 #else
679         while (len>=16) {
680                 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
681                 GCM_MUL(ctx,Xi);
682                 aad += 16;
683                 len -= 16;
684         }
685 #endif
686         if (len) {
687                 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
688                 GCM_MUL(ctx,Xi);
689         }
690 }
691
692 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
693                 const unsigned char *in, unsigned char *out,
694                 size_t len)
695 {
696         const union { long one; char little; } is_endian = {1};
697         unsigned int n, ctr;
698         size_t i;
699
700         ctx->len.u[1] += len;
701         n   = ctx->res;
702         if (is_endian.little)
703                 ctr = GETU32(ctx->Yi.c+12);
704         else
705                 ctr = ctx->Yi.d[3];
706
707 #if !defined(OPENSSL_SMALL_FOOTPRINT)
708         if (16%sizeof(size_t) == 0) do {        /* always true actually */
709                 if (n) {
710                         while (n && len) {
711                                 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
712                                 --len;
713                                 n = (n+1)%16;
714                         }
715                         if (n==0) GCM_MUL(ctx,Xi);
716                         else {
717                                 ctx->res = n;
718                                 return;
719                         }
720                 }
721 #if defined(STRICT_ALIGNMENT)
722                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
723                         break;
724 #endif
725 #if defined(GHASH) && defined(GHASH_CHUNK)
726                 while (len>=GHASH_CHUNK) {
727                     size_t j=GHASH_CHUNK;
728
729                     while (j) {
730                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
731                         ++ctr;
732                         if (is_endian.little)
733                                 PUTU32(ctx->Yi.c+12,ctr);
734                         else
735                                 ctx->Yi.d[3] = ctr;
736                         for (i=0; i<16; i+=sizeof(size_t))
737                                 *(size_t *)(out+i) =
738                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
739                         out += 16;
740                         in  += 16;
741                         j   -= 16;
742                     }
743                     GHASH(out-GHASH_CHUNK,GHASH_CHUNK,ctx);
744                     len -= GHASH_CHUNK;
745                 }
746                 if ((i = (len&(size_t)-16))) {
747                     size_t j=i;
748
749                     while (len>=16) {
750                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
751                         ++ctr;
752                         if (is_endian.little)
753                                 PUTU32(ctx->Yi.c+12,ctr);
754                         else
755                                 ctx->Yi.d[3] = ctr;
756                         for (i=0; i<16; i+=sizeof(size_t))
757                                 *(size_t *)(out+i) =
758                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
759                         out += 16;
760                         in  += 16;
761                         len -= 16;
762                     }
763                     GHASH(out-j,j,ctx);
764                 }
765 #else
766                 while (len>=16) {
767                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
768                         ++ctr;
769                         if (is_endian.little)
770                                 PUTU32(ctx->Yi.c+12,ctr);
771                         else
772                                 ctx->Yi.d[3] = ctr;
773                         for (i=0; i<16; i+=sizeof(size_t))
774                                 *(size_t *)(ctx->Xi.c+i) ^=
775                                 *(size_t *)(out+i) =
776                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
777                         GCM_MUL(ctx,Xi);
778                         out += 16;
779                         in  += 16;
780                         len -= 16;
781                 }
782 #endif
783                 if (len) {
784                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
785                         ++ctr;
786                         if (is_endian.little)
787                                 PUTU32(ctx->Yi.c+12,ctr);
788                         else
789                                 ctx->Yi.d[3] = ctr;
790                         while (len--) {
791                                 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
792                                 ++n;
793                         }
794                 }
795
796                 ctx->res = n;
797                 return;
798         } while(0);
799 #endif
800         for (i=0;i<len;++i) {
801                 if (n==0) {
802                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
803                         ++ctr;
804                         if (is_endian.little)
805                                 PUTU32(ctx->Yi.c+12,ctr);
806                         else
807                                 ctx->Yi.d[3] = ctr;
808                 }
809                 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
810                 n = (n+1)%16;
811                 if (n==0)
812                         GCM_MUL(ctx,Xi);
813         }
814
815         ctx->res = n;
816 }
817
818 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
819                 const unsigned char *in, unsigned char *out,
820                 size_t len)
821 {
822         const union { long one; char little; } is_endian = {1};
823         unsigned int n, ctr;
824         size_t i;
825
826         ctx->len.u[1] += len;
827         n   = ctx->res;
828         if (is_endian.little)
829                 ctr = GETU32(ctx->Yi.c+12);
830         else
831                 ctr = ctx->Yi.d[3];
832
833 #if !defined(OPENSSL_SMALL_FOOTPRINT)
834         if (16%sizeof(size_t) == 0) do {        /* always true actually */
835                 if (n) {
836                         while (n && len) {
837                                 u8 c = *(in++);
838                                 *(out++) = c^ctx->EKi.c[n];
839                                 ctx->Xi.c[n] ^= c;
840                                 --len;
841                                 n = (n+1)%16;
842                         }
843                         if (n==0) GCM_MUL (ctx,Xi);
844                         else {
845                                 ctx->res = n;
846                                 return;
847                         }
848                 }
849 #if defined(STRICT_ALIGNMENT)
850                 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
851                         break;
852 #endif
853 #if defined(GHASH) && defined(GHASH_CHUNK)
854                 while (len>=GHASH_CHUNK) {
855                     size_t j=GHASH_CHUNK;
856
857                     GHASH(in,GHASH_CHUNK,ctx);
858                     while (j) {
859                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
860                         ++ctr;
861                         if (is_endian.little)
862                                 PUTU32(ctx->Yi.c+12,ctr);
863                         else
864                                 ctx->Yi.d[3] = ctr;
865                         for (i=0; i<16; i+=sizeof(size_t))
866                                 *(size_t *)(out+i) =
867                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
868                         out += 16;
869                         in  += 16;
870                         j   -= 16;
871                     }
872                     len -= GHASH_CHUNK;
873                 }
874                 if ((i = (len&(size_t)-16))) {
875                     GHASH(in,i,ctx);
876                     while (len>=16) {
877                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
878                         ++ctr;
879                         if (is_endian.little)
880                                 PUTU32(ctx->Yi.c+12,ctr);
881                         else
882                                 ctx->Yi.d[3] = ctr;
883                         for (i=0; i<16; i+=sizeof(size_t))
884                                 *(size_t *)(out+i) =
885                                 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
886                         out += 16;
887                         in  += 16;
888                         len -= 16;
889                     }
890                 }
891 #else
892                 while (len>=16) {
893                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
894                         ++ctr;
895                         if (is_endian.little)
896                                 PUTU32(ctx->Yi.c+12,ctr);
897                         else
898                                 ctx->Yi.d[3] = ctr;
899                         for (i=0; i<16; i+=sizeof(size_t)) {
900                                 size_t c = *(size_t *)(in+i);
901                                 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
902                                 *(size_t *)(ctx->Xi.c+i) ^= c;
903                         }
904                         GCM_MUL(ctx,Xi);
905                         out += 16;
906                         in  += 16;
907                         len -= 16;
908                 }
909 #endif
910                 if (len) {
911                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
912                         ++ctr;
913                         if (is_endian.little)
914                                 PUTU32(ctx->Yi.c+12,ctr);
915                         else
916                                 ctx->Yi.d[3] = ctr;
917                         while (len--) {
918                                 u8 c = in[n];
919                                 ctx->Xi.c[n] ^= c;
920                                 out[n] = c^ctx->EKi.c[n];
921                                 ++n;
922                         }
923                 }
924
925                 ctx->res = n;
926                 return;
927         } while(0);
928 #endif
929         for (i=0;i<len;++i) {
930                 u8 c;
931                 if (n==0) {
932                         (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
933                         ++ctr;
934                         if (is_endian.little)
935                                 PUTU32(ctx->Yi.c+12,ctr);
936                         else
937                                 ctx->Yi.d[3] = ctr;
938                 }
939                 c = in[i];
940                 out[i] ^= ctx->EKi.c[n];
941                 ctx->Xi.c[n] ^= c;
942                 n = (n+1)%16;
943                 if (n==0)
944                         GCM_MUL(ctx,Xi);
945         }
946
947         ctx->res = n;
948 }
949
950 void CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx)
951 {
952         const union { long one; char little; } is_endian = {1};
953         u64 alen = ctx->len.u[0]<<3;
954         u64 clen = ctx->len.u[1]<<3;
955
956         if (ctx->res)
957                 GCM_MUL(ctx,Xi);
958
959         if (is_endian.little) {
960 #ifdef BSWAP8
961                 alen = BSWAP8(alen);
962                 clen = BSWAP8(clen);
963 #else
964                 u8 *p = ctx->len.c;
965
966                 ctx->len.u[0] = alen;
967                 ctx->len.u[1] = clen;
968
969                 alen = (u64)GETU32(p)  <<32|GETU32(p+4);
970                 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
971 #endif
972         }
973
974         ctx->Xi.u[0] ^= alen;
975         ctx->Xi.u[1] ^= clen;
976         GCM_MUL(ctx,Xi);
977
978         ctx->Xi.u[0] ^= ctx->EK0.u[0];
979         ctx->Xi.u[1] ^= ctx->EK0.u[1];
980 }
981
982 #if defined(SELFTEST)
983 #include <stdio.h>
984 #include <openssl/aes.h>
985
986 /* Test Case 1: 16-byte key (128-bit AES via TEST_CASE) and 12-byte IV,
 * both all-zero because they are static objects without an initializer.
 * P1/A1/C1 are NULL, so TEST_CASE skips the AAD and data calls and only
 * the tag T1 is compared.
 * NOTE(review): values look like the published GCM specification test
 * vectors -- confirm against the spec before editing any byte. */
987 static const u8 K1[16],
988                 *P1=NULL,
989                 *A1=NULL,
990                 IV1[12],
991                 *C1=NULL,
992                 T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
993
994 /* Test Case 2: key, AAD (NULL) and IV reused from case 1; one all-zero
 * 16-byte plaintext block P2 with expected ciphertext C2 and tag T2. */
995 #define K2 K1
996 #define A2 A1
997 #define IV2 IV1
998 static const u8 P2[16],
999                 C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1000                 T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1001
1002 /* Test Case 3: 16-byte key K3, 12-byte IV3, 64-byte plaintext (four full
 * blocks), no AAD (A3 resolves to the NULL pointer A1). */
1003 #define A3 A2
1004 static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1005                 P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1006                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1007                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1008                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1009                 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1010                 C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1011                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1012                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1013                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1014                 T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4,};
1015
1016 /* Test Case 4: key and IV reused from case 3; 60-byte plaintext (P3
 * without its last four bytes, so the final block is partial) and a
 * 20-byte AAD A4. */
1017 #define K4 K3
1018 #define IV4 IV3
1019 static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1020                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1021                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1022                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1023                 A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1024                         0xab,0xad,0xda,0xd2},
1025                 C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1026                         0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1027                         0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1028                         0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1029                 T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1030
1031 /* Test Case 5: key and plaintext reused from case 4, 20-byte AAD A5
 * (same bytes as A4), and an 8-byte IV5, i.e. an IV that is not 12
 * bytes long. */
1032 #define K5 K4
1033 #define P5 P4
1034 static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1035                         0xab,0xad,0xda,0xd2},
1036                 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1037                 C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1038                         0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1039                         0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1040                         0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1041                 T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1042
1043 /* Test Case 6: key, plaintext and AAD reused from case 5, with a
 * 60-byte IV6 (longer than one block). */
1044 #define K6 K5
1045 #define P6 P5
1046 #define A6 A5
1047 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1048                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1049                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1050                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1051                 C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1052                         0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1053                         0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1054                         0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1055                 T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1056
1057 /* Test Case 7: 24-byte all-zero key (192-bit AES via TEST_CASE) and
 * 12-byte all-zero IV; no plaintext or AAD (NULL pointers), so only the
 * tag T7 is compared. */
1058 static const u8 K7[24],
1059                 *P7=NULL,
1060                 *A7=NULL,
1061                 IV7[12],
1062                 *C7=NULL,
1063                 T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1064
1065 /* Test Case 8: key, IV and AAD (NULL) reused from case 7; one all-zero
 * 16-byte plaintext block P8. */
1066 #define K8 K7
1067 #define IV8 IV7
1068 #define A8 A7
1069 static const u8 P8[16],
1070                 C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1071                 T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1072
1073 /* Test Case 9: 24-byte key K9, 12-byte IV9, 64-byte plaintext (four
 * full blocks), no AAD (A9 resolves to the NULL pointer A7). */
1074 #define A9 A8
1075 static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1076                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1077                 P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1078                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1079                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1080                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1081                 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1082                 C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1083                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1084                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1085                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1086                 T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1087
1088 /* Test Case 10: key and IV reused from case 9; 60-byte plaintext (final
 * block partial) and a 20-byte AAD A10. */
1089 #define K10 K9
1090 #define IV10 IV9
1091 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1092                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1093                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1094                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1095                 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1096                         0xab,0xad,0xda,0xd2},
1097                 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1098                         0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1099                         0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1100                         0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1101                 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1102
1103 /* Test Case 11: key, plaintext and AAD reused from case 10, with an
 * 8-byte IV11 (not 12 bytes long). */
1104 #define K11 K10
1105 #define P11 P10
1106 #define A11 A10
1107 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1108                 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1109                         0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1110                         0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1111                         0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1112                 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1113
1114 /* Test Case 12: key, plaintext and AAD reused from case 11, with a
 * 60-byte IV12 (longer than one block). */
1115 #define K12 K11
1116 #define P12 P11
1117 #define A12 A11
1118 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1119                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1120                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1121                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1122                 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1123                         0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1124                         0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1125                         0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1126                 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1127
1128 /* Test Case 13: 32-byte all-zero key (256-bit AES via TEST_CASE) and
 * 12-byte all-zero IV; no plaintext or AAD (NULL pointers), so only the
 * tag T13 is compared. */
1129 static const u8 K13[32],
1130                 *P13=NULL,
1131                 *A13=NULL,
1132                 IV13[12],
1133                 *C13=NULL,
1134                 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1135
1136 /* Test Case 14: key and AAD (NULL) reused from case 13; one all-zero
 * 16-byte plaintext block P14 and its own all-zero 12-byte IV14. */
1137 #define K14 K13
1138 #define A14 A13
1139 static const u8 P14[16],
1140                 IV14[12],
1141                 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1142                 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1143
1144 /* Test Case 15: 32-byte key K15, 12-byte IV15, 64-byte plaintext (four
 * full blocks), no AAD (A15 resolves to the NULL pointer A13). */
1145 #define A15 A14
1146 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1147                         0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1148                 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1149                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1150                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1151                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1152                 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1153                 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1154                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1155                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1156                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1157                 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1158
1159 /* Test Case 16: key and IV reused from case 15; 60-byte plaintext (final
 * block partial) and a 20-byte AAD A16. */
1160 #define K16 K15
1161 #define IV16 IV15
1162 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1163                         0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1164                         0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1165                         0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1166                 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1167                         0xab,0xad,0xda,0xd2},
1168                 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1169                         0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1170                         0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1171                         0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1172                 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1173
1174 /* Test Case 17: key, plaintext and AAD reused from case 16, with an
 * 8-byte IV17 (not 12 bytes long). */
1175 #define K17 K16
1176 #define P17 P16
1177 #define A17 A16
1178 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1179                 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1180                         0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1181                         0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1182                         0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1183                 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1184
1185 /* Test Case 18: key, plaintext and AAD reused from case 17, with a
 * 60-byte IV18 (longer than one block). */
1186 #define K18 K17
1187 #define P18 P17
1188 #define A18 A17
1189 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1190                         0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1191                         0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1192                         0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1193                 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1194                         0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1195                         0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1196                         0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1197                 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1198
/*
 * TEST_CASE(n): run known-answer vector n through encryption and then
 * decryption. Must be expanded where `ctx` (GCM128_CONTEXT), `key`
 * (AES_KEY) and an int `ret` are in scope.
 *
 * Steps: schedule the AES key with sizeof(K<n>)*8 bits, initialise the GCM
 * context, set the IV, feed the AAD and the data only when A<n> / P<n>
 * (resp. C<n>) are non-NULL, then finish and compare the 16 bytes of
 * ctx.Xi.c against T<n> and the output buffer against C<n> (resp. P<n>).
 * Each failed direction increments `ret` and prints a message.
 *
 * `out` is sized by sizeof(P<n>); for cases where P<n> is a NULL pointer
 * that is just the pointer size and the buffer is never used.
 */
1199 #define TEST_CASE(n)    do {                                    \
1200         u8 out[sizeof(P##n)];                                   \
1201         AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
1202         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
1203         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1204         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1205         if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
1206         CRYPTO_gcm128_finish(&ctx);                             \
1207         if (memcmp(ctx.Xi.c,T##n,16) || (C##n && memcmp(out,C##n,sizeof(out)))) \
1208                 ret++, printf ("encrypt test#%d failed.\n",n);\
1209         CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
1210         if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
1211         if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
1212         CRYPTO_gcm128_finish(&ctx);                             \
1213         if (memcmp(ctx.Xi.c,T##n,16) || (P##n && memcmp(out,P##n,sizeof(out)))) \
1214                 ret++, printf ("decrypt test#%d failed.\n",n);\
1215         } while(0)
1216
1217 int main()
1218 {
1219         GCM128_CONTEXT ctx;
1220         AES_KEY key;
1221         int ret=0;
1222
1223         TEST_CASE(1);
1224         TEST_CASE(2);
1225         TEST_CASE(3);
1226         TEST_CASE(4);
1227         TEST_CASE(5);
1228         TEST_CASE(6);
1229         TEST_CASE(7);
1230         TEST_CASE(8);
1231         TEST_CASE(9);
1232         TEST_CASE(10);
1233         TEST_CASE(11);
1234         TEST_CASE(12);
1235         TEST_CASE(13);
1236         TEST_CASE(14);
1237         TEST_CASE(15);
1238         TEST_CASE(16);
1239         TEST_CASE(17);
1240         TEST_CASE(18);
1241
1242 #ifdef OPENSSL_CPUID_OBJ
1243         {
1244         size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1245         union { u64 u; u8 c[1024]; } buf;
1246
1247         AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1248         CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1249         CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1250
1251         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1252         start = OPENSSL_rdtsc();
1253         CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1254         gcm_t = OPENSSL_rdtsc() - start;
1255
1256         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1257                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1258                         (block128_f)AES_encrypt);
1259         start = OPENSSL_rdtsc();
1260         CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1261                         &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1262                         (block128_f)AES_encrypt);
1263         ctr_t = OPENSSL_rdtsc() - start;
1264
1265         printf("%.2f-%.2f=%.2f\n",
1266                         gcm_t/(double)sizeof(buf),
1267                         ctr_t/(double)sizeof(buf),
1268                         (gcm_t-ctr_t)/(double)sizeof(buf));
1269 #ifdef GHASH
1270         GHASH(buf.c,sizeof(buf),&ctx);
1271         start = OPENSSL_rdtsc();
1272         GHASH(buf.c,sizeof(buf),&ctx);
1273         gcm_t = OPENSSL_rdtsc() - start;
1274         printf("%.2f\n",gcm_t/(double)sizeof(buf));
1275 #endif
1276         }
1277 #endif
1278
1279         return ret;
1280 }
1281 #endif