/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */
#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>     /* memcpy/memset/memcmp below */

typedef struct { u64 hi,lo; } u128;
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef  GETU32
#define GETU32(p)   BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
#endif
#define PACK(s)       ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V) do { \
    if (sizeof(size_t)==8) { \
        u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
        V.lo  = (V.hi<<63)|(V.lo>>1); \
        V.hi  = (V.hi>>1 )^T; \
    } \
    else { \
        u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
        V.lo  = (V.hi<<63)|(V.lo>>1); \
        V.hi  = (V.hi>>1 )^((u64)T<<32); \
    } \
} while(0)
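/*
 * Illustrative sketch only, not part of the original file: REDUCE1BIT
 * divides the field element V by x in GCM's bit-reversed representation
 * of GF(2^128), folding the bit shifted out of V.lo back in with the
 * reduction constant whose top byte is 0xE1. An equivalent plain
 * function, kept out of the build on purpose:
 */
#if 0
static void gcm_reduce1bit_ref(u128 *V)
{
    u64 mask = 0-(V->lo&1);          /* all-ones iff a bit falls off   */
    V->lo = (V->hi<<63)|(V->lo>>1);  /* 128-bit shift right by one bit */
    V->hi = (V->hi>>1)^(U64(0xe100000000000000)&mask); /* conditional fold */
}
#endif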
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that ciphertext is always available to an attacker, it's
 * possible for them to attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is nothing but trivial in CTR
 * mode]. In the "Shoup's" case it's not as trivial, but there is no
 * reason to believe that it's resistant to cache-timing attacks either.
 * And the thing about the "8-bit" implementation is that it consumes 16
 * (sixteen) times more memory, 4KB per individual key + 1KB shared. On
 * the pros side, it should be twice as fast as the "4-bit" version, and
 * for gcc-generated x86[_64] code the "8-bit" version was observed to
 * run ~75% faster, closer to 100% for commercial compilers... Yet the
 * "4-bit" procedure is preferred, because it's believed to provide a
 * better security-performance balance and adequate all-round
 * performance. "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a consequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the same
 *   thread in a Hyper-Threading world);
 */
#define TABLE_BITS 4

#if     TABLE_BITS==8
static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int  i, j;
    u128 V;

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

    for (Htable[128]=V, i=64; i>0; i>>=1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i=2; i<256; i<<=1) {
        u128 *Hi = Htable+i, H0 = *Hi;
        for (j=1; j<i; ++j) {
            Hi[j].hi = H0.hi^Htable[j].hi;
            Hi[j].lo = H0.lo^Htable[j].lo;
        }
    }
}
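/*
 * Note (added commentary): after gcm_init_8bit, entry i holds H
 * multiplied by the byte value i, with i's bits read in GCM's reflected
 * order: the powers of x come from the REDUCE1BIT halving loop starting
 * at Htable[128] = H, and the remaining entries follow by linearity of
 * the field multiplication.
 */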
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi+15;
    size_t rem, n = *xi;
    const union { long one; char little; } is_endian = {1};
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi==xi)   break;

        n = *(--xi);

        rem  = (size_t)Z.lo&0xff;
        Z.lo = (Z.hi<<56)|(Z.lo>>8);
        Z.hi = (Z.hi>>8);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem]<<32;
    }

    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32); PUTU32(p,v);
        v = (u32)(Z.hi);     PUTU32(p+4,v);
        v = (u32)(Z.lo>>32); PUTU32(p+8,v);
        v = (u32)(Z.lo);     PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif   TABLE_BITS==4
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
    int  i;
#endif

    Htable[0].hi = 0;
    Htable[0].lo = 0;
    V.hi = H[0];
    V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8]=V, i=4; i>0; i>>=1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i=2; i<16; i<<=1) {
        u128 *Hi = Htable+i;
        int   j;
        for (V=*Hi, j=1; j<i; ++j) {
            Hi[j].hi = V.hi^Htable[j].hi;
            Hi[j].lo = V.lo^Htable[j].lo;
        }
    }
#else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi  = V.hi^Htable[2].hi,  Htable[3].lo  = V.lo^Htable[2].lo;
    V=Htable[4];
    Htable[5].hi  = V.hi^Htable[1].hi,  Htable[5].lo  = V.lo^Htable[1].lo;
    Htable[6].hi  = V.hi^Htable[2].hi,  Htable[6].lo  = V.lo^Htable[2].lo;
    Htable[7].hi  = V.hi^Htable[3].hi,  Htable[7].lo  = V.lo^Htable[3].lo;
    V=Htable[8];
    Htable[9].hi  = V.hi^Htable[1].hi,  Htable[9].lo  = V.lo^Htable[1].lo;
    Htable[10].hi = V.hi^Htable[2].hi,  Htable[10].lo = V.lo^Htable[2].lo;
    Htable[11].hi = V.hi^Htable[3].hi,  Htable[11].lo = V.lo^Htable[3].lo;
    Htable[12].hi = V.hi^Htable[4].hi,  Htable[12].lo = V.lo^Htable[4].lo;
    Htable[13].hi = V.hi^Htable[5].hi,  Htable[13].lo = V.lo^Htable[5].lo;
    Htable[14].hi = V.hi^Htable[6].hi,  Htable[14].lo = V.lo^Htable[6].lo;
    Htable[15].hi = V.hi^Htable[7].hi,  Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
    int j;
    const union { long one; char little; } is_endian = {1};

    if (is_endian.little)
        for (j=0;j<16;++j) {
            V = Htable[j];
            Htable[j].hi = V.lo;
            Htable[j].lo = V.hi;
        }
    else
        for (j=0;j<16;++j) {
            V = Htable[j];
            Htable[j].hi = V.lo<<32|V.lo>>32;
            Htable[j].lo = V.hi<<32|V.hi>>32;
        }
    }
#endif
}
#ifndef GHASH_ASM
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
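/*
 * Illustrative sketch only, not part of the original file: each entry of
 * rem_4bit is the 16-bit correction for the 4 bits shifted out of Z.lo,
 * i.e. the XOR of 0xE100>>(3-b) over the set bits b of the index. The
 * table above could be regenerated like this:
 */
#if 0
static void gen_rem_4bit(size_t tbl[16])
{
    int r, b;
    for (r=0; r<16; ++r) {
        unsigned int e = 0;
        for (b=0; b<4; ++b)
            if (r&(1<<b)) e ^= 0xE100>>(3-b);
        tbl[r] = PACK(e);   /* PACK moves it into the top 16 bits */
    }
}
#endif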
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

    nlo  = ((const u8 *)Xi)[15];
    nhi  = nlo>>4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem  = (size_t)Z.lo&0xf;
        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem]<<32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt<0)    break;

        nlo  = ((const u8 *)Xi)[cnt];
        nhi  = nlo>>4;
        nlo &= 0xf;

        rem  = (size_t)Z.lo&0xf;
        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem]<<32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32); PUTU32(p,v);
        v = (u32)(Z.hi);     PUTU32(p+4,v);
        v = (u32)(Z.lo>>32); PUTU32(p+8,v);
        v = (u32)(Z.lo);     PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
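/*
 * Note (added commentary): gcm_gmult_4bit above walks Xi from its last
 * byte to its first, consuming the low nibble and then the high nibble
 * of each byte; every 4-bit step shifts Z right and folds the nibble
 * that falls off Z.lo back in through rem_4bit.
 */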
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem  = (size_t)Z.lo&0xf;
            Z.lo = (Z.hi<<60)|(Z.lo>>4);
            Z.hi = (Z.hi>>4);
            if (sizeof(size_t)==8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem]<<32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt<0)    break;

            nlo  = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi  = nlo>>4;
            nlo &= 0xf;

            rem  = (size_t)Z.lo&0xf;
            Z.lo = (Z.hi<<60)|(Z.lo>>4);
            Z.hi = (Z.hi>>4);
            if (sizeof(size_t)==8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem]<<32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down the procedure by
     * approximately the same time as it makes each loop spin faster.
     * In other words single block performance is approximately the same
     * as for the straightforward "4-bit" implementation, and then it
     * goes only faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }

    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
            nlo  = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi  = nlo>>4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo&0xff;

            Z.lo = (Z.hi<<56)|(Z.lo>>8);
            Z.hi = (Z.hi>>8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif

        if (is_endian.little) {
#ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi>>32); PUTU32(p,v);
            v = (u32)(Z.hi);     PUTU32(p+4,v);
            v = (u32)(Z.lo>>32); PUTU32(p+8,v);
            v = (u32)(Z.lo);     PUTU32(p+12,v);
#endif
        }
        else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
#endif
#else /* GHASH_ASM */
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/* GHASH_CHUNK is a "stride parameter" meant to mitigate the
 * cache-trashing effect. In other words the idea is to hash data while
 * it's still in L1 cache after the encryption pass... */
#define GHASH_CHUNK       (3*1024)
#endif
#else   /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
    u128 V,Z = { 0,0 };
    long X;
    int  i,j;
    const long *xi = (const long *)Xi;
    const union { long one; char little; } is_endian = {1};

    V.hi = H[0];    /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j=0; j<16/sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long)==8) {
#ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
#else
                const u8 *p = (const u8 *)(xi+j);
                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
            }
            else {
                const u8 *p = (const u8 *)(xi+j);
                X = (long)GETU32(p);
            }
        }
        else
            X = xi[j];

        for (i=0; i<8*sizeof(long); ++i, X<<=1) {
            u64 M = (u64)(X>>(8*sizeof(long)-1));
            Z.hi ^= V.hi&M;
            Z.lo ^= V.lo&M;

            REDUCE1BIT(V);
        }
    }
    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32); PUTU32(p,v);
        v = (u32)(Z.hi);     PUTU32(p+4,v);
        v = (u32)(Z.lo>>32); PUTU32(p+8,v);
        v = (u32)(Z.lo);     PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif  /* TABLE_BITS */
struct gcm128_context {
    /* Following 6 names follow names in GCM specification */
    union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi, EKi, EK0, len, Xi, H;
    /* Pre-computed table used by gcm_gmult_* */
    u128 Htable[16];
    void (*gmult)(u64 Xi[2],const u128 Htable[16]);
    void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    unsigned int mres, ares;
    block128_f block;
    void *key;
};
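/*
 * Note (added commentary): per the GCM specification, Yi is the current
 * counter block, EKi its encryption E(K,Yi) used as keystream, EK0 the
 * encrypted initial counter that masks the tag, Xi the running GHASH
 * value, H the hash key E(K,0^128), and len holds the AAD and message
 * lengths in bytes (converted to bit lengths in CRYPTO_gcm128_finish).
 * mres/ares track partially filled message/AAD blocks between calls.
 */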
#if     TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
        (defined(__i386)   || defined(__i386__)   || \
         defined(__x86_64) || defined(__x86_64__) || \
         defined(_M_IX86)  || defined(_M_AMD64)   || defined(_M_X64))
# define GHASH_ASM_IAX
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

# if    defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif

# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
# undef  GHASH
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
    const union { long one; char little; } is_endian = {1};

    memset(ctx,0,sizeof(*ctx));
    ctx->block = block;
    ctx->key   = key;

    (*block)(ctx->H.c,ctx->H.c,key);    /* H = E(K, 0^128) */

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi,lo;
        hi = (u64)GETU32(p)  <<32|GETU32(p+4);
        lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }

#if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif   TABLE_BITS==4
# if    defined(GHASH_ASM_IAX)          /* both x86 and x86_64 */
    if (OPENSSL_ia32cap_P[1]&(1<<1)) {  /* PCLMULQDQ bit */
        gcm_init_clmul(ctx->Htable,ctx->H.u);
        ctx->gmult = gcm_gmult_clmul;
        ctx->ghash = gcm_ghash_clmul;
        return;
    }
    gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if   defined(GHASH_ASM_X86)          /* x86 only */
    if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* MMX bit */
        ctx->gmult = gcm_gmult_4bit_mmx;
        ctx->ghash = gcm_ghash_4bit_mmx;
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        ctx->ghash = gcm_ghash_4bit_x86;
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    ctx->ghash = gcm_ghash_4bit;
#  endif
# else
    gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
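/*
 * Note (added commentary): CRYPTO_gcm128_setiv below derives the initial
 * counter block Y_0 as per the GCM specification: a 96-bit IV is used
 * directly as IV||0^31||1, while any other IV length is GHASHed together
 * with its bit length.
 */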
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int ctr;

    ctx->Yi.u[0]  = 0;
    ctx->Yi.u[1]  = 0;
    ctx->Xi.u[0]  = 0;
    ctx->Xi.u[1]  = 0;
    ctx->len.u[0] = 0;  /* AAD length */
    ctx->len.u[1] = 0;  /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len==12) {
        memcpy(ctx->Yi.c,iv,12);
        ctx->Yi.c[15]=1;
        ctr=1;
    }
    else {
        size_t i;
        u64 len0 = len;

        while (len>=16) {
            for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
            GCM_MUL(ctx,Yi);
            iv  += 16;
            len -= 16;
        }
        if (len) {
            for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
            GCM_MUL(ctx,Yi);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
            ctx->Yi.c[8]  ^= (u8)(len0>>56);
            ctx->Yi.c[9]  ^= (u8)(len0>>48);
            ctx->Yi.c[10] ^= (u8)(len0>>40);
            ctx->Yi.c[11] ^= (u8)(len0>>32);
            ctx->Yi.c[12] ^= (u8)(len0>>24);
            ctx->Yi.c[13] ^= (u8)(len0>>16);
            ctx->Yi.c[14] ^= (u8)(len0>>8);
            ctx->Yi.c[15] ^= (u8)(len0);
#endif
        }
        else
            ctx->Yi.u[1]  ^= len0;

        GCM_MUL(ctx,Yi);

        if (is_endian.little)
            ctr = GETU32(ctx->Yi.c+12);
        else
            ctr = ctx->Yi.d[3];
    }

    (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
    ++ctr;
    if (is_endian.little)
        PUTU32(ctx->Yi.c+12,ctr);
    else
        ctx->Yi.d[3] = ctr;
}
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];

    if (ctx->len.u[1]) return -2;

    alen += len;
    if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL(ctx,Xi);
        else {
            ctx->ares = n;
            return 0;
        }
    }

#ifdef GHASH
    if ((i = (len&(size_t)-16))) {
        GHASH(ctx,aad,i);
        aad += i;
        len -= i;
    }
#else
    while (len>=16) {
        for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx,Xi);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
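/*
 * Note (added commentary): all AAD must be supplied before the first
 * encrypt/decrypt call (hence the "return -2" above once a message
 * length is set); ctx->ares carries the byte offset of a partially
 * filled GHASH block between CRYPTO_gcm128_aad calls.
 */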
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

#if 0
    n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16%sizeof(size_t) == 0) do {    /* always true actually */
        if (n) {
            while (n && len) {
                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                --len;
                n = (n+1)%16;
            }
            if (n==0) GCM_MUL(ctx,Xi);
            else {
                ctx->mres = n;
                return 0;
            }
        }
#if defined(STRICT_ALIGNMENT)
        if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
            break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
        while (len>=GHASH_CHUNK) {
            size_t j=GHASH_CHUNK;

            while (j) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                j   -= 16;
            }
            GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
            len -= GHASH_CHUNK;
        }
        if ((i = (len&(size_t)-16))) {
            size_t j=i;

            while (len>=16) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                len -= 16;
            }
            GHASH(ctx,out-j,j);
        }
#else
        while (len>=16) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            for (i=0; i<16; i+=sizeof(size_t))
                *(size_t *)(ctx->Xi.c+i) ^=
                *(size_t *)(out+i) =
                *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
            GCM_MUL(ctx,Xi);
            out += 16;
            in  += 16;
            len -= 16;
        }
#endif
        if (len) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            while (len--) {
                ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                ++n;
            }
        }

        ctx->mres = n;
        return 0;
    } while(0);
#endif
    for (i=0;i<len;++i) {
        if (n==0) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
        }
        ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
        n = (n+1)%16;
        if (n==0)
            GCM_MUL(ctx,Xi);
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16%sizeof(size_t) == 0) do {    /* always true actually */
        if (n) {
            while (n && len) {
                u8 c = *(in++);
                *(out++) = c^ctx->EKi.c[n];
                ctx->Xi.c[n] ^= c;
                --len;
                n = (n+1)%16;
            }
            if (n==0) GCM_MUL (ctx,Xi);
            else {
                ctx->mres = n;
                return 0;
            }
        }
#if defined(STRICT_ALIGNMENT)
        if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
            break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
        while (len>=GHASH_CHUNK) {
            size_t j=GHASH_CHUNK;

            GHASH(ctx,in,GHASH_CHUNK);
            while (j) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                j   -= 16;
            }
            len -= GHASH_CHUNK;
        }
        if ((i = (len&(size_t)-16))) {
            GHASH(ctx,in,i);
            while (len>=16) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                len -= 16;
            }
        }
#else
        while (len>=16) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            for (i=0; i<16; i+=sizeof(size_t)) {
                size_t c = *(size_t *)(in+i);
                *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
                *(size_t *)(ctx->Xi.c+i) ^= c;
            }
            GCM_MUL(ctx,Xi);
            out += 16;
            in  += 16;
            len -= 16;
        }
#endif
        if (len) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            while (len--) {
                u8 c = in[n];
                ctx->Xi.c[n] ^= c;
                out[n] = c^ctx->EKi.c[n];
                ++n;
            }
        }

        ctx->mres = n;
        return 0;
    } while(0);
#endif
    for (i=0;i<len;++i) {
        u8 c;
        if (n==0) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
        }
        c = in[i];
        out[i] = c^ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        n = (n+1)%16;
        if (n==0)
            GCM_MUL(ctx,Xi);
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len, ctr128_f stream)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL(ctx,Xi);
        else {
            ctx->mres = n;
            return 0;
        }
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    while (len>=GHASH_CHUNK) {
        (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
        ctr += GHASH_CHUNK/16;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx,out,GHASH_CHUNK);
        out += GHASH_CHUNK;
        in  += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#endif
    if ((i = (len&(size_t)-16))) {
        size_t j=i/16;

        (*stream)(in,out,j,ctx->key,ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        in  += i;
        len -= i;
#if defined(GHASH)
        GHASH(ctx,out,i);
        out += i;
#else
        while (j--) {
            for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx,Xi);
            out += 16;
        }
#endif
    }
    if (len) {
        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
        ++ctr;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
            ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
            ++n;
        }
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len,ctr128_f stream)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
    if (n) {
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c^ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL (ctx,Xi);
        else {
            ctx->mres = n;
            return 0;
        }
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    while (len>=GHASH_CHUNK) {
        GHASH(ctx,in,GHASH_CHUNK);
        (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
        ctr += GHASH_CHUNK/16;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in  += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#endif
    if ((i = (len&(size_t)-16))) {
        size_t j=i/16;

#if defined(GHASH)
        GHASH(ctx,in,i);
#else
        while (j--) {
            size_t k;
            for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx,Xi);
            in += 16;
        }
        j   = i/16;
        in -= i;
#endif
        (*stream)(in,out,j,ctx->key,ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in  += i;
        len -= i;
    }
    if (len) {
        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
        ++ctr;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
            u8 c = in[n];
            ctx->Xi.c[n] ^= c;
            out[n] = c^ctx->EKi.c[n];
            ++n;
        }
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
            size_t len)
{
    const union { long one; char little; } is_endian = {1};
    u64 alen = ctx->len.u[0]<<3;
    u64 clen = ctx->len.u[1]<<3;

    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx,Xi);

    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p)  <<32|GETU32(p+4);
        clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
    }

    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx,Xi);

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len<=sizeof(ctx->Xi))
        return memcmp(ctx->Xi.c,tag,len);
    else
        return -1;
}
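/*
 * Note (added commentary): after the final GCM_MUL above, Xi holds
 * GHASH(AAD||C||lengths) XORed with E(K,Y_0), i.e. the authentication
 * tag. CRYPTO_gcm128_finish returns 0 iff the supplied tag matches;
 * callers that only want the tag bytes use CRYPTO_gcm128_tag below.
 */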
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
        CRYPTO_gcm128_init(ret,key,block);

    return ret;
}
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    if (ctx) {
        OPENSSL_cleanse(ctx,sizeof(*ctx));
        OPENSSL_free(ctx);
    }
}
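/*
 * Minimal usage sketch (illustrative only, not part of the original
 * file): one-shot AES-128-GCM encryption driven through the API above.
 * Assumes <openssl/aes.h>; error handling is omitted. Kept out of the
 * build on purpose.
 */
#if 0
#include <openssl/aes.h>

static void aes_gcm_seal_example(const unsigned char key[16],
                                 const unsigned char iv[12],
                                 const unsigned char *aad, size_t aad_len,
                                 const unsigned char *pt, size_t pt_len,
                                 unsigned char *ct, unsigned char tag[16])
{
    AES_KEY aes;
    GCM128_CONTEXT gcm;

    AES_set_encrypt_key(key,128,&aes);
    CRYPTO_gcm128_init(&gcm,&aes,(block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&gcm,iv,12);
    CRYPTO_gcm128_aad(&gcm,aad,aad_len);        /* AAD first            */
    CRYPTO_gcm128_encrypt(&gcm,pt,ct,pt_len);   /* then the payload     */
    CRYPTO_gcm128_tag(&gcm,tag,16);             /* 16-byte tag          */
}
#endif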
#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>
/* Test Case 1 */
static const u8 K1[16],
        *P1=NULL,
        *A1=NULL,
        IV1[12],
        *C1=NULL,
        T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};

/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8 P2[16],
        C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
        T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};

/* Test Case 3 */
#define A3 A2
static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
        P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
               0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
               0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
               0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
        T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
               0xab,0xad,0xda,0xd2},
        C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
               0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
               0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
               0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
        T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};

/* Test Case 5 */
#define K5 K4
#define P5 P4
static const u8 A5[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
               0xab,0xad,0xda,0xd2},
        IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
               0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
               0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
               0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
        T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
               0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
               0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
               0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
               0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
               0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
               0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
        T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};

/* Test Case 7 */
static const u8 K7[24],
        *P7=NULL,
        *A7=NULL,
        IV7[12],
        *C7=NULL,
        T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};

/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
        C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
        T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
/* Test Case 9 */
#define A9 A8
static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
               0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
        P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
               0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
               0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
               0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
        T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};

/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
               0xab,0xad,0xda,0xd2},
        C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
               0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
               0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
               0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
        T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};

/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
               0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
               0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
               0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
        T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
               0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
               0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
               0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
               0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
               0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
               0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
        T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};

/* Test Case 13 */
static const u8 K13[32],
        *P13=NULL,
        *A13=NULL,
        IV13[12],
        *C13=NULL,
        T13[]= {0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};

/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8 P14[16],
        IV14[12],
        C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
        T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/* Test Case 15 */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
               0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
        P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
               0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
               0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
               0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
        T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};

/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
               0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
               0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
               0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
               0xab,0xad,0xda,0xd2},
        C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
               0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
               0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
               0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
        T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
               0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
               0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
               0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
        T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};

/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
               0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
               0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
               0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
               0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
               0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
               0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
        T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
#define TEST_CASE(n) do {                                       \
        u8 out[sizeof(P##n)];                                   \
        AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);          \
        CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (C##n && memcmp(out,C##n,sizeof(out))))             \
                ret++, printf ("encrypt test#%d failed.\n",n);  \
        CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));          \
        memset(out,0,sizeof(out));                              \
        if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));    \
        if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
        if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||               \
            (P##n && memcmp(out,P##n,sizeof(out))))             \
                ret++, printf ("decrypt test#%d failed.\n",n);  \
        } while(0)

int main()
{
    GCM128_CONTEXT ctx;
    AES_KEY key;
    int ret=0;

    TEST_CASE(1);
    TEST_CASE(2);
    TEST_CASE(3);
    TEST_CASE(4);
    TEST_CASE(5);
    TEST_CASE(6);
    TEST_CASE(7);
    TEST_CASE(8);
    TEST_CASE(9);
    TEST_CASE(10);
    TEST_CASE(11);
    TEST_CASE(12);
    TEST_CASE(13);
    TEST_CASE(14);
    TEST_CASE(15);
    TEST_CASE(16);
    TEST_CASE(17);
    TEST_CASE(18);
#ifdef OPENSSL_CPUID_OBJ
    {
    size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
    union { u64 u; u8 c[1024]; } buf;
    int i;

    AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
    CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));

    CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
    start = OPENSSL_rdtsc();
    CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
    gcm_t = OPENSSL_rdtsc() - start;

    CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
            &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
            (block128_f)AES_encrypt);
    start = OPENSSL_rdtsc();
    CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
            &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
            (block128_f)AES_encrypt);
    ctr_t = OPENSSL_rdtsc() - start;

    printf("%.2f-%.2f=%.2f\n",
            gcm_t/(double)sizeof(buf),
            ctr_t/(double)sizeof(buf),
            (gcm_t-ctr_t)/(double)sizeof(buf));

#ifdef GHASH
    GHASH(&ctx,buf.c,sizeof(buf));
    start = OPENSSL_rdtsc();
    for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
    gcm_t = OPENSSL_rdtsc() - start;
    printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
#endif
    }
#endif

    return ret;
}
#endif /* SELFTEST */