1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
50 #include <openssl/crypto.h>
51 #include "modes_lcl.h"
/*
 * Core GHASH helper definitions.
 *  - u128 holds a 128-bit field element as two u64 halves (hi, lo).
 *  - GETU32/PUTU32 are redefined via BSWAP4 when STRICT_ALIGNMENT holds,
 *    per the comment below ("alignment is ensured").
 *  - PACK(s) places a 16-bit reduction constant into the top 16 bits of
 *    a size_t (shift by sizeof(size_t)*8-16).
 *  - REDUCE1BIT(V) performs one shift-right-and-reduce step: the 0xE1...
 *    constant is the GHASH reduction polynomial contribution, selected
 *    branch-free by masking with (0 - (V.lo & 1)).
 * NOTE(review): this listing is truncated -- embedded original line
 * numbers jump (76 -> 79), so the "} else {" split of REDUCE1BIT and its
 * closing "} while(0)" are missing here. Code left byte-identical.
 */
61 typedef struct { u64 hi,lo; } u128;
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
89 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
90 * never be set to 8. 8 is effectively reserved for testing purposes.
91 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
92 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
93 * whole spectrum of possible table driven implementations. Why? In
94 * non-"Shoup's" case memory access pattern is segmented in such manner,
95 * that it's trivial to see that cache timing information can reveal
96 * fair portion of intermediate hash value. Given that ciphertext is
97 * always available to attacker, it's possible for him to attempt to
98 * deduce secret parameter H and if successful, tamper with messages
99 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
100 * not as trivial, but there is no reason to believe that it's resistant
101 * to cache-timing attack. And the thing about "8-bit" implementation is
102 * that it consumes 16 (sixteen) times more memory, 4KB per individual
103 * key + 1KB shared. Well, on pros side it should be twice as fast as
104 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
105 * was observed to run ~75% faster, closer to 100% for commercial
106 * compilers... Yet "4-bit" procedure is preferred, because it's
107 * believed to provide better security-performance balance and adequate
108 * all-round performance. "All-round" refers to things like:
110 * - shorter setup time effectively improves overall timing for
111 * handling short messages;
112 * - larger table allocation can become unbearable because of VM
113 * subsystem penalties (for example on Windows large enough free
114 * results in VM working set trimming, meaning that consequent
115 * malloc would immediately incur working set expansion);
116 * - larger table has larger cache footprint, which can affect
117 * performance of other code paths (not necessarily even from same
118 * thread in Hyper-Threading world);
/*
 * Build the 256-entry (8-bit Shoup) multiplication table for H.
 * Htable[128] seeds the halving loop; the second loop fills the rest by
 * XOR-combining previously computed entries (Hi[j] = H0 ^ Htable[j]).
 * NOTE(review): truncated listing -- original lines 125-133 and 135-138
 * (declarations of V/i/j, the REDUCE1BIT halving body, closing braces)
 * are missing from this extract. Code left byte-identical.
 */
124 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
134 for (Htable[128]=V, i=64; i>0; i>>=1) {
139 for (i=2; i<256; i<<=1) {
140 u128 *Hi = Htable+i, H0 = *Hi;
141 for (j=1; j<i; ++j) {
142 Hi[j].hi = H0.hi^Htable[j].hi;
143 Hi[j].lo = H0.lo^Htable[j].lo;
/*
 * GHASH multiplication Xi = Xi * H using the 256-entry 8-bit table.
 * Processes Xi one byte at a time from xi (= last byte of Xi) backwards;
 * rem_8bit supplies the 16-bit reduction constants (pre-PACKed into the
 * top bits of a size_t) for the 8-bit shift-out.  On little-endian hosts
 * the result is byte-swapped back into Xi via BSWAP8, otherwise written
 * out 32 bits at a time with PUTU32.
 * NOTE(review): truncated listing -- the main per-byte loop header,
 * Z declaration, and several closing braces (original lines 149-150,
 * 219-220, 223-227, etc.) are missing here. Code left byte-identical.
 */
148 static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
151 const u8 *xi = (const u8 *)Xi+15;
153 const union { long one; char little; } is_endian = {1};
154 static const size_t rem_8bit[256] = {
155 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
156 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
157 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
158 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
159 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
160 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
161 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
162 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
163 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
164 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
165 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
166 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
167 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
168 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
169 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
170 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
171 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
172 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
173 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
174 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
175 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
176 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
177 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
178 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
179 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
180 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
181 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
182 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
183 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
184 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
185 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
186 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
187 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
188 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
189 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
190 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
191 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
192 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
193 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
194 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
195 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
196 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
197 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
198 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
199 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
200 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
201 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
202 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
203 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
204 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
205 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
206 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
207 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
208 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
209 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
210 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
211 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
212 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
213 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
214 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
215 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
216 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
217 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
218 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
221 Z.hi ^= Htable[n].hi;
222 Z.lo ^= Htable[n].lo;
224 if ((u8 *)Xi==xi) break;
228 rem = (size_t)Z.lo&0xff;
229 Z.lo = (Z.hi<<56)|(Z.lo>>8);
231 if (sizeof(size_t)==8)
232 Z.hi ^= rem_8bit[rem];
234 Z.hi ^= (u64)rem_8bit[rem]<<32;
237 if (is_endian.little) {
239 Xi[0] = BSWAP8(Z.hi);
240 Xi[1] = BSWAP8(Z.lo);
244 v = (u32)(Z.hi>>32); PUTU32(p,v);
245 v = (u32)(Z.hi); PUTU32(p+4,v);
246 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
247 v = (u32)(Z.lo); PUTU32(p+12,v);
/* GCM_MUL maps to the 8-bit routine when TABLE_BITS==8 (test-only mode). */
255 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
/*
 * Build the 16-entry (4-bit Shoup) multiplication table for H.
 * Two construction strategies: the OPENSSL_SMALL_FOOTPRINT path uses
 * loops (seeded at Htable[8]); the default path unrolls the XOR
 * combinations for entries 3..15 explicitly.  The trailing GHASH_ASM/ARM
 * section swaps the 32-bit halves of each entry because, per the
 * original comment, the ARM assembler expects a specific dword order.
 * NOTE(review): truncated listing -- declarations of V/i/j, the halving
 * loop bodies, entries [1],[2],[4],[8], and several braces are missing
 * (original line numbers jump 262->271, 282->293, ...). Byte-identical.
 */
259 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
262 #if defined(OPENSSL_SMALL_FOOTPRINT)
271 #if defined(OPENSSL_SMALL_FOOTPRINT)
272 for (Htable[8]=V, i=4; i>0; i>>=1) {
277 for (i=2; i<16; i<<=1) {
280 for (V=*Hi, j=1; j<i; ++j) {
281 Hi[j].hi = V.hi^Htable[j].hi;
282 Hi[j].lo = V.lo^Htable[j].lo;
293 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
295 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
296 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
297 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
299 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
300 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
301 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
302 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
303 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
304 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
305 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
307 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
309 * ARM assembler expects specific dword order in Htable.
313 const union { long one; char little; } is_endian = {1};
315 if (is_endian.little)
324 Htable[j].hi = V.lo<<32|V.lo>>32;
325 Htable[j].lo = V.hi<<32|V.hi>>32;
/*
 * Reduction constants for the 4-bit GHASH routines: rem_4bit[n] is the
 * polynomial correction for the nibble shifted out of Z, pre-positioned
 * in the top 16 bits of a size_t by PACK().
 */
332 static const size_t rem_4bit[16] = {
333 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
334 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
335 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
336 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
/*
 * GHASH multiplication Xi = Xi * H, 4-bit table variant.  Consumes Xi a
 * nibble at a time (low nibble nlo, high nibble nhi per byte), shifting
 * Z right by 4 each step and folding the shifted-out nibble back in via
 * rem_4bit.  The sizeof(size_t)==8 test selects whether the PACKed
 * constant already sits in the top of a 64-bit word or must be shifted
 * up by 32 on 32-bit hosts.  Result is written back to Xi in big-endian
 * byte order (BSWAP8 on little-endian hosts, PUTU32 otherwise).
 * NOTE(review): truncated listing -- Z declaration, loop headers, nhi
 * extraction lines, else-branches and braces are missing (line numbers
 * jump 343->345, 362->366, ...). Code left byte-identical.
 */
338 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
342 size_t rem, nlo, nhi;
343 const union { long one; char little; } is_endian = {1};
345 nlo = ((const u8 *)Xi)[15];
349 Z.hi = Htable[nlo].hi;
350 Z.lo = Htable[nlo].lo;
353 rem = (size_t)Z.lo&0xf;
354 Z.lo = (Z.hi<<60)|(Z.lo>>4);
356 if (sizeof(size_t)==8)
357 Z.hi ^= rem_4bit[rem];
359 Z.hi ^= (u64)rem_4bit[rem]<<32;
361 Z.hi ^= Htable[nhi].hi;
362 Z.lo ^= Htable[nhi].lo;
366 nlo = ((const u8 *)Xi)[cnt];
370 rem = (size_t)Z.lo&0xf;
371 Z.lo = (Z.hi<<60)|(Z.lo>>4);
373 if (sizeof(size_t)==8)
374 Z.hi ^= rem_4bit[rem];
376 Z.hi ^= (u64)rem_4bit[rem]<<32;
378 Z.hi ^= Htable[nlo].hi;
379 Z.lo ^= Htable[nlo].lo;
382 if (is_endian.little) {
384 Xi[0] = BSWAP8(Z.hi);
385 Xi[1] = BSWAP8(Z.lo);
389 v = (u32)(Z.hi>>32); PUTU32(p,v);
390 v = (u32)(Z.hi); PUTU32(p+4,v);
391 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
392 v = (u32)(Z.lo); PUTU32(p+12,v);
/*
 * Streamed GHASH over `len` bytes of `inp` (len assumed a multiple of
 * 16 by the callers' usage).  The non-small-footprint path precomputes
 * Hshr4/Hshl4 (Htable shifted right/left by 4 bits) so each input byte
 * can be folded with one 8-bit shift instead of two 4-bit ones, using
 * the unPACKed 16-bit rem_8bit table; the original comment notes the
 * preprocessing cost is recouped after roughly one block.
 * NOTE(review): truncated listing -- the function's opening brace,
 * Z/nhi declarations, the outer do/while over blocks, several XOR/index
 * lines and closing braces are missing (original line numbers jump
 * 415->420, 511->515, 549->552, ...). Code left byte-identical.
 */
401 #if !defined(OPENSSL_SMALL_FOOTPRINT)
403 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
404 * details... Compiler-generated code doesn't seem to give any
405 * performance improvement, at least not on x86[_64]. It's here
406 * mostly as reference and a placeholder for possible future
407 * non-trivial optimization[s]...
409 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
410 const u8 *inp,size_t len)
414 size_t rem, nlo, nhi;
415 const union { long one; char little; } is_endian = {1};
420 nlo = ((const u8 *)Xi)[15];
425 Z.hi = Htable[nlo].hi;
426 Z.lo = Htable[nlo].lo;
429 rem = (size_t)Z.lo&0xf;
430 Z.lo = (Z.hi<<60)|(Z.lo>>4);
432 if (sizeof(size_t)==8)
433 Z.hi ^= rem_4bit[rem];
435 Z.hi ^= (u64)rem_4bit[rem]<<32;
437 Z.hi ^= Htable[nhi].hi;
438 Z.lo ^= Htable[nhi].lo;
442 nlo = ((const u8 *)Xi)[cnt];
447 rem = (size_t)Z.lo&0xf;
448 Z.lo = (Z.hi<<60)|(Z.lo>>4);
450 if (sizeof(size_t)==8)
451 Z.hi ^= rem_4bit[rem];
453 Z.hi ^= (u64)rem_4bit[rem]<<32;
455 Z.hi ^= Htable[nlo].hi;
456 Z.lo ^= Htable[nlo].lo;
460 * Extra 256+16 bytes per-key plus 512 bytes shared tables
461 * [should] give ~50% improvement... One could have PACK()-ed
462 * the rem_8bit even here, but the priority is to minimize
465 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
466 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
467 static const unsigned short rem_8bit[256] = {
468 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
469 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
470 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
471 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
472 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
473 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
474 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
475 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
476 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
477 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
478 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
479 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
480 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
481 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
482 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
483 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
484 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
485 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
486 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
487 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
488 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
489 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
490 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
491 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
492 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
493 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
494 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
495 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
496 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
497 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
498 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
499 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
501 * This pre-processing phase slows down procedure by approximately
502 * same time as it makes each loop spin faster. In other words
503 * single block performance is approximately same as straightforward
504 * "4-bit" implementation, and then it goes only faster...
506 for (cnt=0; cnt<16; ++cnt) {
507 Z.hi = Htable[cnt].hi;
508 Z.lo = Htable[cnt].lo;
509 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
510 Hshr4[cnt].hi = (Z.hi>>4);
511 Hshl4[cnt] = (u8)(Z.lo<<4);
515 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
516 nlo = ((const u8 *)Xi)[cnt];
521 Z.hi ^= Htable[nlo].hi;
522 Z.lo ^= Htable[nlo].lo;
524 rem = (size_t)Z.lo&0xff;
526 Z.lo = (Z.hi<<56)|(Z.lo>>8);
529 Z.hi ^= Hshr4[nhi].hi;
530 Z.lo ^= Hshr4[nhi].lo;
531 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
534 nlo = ((const u8 *)Xi)[0];
539 Z.hi ^= Htable[nlo].hi;
540 Z.lo ^= Htable[nlo].lo;
542 rem = (size_t)Z.lo&0xf;
544 Z.lo = (Z.hi<<60)|(Z.lo>>4);
547 Z.hi ^= Htable[nhi].hi;
548 Z.lo ^= Htable[nhi].lo;
549 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
552 if (is_endian.little) {
554 Xi[0] = BSWAP8(Z.hi);
555 Xi[1] = BSWAP8(Z.lo);
559 v = (u32)(Z.hi>>32); PUTU32(p,v);
560 v = (u32)(Z.hi); PUTU32(p+4,v);
561 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
562 v = (u32)(Z.lo); PUTU32(p+12,v);
569 } while (inp+=16, len-=16);
/*
 * GHASH_ASM path: the 4-bit gmult/ghash routines are supplied by
 * assembler, so only prototypes appear here.  GCM_MUL/GHASH dispatch
 * to them; GHASH_CHUNK (1KB) bounds how much data is hashed per pass
 * so it is still warm in L1 after the cipher pass (see comment below).
 */
573 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
574 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
577 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
578 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
579 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
580 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
581 * trashing effect. In other words idea is to hash data while it's
582 * still in L1 cache after encryption pass... */
583 #define GHASH_CHUNK 1024
586 #else /* TABLE_BITS */
/*
 * Table-less bit-at-a-time GHASH multiplication (TABLE_BITS==1).
 * Walks Xi one `long` word at a time, byte-swapping on little-endian
 * hosts so X holds big-endian bits, then for each of the 8*sizeof(long)
 * bits conditionally XORs V into Z via the sign-extended mask M.
 * NOTE(review): truncated listing -- Z/V declarations, the V.lo load,
 * the conditional-XOR body of the inner loop, the REDUCE1BIT step and
 * several braces are missing (line numbers jump 596->599, 618->626).
 * Code left byte-identical.
 */
588 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
593 const long *xi = (const long *)Xi;
594 const union { long one; char little; } is_endian = {1};
596 V.hi = H[0]; /* H is in host byte order, no byte swapping */
599 for (j=0; j<16/sizeof(long); ++j) {
600 if (is_endian.little) {
601 if (sizeof(long)==8) {
603 X = (long)(BSWAP8(xi[j]));
605 const u8 *p = (const u8 *)(xi+j);
606 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
610 const u8 *p = (const u8 *)(xi+j);
617 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
618 u64 M = (u64)(X>>(8*sizeof(long)-1));
626 if (is_endian.little) {
628 Xi[0] = BSWAP8(Z.hi);
629 Xi[1] = BSWAP8(Z.lo);
633 v = (u32)(Z.hi>>32); PUTU32(p,v);
634 v = (u32)(Z.hi); PUTU32(p+4,v);
635 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
636 v = (u32)(Z.lo); PUTU32(p+12,v);
/* TABLE_BITS==1 dispatch: multiply directly by H (no precomputed table). */
644 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
/*
 * GCM state: Yi (counter block), EKi (encrypted counter), EK0 (E(K,Y0)
 * used to mask the tag) plus, per the trailing members, the gmult/ghash
 * function pointers selected at init time and res/pad bookkeeping.
 * NOTE(review): truncated listing -- the Xi/H/len members, the Htable
 * array after the "Pre-computed table" comment, the block/key members
 * and the closing brace are missing (line numbers jump 652->657).
 * Code left byte-identical.
 */
648 struct gcm128_context {
649 /* Following 6 names follow names in GCM specification */
650 union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
652 /* Pre-computed table used by gcm_gmult_* */
657 void (*gmult)(u64 Xi[2],const u128 Htable[16]);
658 void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
660 unsigned int res, pad;
/*
 * x86/x86_64 assembler glue (TABLE_BITS==4 + GHASH_ASM): prototypes for
 * the PCLMULQDQ ("clmul") and MMX/plain-x86 4-bit routines, selected at
 * runtime via OPENSSL_ia32cap_P in CRYPTO_gcm128_init below.  GCM_MUL
 * and GHASH are redefined to dispatch through the ctx function pointers.
 */
665 #if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
666 (defined(__i386) || defined(__i386__) || \
667 defined(__x86_64) || defined(__x86_64__) || \
668 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
669 # define GHASH_ASM_IAX
670 extern unsigned int OPENSSL_ia32cap_P[2];
672 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
673 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
674 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
676 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
677 # define GHASH_ASM_X86
678 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
679 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
681 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
682 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
686 # define GCM_MUL(ctx,Xi) (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
688 # define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * Initialise a GCM context: zero it, compute H = E(K, 0^128) via the
 * caller-supplied block cipher, convert H to host byte order (BSWAP8 on
 * little-endian when available, GETU32 assembly otherwise), then build
 * the multiplication table matching TABLE_BITS.  On x86/x86_64 the
 * CPUID bits in OPENSSL_ia32cap_P select clmul, MMX, or plain-x86
 * gmult/ghash implementations at runtime.
 * NOTE(review): truncated listing -- ctx->block/ctx->key assignment,
 * the #if TABLE_BITS arms' structure and closing braces are missing
 * (line numbers jump 695->699, 710->717, 737->740). Byte-identical.
 */
691 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
693 const union { long one; char little; } is_endian = {1};
695 memset(ctx,0,sizeof(*ctx));
699 (*block)(ctx->H.c,ctx->H.c,key);
701 if (is_endian.little) {
702 /* H is stored in host byte order */
704 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
705 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
709 hi = (u64)GETU32(p) <<32|GETU32(p+4);
710 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
717 gcm_init_8bit(ctx->Htable,ctx->H.u);
719 # if defined(GHASH_ASM_IAX) /* both x86 and x86_64 */
720 if (OPENSSL_ia32cap_P[1]&(1<<1)) {
721 gcm_init_clmul(ctx->Htable,ctx->H.u);
722 ctx->gmult = gcm_gmult_clmul;
723 ctx->ghash = gcm_ghash_clmul;
726 gcm_init_4bit(ctx->Htable,ctx->H.u);
727 # if defined(GHASH_ASM_X86) /* x86 only */
728 if (OPENSSL_ia32cap_P[0]&(1<<23)) {
729 ctx->gmult = gcm_gmult_4bit_mmx;
730 ctx->ghash = gcm_ghash_4bit_mmx;
732 ctx->gmult = gcm_gmult_4bit_x86;
733 ctx->ghash = gcm_ghash_4bit_x86;
736 ctx->gmult = gcm_gmult_4bit;
737 ctx->ghash = gcm_ghash_4bit;
740 gcm_init_4bit(ctx->Htable,ctx->H.u);
/*
 * Set the IV / derive Y0 and EK0.  A 96-bit IV is copied into Yi
 * directly (counter appended elsewhere); any other length is GHASHed
 * into Yi in 16-byte pieces, then the bit length len0 is folded in
 * (BSWAP8 fast path, else explicit byte-wise XOR of len0's bytes).
 * Finally EK0 = E(K, Y0) is computed and the 32-bit counter re-read/
 * re-written from Yi[12..15].
 * NOTE(review): truncated listing -- the reset of Xi/Yi/len, the
 * GCM_MUL calls after each XOR pass, and the ctr handling else-branches
 * are missing (line numbers jump 759->768, 798->803). Byte-identical.
 */
745 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
747 const union { long one; char little; } is_endian = {1};
759 memcpy(ctx->Yi.c,iv,12);
768 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
774 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
778 if (is_endian.little) {
780 ctx->Yi.u[1] ^= BSWAP8(len0);
782 ctx->Yi.c[8] ^= (u8)(len0>>56);
783 ctx->Yi.c[9] ^= (u8)(len0>>48);
784 ctx->Yi.c[10] ^= (u8)(len0>>40);
785 ctx->Yi.c[11] ^= (u8)(len0>>32);
786 ctx->Yi.c[12] ^= (u8)(len0>>24);
787 ctx->Yi.c[13] ^= (u8)(len0>>16);
788 ctx->Yi.c[14] ^= (u8)(len0>>8);
789 ctx->Yi.c[15] ^= (u8)(len0);
793 ctx->Yi.u[1] ^= len0;
797 if (is_endian.little)
798 ctr = GETU32(ctx->Yi.c+12);
803 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
805 if (is_endian.little)
806 PUTU32(ctx->Yi.c+12,ctr);
/*
 * Absorb additional authenticated data: bump len.u[0], XOR whole
 * 16-byte pieces (and a final partial piece) of aad into Xi.
 * NOTE(review): truncated listing -- the GCM_MUL calls after each XOR
 * pass and the loop over the 16-byte pieces are missing (line numbers
 * jump 818->825). Code left byte-identical.
 */
811 void CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
815 ctx->len.u[0] += len;
818 if ((i = (len&(size_t)-16))) {
825 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
832 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
/*
 * CTR-encrypt `in` to `out` while GHASHing the ciphertext into Xi.
 * Structure (as visible): drain any buffered partial block byte-wise;
 * then on the fast path process GHASH_CHUNK-sized runs (encrypt via
 * ctx->block per 16 bytes, word-wise XOR with EKi, then one GHASH call
 * over the whole chunk), then remaining whole blocks, then a byte-wise
 * tail.  STRICT_ALIGNMENT diverts misaligned in/out to the byte path.
 * The OPENSSL_SMALL_FOOTPRINT fallback does everything byte-wise.
 * NOTE(review): truncated listing -- loop headers, ++ctr steps, the
 * BSWAP4 else-branches after each is_endian test, GCM_MUL calls and
 * many braces are missing throughout. Also note the word-wise XOR
 * casts (size_t *)(in+i) -- presumably safe only because the aligned
 * path guards it; strict-aliasing/alignment caveat applies. Code left
 * byte-identical.
 */
837 void CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
838 const unsigned char *in, unsigned char *out,
841 const union { long one; char little; } is_endian = {1};
845 ctx->len.u[1] += len;
847 if (is_endian.little)
848 ctr = GETU32(ctx->Yi.c+12);
852 #if !defined(OPENSSL_SMALL_FOOTPRINT)
853 if (16%sizeof(size_t) == 0) do { /* always true actually */
856 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
860 if (n==0) GCM_MUL(ctx,Xi);
866 #if defined(STRICT_ALIGNMENT)
867 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
870 #if defined(GHASH) && defined(GHASH_CHUNK)
871 while (len>=GHASH_CHUNK) {
872 size_t j=GHASH_CHUNK;
875 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
877 if (is_endian.little)
878 PUTU32(ctx->Yi.c+12,ctr);
881 for (i=0; i<16; i+=sizeof(size_t))
883 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
888 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
891 if ((i = (len&(size_t)-16))) {
895 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
897 if (is_endian.little)
898 PUTU32(ctx->Yi.c+12,ctr);
901 for (i=0; i<16; i+=sizeof(size_t))
903 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
912 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
914 if (is_endian.little)
915 PUTU32(ctx->Yi.c+12,ctr);
918 for (i=0; i<16; i+=sizeof(size_t))
919 *(size_t *)(ctx->Xi.c+i) ^=
921 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
929 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
931 if (is_endian.little)
932 PUTU32(ctx->Yi.c+12,ctr);
936 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
945 for (i=0;i<len;++i) {
947 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
949 if (is_endian.little)
950 PUTU32(ctx->Yi.c+12,ctr);
954 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
/*
 * CTR-decrypt `in` to `out` while GHASHing the CIPHERTEXT into Xi --
 * note the order differs from encrypt: the fast path calls
 * GHASH(ctx,in,...) on the incoming ciphertext BEFORE decrypting, and
 * the word-wise path XORs the raw ciphertext word `c` into Xi.
 * Otherwise the structure mirrors CRYPTO_gcm128_encrypt: buffered
 * partial block, GHASH_CHUNK runs, remaining whole blocks, byte tail,
 * with a byte-wise OPENSSL_SMALL_FOOTPRINT fallback.
 * NOTE(review): truncated listing -- loop headers, ++ctr, the Xi XOR of
 * each ciphertext byte in the byte paths, GCM_MUL calls and braces are
 * missing (line numbers jump 983->988, 1047->1056, ...). Same
 * (size_t *) alignment/aliasing caveat as in encrypt. Byte-identical.
 */
963 void CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
964 const unsigned char *in, unsigned char *out,
967 const union { long one; char little; } is_endian = {1};
971 ctx->len.u[1] += len;
973 if (is_endian.little)
974 ctr = GETU32(ctx->Yi.c+12);
978 #if !defined(OPENSSL_SMALL_FOOTPRINT)
979 if (16%sizeof(size_t) == 0) do { /* always true actually */
983 *(out++) = c^ctx->EKi.c[n];
988 if (n==0) GCM_MUL (ctx,Xi);
994 #if defined(STRICT_ALIGNMENT)
995 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
998 #if defined(GHASH) && defined(GHASH_CHUNK)
999 while (len>=GHASH_CHUNK) {
1000 size_t j=GHASH_CHUNK;
1002 GHASH(ctx,in,GHASH_CHUNK);
1004 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1006 if (is_endian.little)
1007 PUTU32(ctx->Yi.c+12,ctr);
1010 for (i=0; i<16; i+=sizeof(size_t))
1011 *(size_t *)(out+i) =
1012 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1019 if ((i = (len&(size_t)-16))) {
1022 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1024 if (is_endian.little)
1025 PUTU32(ctx->Yi.c+12,ctr);
1028 for (i=0; i<16; i+=sizeof(size_t))
1029 *(size_t *)(out+i) =
1030 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1038 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1040 if (is_endian.little)
1041 PUTU32(ctx->Yi.c+12,ctr);
1044 for (i=0; i<16; i+=sizeof(size_t)) {
1045 size_t c = *(size_t *)(in+i);
1046 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1047 *(size_t *)(ctx->Xi.c+i) ^= c;
1056 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1058 if (is_endian.little)
1059 PUTU32(ctx->Yi.c+12,ctr);
1065 out[n] = c^ctx->EKi.c[n];
1074 for (i=0;i<len;++i) {
1077 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1079 if (is_endian.little)
1080 PUTU32(ctx->Yi.c+12,ctr);
1085 out[i] ^= ctx->EKi.c[n];
/*
 * CTR-encrypt using a caller-supplied multi-block `stream` cipher
 * (ctr128_f) instead of per-block calls: drain buffered bytes, process
 * GHASH_CHUNK/16 counter blocks per stream call and GHASH the produced
 * ciphertext, handle remaining whole blocks (Xi XORed with out[] after
 * the stream call when no GHASH macro), then a byte-wise tail via
 * ctx->block.
 * NOTE(review): the name "crt32" looks like a typo for "ctr32" -- the
 * companion below is CRYPTO_gcm128_decrypt_ctr32; confirm against the
 * public header before renaming, since this is an exported symbol.
 * NOTE(review): truncated listing -- loop headers, ++ctr, GCM_MUL and
 * several braces are missing (line numbers jump 1112->1116, 1142->1152).
 * Code left byte-identical.
 */
1095 void CRYPTO_gcm128_encrypt_crt32(GCM128_CONTEXT *ctx,
1096 const unsigned char *in, unsigned char *out,
1097 size_t len, ctr128_f stream)
1099 const union { long one; char little; } is_endian = {1};
1100 unsigned int n, ctr;
1103 ctx->len.u[1] += len;
1105 if (is_endian.little)
1106 ctr = GETU32(ctx->Yi.c+12);
1112 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1116 if (n==0) GCM_MUL(ctx,Xi);
1122 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1123 while (len>=GHASH_CHUNK) {
1124 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1125 ctr += GHASH_CHUNK/16;
1126 if (is_endian.little)
1127 PUTU32(ctx->Yi.c+12,ctr);
1130 GHASH(ctx,out,GHASH_CHUNK);
1136 if ((i = (len&(size_t)-16))) {
1139 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1141 if (is_endian.little)
1142 PUTU32(ctx->Yi.c+12,ctr);
1152 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1159 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1161 if (is_endian.little)
1162 PUTU32(ctx->Yi.c+12,ctr);
1166 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
/*
 * CTR-decrypt using a caller-supplied multi-block `stream` cipher.
 * Mirrors CRYPTO_gcm128_encrypt_crt32 with the GHASH order reversed:
 * the incoming CIPHERTEXT is hashed (GHASH(ctx,in,...) on the chunk
 * path, byte-wise Xi ^= in[k] on the block path) before the stream
 * call overwrites/decrypts it into out.  Byte-wise tail via ctx->block.
 * NOTE(review): truncated listing -- loop headers, ++ctr, GCM_MUL,
 * inner j/k loop structure and braces are missing (line numbers jump
 * 1192->1197, 1209->1217, 1235->1243). Code left byte-identical.
 */
1174 void CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1175 const unsigned char *in, unsigned char *out,
1176 size_t len,ctr128_f stream)
1178 const union { long one; char little; } is_endian = {1};
1179 unsigned int n, ctr;
1182 ctx->len.u[1] += len;
1184 if (is_endian.little)
1185 ctr = GETU32(ctx->Yi.c+12);
1192 *(out++) = c^ctx->EKi.c[n];
1197 if (n==0) GCM_MUL (ctx,Xi);
1203 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1204 while (len>=GHASH_CHUNK) {
1205 GHASH(ctx,in,GHASH_CHUNK);
1206 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1207 ctr += GHASH_CHUNK/16;
1208 if (is_endian.little)
1209 PUTU32(ctx->Yi.c+12,ctr);
1217 if ((i = (len&(size_t)-16))) {
1225 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1232 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1234 if (is_endian.little)
1235 PUTU32(ctx->Yi.c+12,ctr);
1243 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1245 if (is_endian.little)
1246 PUTU32(ctx->Yi.c+12,ctr);
1252 out[n] = c^ctx->EKi.c[n];
/*
 * Finalise the tag: fold the bit lengths (alen = AAD bits, clen =
 * ciphertext bits) into Xi in big-endian form (BSWAP8 fast path, GETU32
 * otherwise), then mask with EK0.  Returns the memcmp of Xi against the
 * caller's tag when one is supplied and fits in Xi.
 * NOTE(review): memcmp is not constant-time; tag comparison should use
 * a constant-time compare (e.g. CRYPTO_memcmp) -- flagged, not changed.
 * NOTE(review): truncated listing -- the GCM_MUL flushing any buffered
 * state, the len.u scratch usage around lines 1274-1284, and the
 * non-matching-tag return are missing. Code left byte-identical.
 */
1260 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1263 const union { long one; char little; } is_endian = {1};
1264 u64 alen = ctx->len.u[0]<<3;
1265 u64 clen = ctx->len.u[1]<<3;
1270 if (is_endian.little) {
1272 alen = BSWAP8(alen);
1273 clen = BSWAP8(clen);
1277 ctx->len.u[0] = alen;
1278 ctx->len.u[1] = clen;
1280 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1281 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1285 ctx->Xi.u[0] ^= alen;
1286 ctx->Xi.u[1] ^= clen;
1289 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1290 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1292 if (tag && len<=sizeof(ctx->Xi))
1293 return memcmp(ctx->Xi.c,tag,len);
/*
 * Allocate and initialise a fresh GCM128_CONTEXT; returns the context
 * or (implicitly, when OPENSSL_malloc fails) an uninitialised/NULL ret.
 * NOTE(review): the `return ret;` line is missing from this truncated
 * listing (line numbers jump 1303 -> 1308). Code left byte-identical.
 */
1298 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1300 GCM128_CONTEXT *ret;
1302 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1303 CRYPTO_gcm128_init(ret,key,block);
/*
 * Scrub the context with OPENSSL_cleanse (so key material is not left
 * in freed memory) before release.  NOTE(review): the OPENSSL_free call
 * is missing from this truncated listing (lines 1309-1310, 1312+ absent).
 */
1308 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1311 OPENSSL_cleanse(ctx,sizeof(*ctx));
1316 #if defined(SELFTEST)
1318 #include <openssl/aes.h>
1321 static const u8 K1[16],
1326 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1332 static const u8 P2[16],
1333 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1334 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1338 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1339 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1340 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1341 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1342 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1343 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1344 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1345 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1346 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1347 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1348 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1353 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1354 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1355 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1356 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1357 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1358 0xab,0xad,0xda,0xd2},
1359 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1360 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1361 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1362 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1363 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1368 static const u8 A5[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1369 0xab,0xad,0xda,0xd2},
1370 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1371 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1372 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1373 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1374 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1375 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
/*
 * GCM self-test vector #6, consumed by the TEST_CASE(n) macro below:
 * IV6 = 60-byte IV (exercises the arbitrary-length, non-96-bit IV path),
 * C6 = expected ciphertext, T6 = expected 16-byte authentication tag.
 * The key/plaintext/AAD arrays for this case are declared elsewhere in the file.
 */
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1392 static const u8 K7[24],
1397 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
/*
 * GCM self-test vector #8, consumed by the TEST_CASE(n) macro below:
 * P8 = 16 zero bytes (static object with no initializer is zero-filled),
 * C8 = expected ciphertext, T8 = expected 16-byte authentication tag.
 * The key/IV arrays for this case are declared elsewhere in the file.
 */
static const u8 P8[16],
C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
/*
 * GCM self-test vector #9, consumed by the TEST_CASE(n) macro below:
 * K9 = 24-byte key (the macro passes sizeof(K9)*8 = 192 bits to
 * AES_set_encrypt_key, i.e. AES-192), P9 = plaintext, IV9 = 12-byte
 * (96-bit, the GCM default-length) IV, C9 = expected ciphertext,
 * T9 = expected 16-byte authentication tag.
 */
static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/*
 * GCM self-test vector #10, consumed by the TEST_CASE(n) macro below:
 * P10 = plaintext, A10 = AAD, C10 = expected ciphertext,
 * T10 = expected 16-byte authentication tag.
 * The key/IV arrays for this case are declared elsewhere in the file.
 */
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
0xab,0xad,0xda,0xd2},
C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/*
 * GCM self-test vector #11, consumed by the TEST_CASE(n) macro below:
 * IV11 = 8-byte (64-bit, non-default-length) IV, C11 = expected
 * ciphertext, T11 = expected 16-byte authentication tag.
 * The key/plaintext/AAD arrays for this case are declared elsewhere in the file.
 */
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/*
 * GCM self-test vector #12, consumed by the TEST_CASE(n) macro below:
 * IV12 = 60-byte IV (exercises the arbitrary-length, non-96-bit IV path),
 * C12 = expected ciphertext, T12 = expected 16-byte authentication tag.
 * The key/plaintext/AAD arrays for this case are declared elsewhere in the file.
 */
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1463 static const u8 K13[32],
1468 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1473 static const u8 P14[16],
1475 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1476 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/*
 * GCM self-test vector #15, consumed by the TEST_CASE(n) macro below:
 * K15 = 32-byte key (the macro passes sizeof(K15)*8 = 256 bits to
 * AES_set_encrypt_key, i.e. AES-256), P15 = plaintext, IV15 = 12-byte
 * (96-bit, the GCM default-length) IV, C15 = expected ciphertext,
 * T15 = expected 16-byte authentication tag.
 */
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/*
 * GCM self-test vector #16, consumed by the TEST_CASE(n) macro below:
 * P16 = plaintext, A16 = AAD, C16 = expected ciphertext,
 * T16 = expected 16-byte authentication tag.
 * The key/IV arrays for this case are declared elsewhere in the file.
 */
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
0xab,0xad,0xda,0xd2},
C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/*
 * GCM self-test vector #17, consumed by the TEST_CASE(n) macro below:
 * IV17 = 8-byte (64-bit, non-default-length) IV, C17 = expected
 * ciphertext, T17 = expected 16-byte authentication tag.
 * The key/plaintext/AAD arrays for this case are declared elsewhere in the file.
 */
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/*
 * GCM self-test vector #18, consumed by the TEST_CASE(n) macro below:
 * IV18 = 60-byte IV (exercises the arbitrary-length, non-96-bit IV path),
 * C18 = expected ciphertext, T18 = expected 16-byte authentication tag.
 * The key/plaintext/AAD arrays for this case are declared elsewhere in the file.
 */
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1533 #define TEST_CASE(n) do { \
1534 u8 out[sizeof(P##n)]; \
1535 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1536 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1537 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1538 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1539 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1540 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1541 (C##n && memcmp(out,C##n,sizeof(out)))) \
1542 ret++, printf ("encrypt test#%d failed.\n",n);\
1543 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1544 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1545 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1546 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1547 (P##n && memcmp(out,P##n,sizeof(out)))) \
1548 ret++, printf ("decrypt test#%d failed.\n",n); \
1576 #ifdef OPENSSL_CPUID_OBJ
1578 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1579 union { u64 u; u8 c[1024]; } buf;
1582 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1583 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1584 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1586 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1587 start = OPENSSL_rdtsc();
1588 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1589 gcm_t = OPENSSL_rdtsc() - start;
1591 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1592 &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1593 (block128_f)AES_encrypt);
1594 start = OPENSSL_rdtsc();
1595 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1596 &key,ctx.Yi.c,ctx.EKi.c,&ctx.res,
1597 (block128_f)AES_encrypt);
1598 ctr_t = OPENSSL_rdtsc() - start;
1600 printf("%.2f-%.2f=%.2f\n",
1601 gcm_t/(double)sizeof(buf),
1602 ctr_t/(double)sizeof(buf),
1603 (gcm_t-ctr_t)/(double)sizeof(buf));
1605 GHASH(&ctx,buf.c,sizeof(buf));
1606 start = OPENSSL_rdtsc();
1607 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1608 gcm_t = OPENSSL_rdtsc() - start;
1609 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);