1 /* ====================================================================
2 * Copyright (c) 2010 The OpenSSL Project. All rights reserved.
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in
13 * the documentation and/or other materials provided with the
16 * 3. All advertising materials mentioning features or use of this
17 * software must display the following acknowledgment:
18 * "This product includes software developed by the OpenSSL Project
19 * for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
21 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
22 * endorse or promote products derived from this software without
23 * prior written permission. For written permission, please contact
24 * openssl-core@openssl.org.
26 * 5. Products derived from this software may not be called "OpenSSL"
27 * nor may "OpenSSL" appear in their names without prior written
28 * permission of the OpenSSL Project.
30 * 6. Redistributions of any form whatsoever must retain the following
32 * "This product includes software developed by the OpenSSL Project
33 * for use in the OpenSSL Toolkit (http://www.openssl.org/)"
35 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
36 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
37 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
38 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
39 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
40 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
41 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
42 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
43 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
44 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
45 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
46 * OF THE POSSIBILITY OF SUCH DAMAGE.
47 * ====================================================================
50 #define OPENSSL_FIPSAPI
52 #include <openssl/crypto.h>
53 #include "modes_lcl.h"
63 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
64 /* redefine, because alignment is ensured */
66 #define GETU32(p) BSWAP4(*(const u32 *)(p))
68 #define PUTU32(p,v) *(u32 *)(p) = BSWAP4(v)
71 #define PACK(s) ((size_t)(s)<<(sizeof(size_t)*8-16))
72 #define REDUCE1BIT(V) do { \
73 if (sizeof(size_t)==8) { \
74 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
75 V.lo = (V.hi<<63)|(V.lo>>1); \
76 V.hi = (V.hi>>1 )^T; \
79 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
80 V.lo = (V.hi<<63)|(V.lo>>1); \
81 V.hi = (V.hi>>1 )^((u64)T<<32); \
86 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
87 * never be set to 8. 8 is effectively reserved for testing purposes.
88 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
89 * "Shoup's" in GCM specification. In other words OpenSSL does not cover
90 * whole spectrum of possible table driven implementations. Why? In
91 * non-"Shoup's" case memory access pattern is segmented in such manner,
92 * that it's trivial to see that cache timing information can reveal
93 * fair portion of intermediate hash value. Given that ciphertext is
94 * always available to attacker, it's possible for him to attempt to
95 * deduce secret parameter H and if successful, tamper with messages
96 * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
97 * not as trivial, but there is no reason to believe that it's resistant
98 * to cache-timing attack. And the thing about "8-bit" implementation is
99 * that it consumes 16 (sixteen) times more memory, 4KB per individual
100 * key + 1KB shared. Well, on pros side it should be twice as fast as
101 * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
102 * was observed to run ~75% faster, closer to 100% for commercial
103 * compilers... Yet "4-bit" procedure is preferred, because it's
104 * believed to provide better security-performance balance and adequate
105 * all-round performance. "All-round" refers to things like:
107 * - shorter setup time effectively improves overall timing for
108 * handling short messages;
109 * - larger table allocation can become unbearable because of VM
110 * subsystem penalties (for example on Windows large enough free
111 * results in VM working set trimming, meaning that consequent
112 * malloc would immediately incur working set expansion);
113 * - larger table has larger cache footprint, which can affect
114 * performance of other code paths (not necessarily even from same
115 * thread in Hyper-Threading world);
117 * Value of 1 is not appropriate for performance reasons.
121 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
131 for (Htable[128]=V, i=64; i>0; i>>=1) {
136 for (i=2; i<256; i<<=1) {
137 u128 *Hi = Htable+i, H0 = *Hi;
138 for (j=1; j<i; ++j) {
139 Hi[j].hi = H0.hi^Htable[j].hi;
140 Hi[j].lo = H0.lo^Htable[j].lo;
145 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
148 const u8 *xi = (const u8 *)Xi+15;
150 const union { long one; char little; } is_endian = {1};
151 static const size_t rem_8bit[256] = {
152 PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
153 PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
154 PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
155 PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
156 PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
157 PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
158 PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
159 PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
160 PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
161 PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
162 PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
163 PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
164 PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
165 PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
166 PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
167 PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
168 PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
169 PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
170 PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
171 PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
172 PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
173 PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
174 PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
175 PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
176 PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
177 PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
178 PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
179 PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
180 PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
181 PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
182 PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
183 PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
184 PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
185 PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
186 PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
187 PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
188 PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
189 PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
190 PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
191 PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
192 PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
193 PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
194 PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
195 PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
196 PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
197 PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
198 PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
199 PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
200 PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
201 PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
202 PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
203 PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
204 PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
205 PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
206 PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
207 PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
208 PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
209 PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
210 PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
211 PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
212 PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
213 PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
214 PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
215 PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
218 Z.hi ^= Htable[n].hi;
219 Z.lo ^= Htable[n].lo;
221 if ((u8 *)Xi==xi) break;
225 rem = (size_t)Z.lo&0xff;
226 Z.lo = (Z.hi<<56)|(Z.lo>>8);
228 if (sizeof(size_t)==8)
229 Z.hi ^= rem_8bit[rem];
231 Z.hi ^= (u64)rem_8bit[rem]<<32;
234 if (is_endian.little) {
236 Xi[0] = BSWAP8(Z.hi);
237 Xi[1] = BSWAP8(Z.lo);
241 v = (u32)(Z.hi>>32); PUTU32(p,v);
242 v = (u32)(Z.hi); PUTU32(p+4,v);
243 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
244 v = (u32)(Z.lo); PUTU32(p+12,v);
252 #define GCM_MUL(ctx,Xi) gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
256 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
259 #if defined(OPENSSL_SMALL_FOOTPRINT)
268 #if defined(OPENSSL_SMALL_FOOTPRINT)
269 for (Htable[8]=V, i=4; i>0; i>>=1) {
274 for (i=2; i<16; i<<=1) {
277 for (V=*Hi, j=1; j<i; ++j) {
278 Hi[j].hi = V.hi^Htable[j].hi;
279 Hi[j].lo = V.lo^Htable[j].lo;
290 Htable[3].hi = V.hi^Htable[2].hi, Htable[3].lo = V.lo^Htable[2].lo;
292 Htable[5].hi = V.hi^Htable[1].hi, Htable[5].lo = V.lo^Htable[1].lo;
293 Htable[6].hi = V.hi^Htable[2].hi, Htable[6].lo = V.lo^Htable[2].lo;
294 Htable[7].hi = V.hi^Htable[3].hi, Htable[7].lo = V.lo^Htable[3].lo;
296 Htable[9].hi = V.hi^Htable[1].hi, Htable[9].lo = V.lo^Htable[1].lo;
297 Htable[10].hi = V.hi^Htable[2].hi, Htable[10].lo = V.lo^Htable[2].lo;
298 Htable[11].hi = V.hi^Htable[3].hi, Htable[11].lo = V.lo^Htable[3].lo;
299 Htable[12].hi = V.hi^Htable[4].hi, Htable[12].lo = V.lo^Htable[4].lo;
300 Htable[13].hi = V.hi^Htable[5].hi, Htable[13].lo = V.lo^Htable[5].lo;
301 Htable[14].hi = V.hi^Htable[6].hi, Htable[14].lo = V.lo^Htable[6].lo;
302 Htable[15].hi = V.hi^Htable[7].hi, Htable[15].lo = V.lo^Htable[7].lo;
304 #if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
306 * ARM assembler expects specific dword order in Htable.
310 const union { long one; char little; } is_endian = {1};
312 if (is_endian.little)
321 Htable[j].hi = V.lo<<32|V.lo>>32;
322 Htable[j].lo = V.hi<<32|V.hi>>32;
329 static const size_t rem_4bit[16] = {
330 PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
331 PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
332 PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
333 PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
335 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
339 size_t rem, nlo, nhi;
340 const union { long one; char little; } is_endian = {1};
342 nlo = ((const u8 *)Xi)[15];
346 Z.hi = Htable[nlo].hi;
347 Z.lo = Htable[nlo].lo;
350 rem = (size_t)Z.lo&0xf;
351 Z.lo = (Z.hi<<60)|(Z.lo>>4);
353 if (sizeof(size_t)==8)
354 Z.hi ^= rem_4bit[rem];
356 Z.hi ^= (u64)rem_4bit[rem]<<32;
358 Z.hi ^= Htable[nhi].hi;
359 Z.lo ^= Htable[nhi].lo;
363 nlo = ((const u8 *)Xi)[cnt];
367 rem = (size_t)Z.lo&0xf;
368 Z.lo = (Z.hi<<60)|(Z.lo>>4);
370 if (sizeof(size_t)==8)
371 Z.hi ^= rem_4bit[rem];
373 Z.hi ^= (u64)rem_4bit[rem]<<32;
375 Z.hi ^= Htable[nlo].hi;
376 Z.lo ^= Htable[nlo].lo;
379 if (is_endian.little) {
381 Xi[0] = BSWAP8(Z.hi);
382 Xi[1] = BSWAP8(Z.lo);
386 v = (u32)(Z.hi>>32); PUTU32(p,v);
387 v = (u32)(Z.hi); PUTU32(p+4,v);
388 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
389 v = (u32)(Z.lo); PUTU32(p+12,v);
398 #if !defined(OPENSSL_SMALL_FOOTPRINT)
400 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
401 * details... Compiler-generated code doesn't seem to give any
402 * performance improvement, at least not on x86[_64]. It's here
403 * mostly as reference and a placeholder for possible future
404 * non-trivial optimization[s]...
406 static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
407 const u8 *inp,size_t len)
411 size_t rem, nlo, nhi;
412 const union { long one; char little; } is_endian = {1};
417 nlo = ((const u8 *)Xi)[15];
422 Z.hi = Htable[nlo].hi;
423 Z.lo = Htable[nlo].lo;
426 rem = (size_t)Z.lo&0xf;
427 Z.lo = (Z.hi<<60)|(Z.lo>>4);
429 if (sizeof(size_t)==8)
430 Z.hi ^= rem_4bit[rem];
432 Z.hi ^= (u64)rem_4bit[rem]<<32;
434 Z.hi ^= Htable[nhi].hi;
435 Z.lo ^= Htable[nhi].lo;
439 nlo = ((const u8 *)Xi)[cnt];
444 rem = (size_t)Z.lo&0xf;
445 Z.lo = (Z.hi<<60)|(Z.lo>>4);
447 if (sizeof(size_t)==8)
448 Z.hi ^= rem_4bit[rem];
450 Z.hi ^= (u64)rem_4bit[rem]<<32;
452 Z.hi ^= Htable[nlo].hi;
453 Z.lo ^= Htable[nlo].lo;
457 * Extra 256+16 bytes per-key plus 512 bytes shared tables
458 * [should] give ~50% improvement... One could have PACK()-ed
459 * the rem_8bit even here, but the priority is to minimize
462 u128 Hshr4[16]; /* Htable shifted right by 4 bits */
463 u8 Hshl4[16]; /* Htable shifted left by 4 bits */
464 static const unsigned short rem_8bit[256] = {
465 0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
466 0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
467 0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
468 0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
469 0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
470 0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
471 0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
472 0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
473 0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
474 0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
475 0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
476 0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
477 0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
478 0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
479 0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
480 0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
481 0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
482 0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
483 0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
484 0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
485 0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
486 0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
487 0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
488 0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
489 0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
490 0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
491 0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
492 0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
493 0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
494 0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
495 0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
496 0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
498 * This pre-processing phase slows down procedure by approximately
499 * same time as it makes each loop spin faster. In other words
500 * single block performance is approximately same as straightforward
501 * "4-bit" implementation, and then it goes only faster...
503 for (cnt=0; cnt<16; ++cnt) {
504 Z.hi = Htable[cnt].hi;
505 Z.lo = Htable[cnt].lo;
506 Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
507 Hshr4[cnt].hi = (Z.hi>>4);
508 Hshl4[cnt] = (u8)(Z.lo<<4);
512 for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
513 nlo = ((const u8 *)Xi)[cnt];
518 Z.hi ^= Htable[nlo].hi;
519 Z.lo ^= Htable[nlo].lo;
521 rem = (size_t)Z.lo&0xff;
523 Z.lo = (Z.hi<<56)|(Z.lo>>8);
526 Z.hi ^= Hshr4[nhi].hi;
527 Z.lo ^= Hshr4[nhi].lo;
528 Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
531 nlo = ((const u8 *)Xi)[0];
536 Z.hi ^= Htable[nlo].hi;
537 Z.lo ^= Htable[nlo].lo;
539 rem = (size_t)Z.lo&0xf;
541 Z.lo = (Z.hi<<60)|(Z.lo>>4);
544 Z.hi ^= Htable[nhi].hi;
545 Z.lo ^= Htable[nhi].lo;
546 Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
549 if (is_endian.little) {
551 Xi[0] = BSWAP8(Z.hi);
552 Xi[1] = BSWAP8(Z.lo);
556 v = (u32)(Z.hi>>32); PUTU32(p,v);
557 v = (u32)(Z.hi); PUTU32(p+4,v);
558 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
559 v = (u32)(Z.lo); PUTU32(p+12,v);
566 } while (inp+=16, len-=16);
570 void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
571 void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
574 #define GCM_MUL(ctx,Xi) gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
575 #if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
576 #define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
577 /* GHASH_CHUNK is "stride parameter" missioned to mitigate cache
578 * trashing effect. In other words idea is to hash data while it's
579 * still in L1 cache after encryption pass... */
580 #define GHASH_CHUNK (3*1024)
583 #else /* TABLE_BITS */
585 static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
590 const long *xi = (const long *)Xi;
591 const union { long one; char little; } is_endian = {1};
593 V.hi = H[0]; /* H is in host byte order, no byte swapping */
596 for (j=0; j<16/sizeof(long); ++j) {
597 if (is_endian.little) {
598 if (sizeof(long)==8) {
600 X = (long)(BSWAP8(xi[j]));
602 const u8 *p = (const u8 *)(xi+j);
603 X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
607 const u8 *p = (const u8 *)(xi+j);
614 for (i=0; i<8*sizeof(long); ++i, X<<=1) {
615 u64 M = (u64)(X>>(8*sizeof(long)-1));
623 if (is_endian.little) {
625 Xi[0] = BSWAP8(Z.hi);
626 Xi[1] = BSWAP8(Z.lo);
630 v = (u32)(Z.hi>>32); PUTU32(p,v);
631 v = (u32)(Z.hi); PUTU32(p+4,v);
632 v = (u32)(Z.lo>>32); PUTU32(p+8,v);
633 v = (u32)(Z.lo); PUTU32(p+12,v);
641 #define GCM_MUL(ctx,Xi) gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
645 #if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
646 (defined(__i386) || defined(__i386__) || \
647 defined(__x86_64) || defined(__x86_64__) || \
648 defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
649 # define GHASH_ASM_X86_OR_64
650 extern unsigned int OPENSSL_ia32cap_P[2];
652 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
653 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
654 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
656 # if defined(__i386) || defined(__i386__) || defined(_M_IX86)
657 # define GHASH_ASM_X86
658 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
659 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
661 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
662 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
665 # define GCM_FUNCREF_4BIT
668 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
670 const union { long one; char little; } is_endian = {1};
672 memset(ctx,0,sizeof(*ctx));
676 (*block)(ctx->H.c,ctx->H.c,key);
678 if (is_endian.little) {
679 /* H is stored in host byte order */
681 ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
682 ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
686 hi = (u64)GETU32(p) <<32|GETU32(p+4);
687 lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
694 gcm_init_8bit(ctx->Htable,ctx->H.u);
696 # if defined(GHASH_ASM_X86_OR_64)
697 # if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
698 if (OPENSSL_ia32cap_P[1]&(1<<1)) {
699 gcm_init_clmul(ctx->Htable,ctx->H.u);
700 ctx->gmult = gcm_gmult_clmul;
701 ctx->ghash = gcm_ghash_clmul;
705 gcm_init_4bit(ctx->Htable,ctx->H.u);
706 # if defined(GHASH_ASM_X86) /* x86 only */
707 if (OPENSSL_ia32cap_P[0]&(1<<23)) {
708 ctx->gmult = gcm_gmult_4bit_mmx;
709 ctx->ghash = gcm_ghash_4bit_mmx;
711 ctx->gmult = gcm_gmult_4bit_x86;
712 ctx->ghash = gcm_ghash_4bit_x86;
715 ctx->gmult = gcm_gmult_4bit;
716 ctx->ghash = gcm_ghash_4bit;
719 gcm_init_4bit(ctx->Htable,ctx->H.u);
724 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
726 const union { long one; char little; } is_endian = {1};
728 #ifdef GCM_FUNCREF_4BIT
729 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
736 ctx->len.u[0] = 0; /* AAD length */
737 ctx->len.u[1] = 0; /* message length */
742 memcpy(ctx->Yi.c,iv,12);
751 for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
757 for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
761 if (is_endian.little) {
763 ctx->Yi.u[1] ^= BSWAP8(len0);
765 ctx->Yi.c[8] ^= (u8)(len0>>56);
766 ctx->Yi.c[9] ^= (u8)(len0>>48);
767 ctx->Yi.c[10] ^= (u8)(len0>>40);
768 ctx->Yi.c[11] ^= (u8)(len0>>32);
769 ctx->Yi.c[12] ^= (u8)(len0>>24);
770 ctx->Yi.c[13] ^= (u8)(len0>>16);
771 ctx->Yi.c[14] ^= (u8)(len0>>8);
772 ctx->Yi.c[15] ^= (u8)(len0);
776 ctx->Yi.u[1] ^= len0;
780 if (is_endian.little)
781 ctr = GETU32(ctx->Yi.c+12);
786 (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
788 if (is_endian.little)
789 PUTU32(ctx->Yi.c+12,ctr);
794 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
798 u64 alen = ctx->len.u[0];
799 #ifdef GCM_FUNCREF_4BIT
800 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
802 void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
803 const u8 *inp,size_t len) = ctx->ghash;
807 if (ctx->len.u[1]) return -2;
810 if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
812 ctx->len.u[0] = alen;
817 ctx->Xi.c[n] ^= *(aad++);
821 if (n==0) GCM_MUL(ctx,Xi);
829 if ((i = (len&(size_t)-16))) {
836 for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
843 n = (unsigned int)len;
844 for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
851 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
852 const unsigned char *in, unsigned char *out,
855 const union { long one; char little; } is_endian = {1};
858 u64 mlen = ctx->len.u[1];
859 #ifdef GCM_FUNCREF_4BIT
860 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
862 void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
863 const u8 *inp,size_t len) = ctx->ghash;
868 n = (unsigned int)mlen%16; /* alternative to ctx->mres */
871 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
873 ctx->len.u[1] = mlen;
876 /* First call to encrypt finalizes GHASH(AAD) */
881 if (is_endian.little)
882 ctr = GETU32(ctx->Yi.c+12);
887 #if !defined(OPENSSL_SMALL_FOOTPRINT)
888 if (16%sizeof(size_t) == 0) do { /* always true actually */
891 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
895 if (n==0) GCM_MUL(ctx,Xi);
901 #if defined(STRICT_ALIGNMENT)
902 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
905 #if defined(GHASH) && defined(GHASH_CHUNK)
906 while (len>=GHASH_CHUNK) {
907 size_t j=GHASH_CHUNK;
910 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
912 if (is_endian.little)
913 PUTU32(ctx->Yi.c+12,ctr);
916 for (i=0; i<16; i+=sizeof(size_t))
918 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
923 GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
926 if ((i = (len&(size_t)-16))) {
930 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
932 if (is_endian.little)
933 PUTU32(ctx->Yi.c+12,ctr);
936 for (i=0; i<16; i+=sizeof(size_t))
938 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
947 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
949 if (is_endian.little)
950 PUTU32(ctx->Yi.c+12,ctr);
953 for (i=0; i<16; i+=sizeof(size_t))
954 *(size_t *)(ctx->Xi.c+i) ^=
956 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
964 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
966 if (is_endian.little)
967 PUTU32(ctx->Yi.c+12,ctr);
971 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
980 for (i=0;i<len;++i) {
982 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
984 if (is_endian.little)
985 PUTU32(ctx->Yi.c+12,ctr);
989 ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
999 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1000 const unsigned char *in, unsigned char *out,
1003 const union { long one; char little; } is_endian = {1};
1004 unsigned int n, ctr;
1006 u64 mlen = ctx->len.u[1];
1007 #ifdef GCM_FUNCREF_4BIT
1008 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1010 void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1011 const u8 *inp,size_t len) = ctx->ghash;
1016 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1018 ctx->len.u[1] = mlen;
1021 /* First call to decrypt finalizes GHASH(AAD) */
1026 if (is_endian.little)
1027 ctr = GETU32(ctx->Yi.c+12);
1032 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1033 if (16%sizeof(size_t) == 0) do { /* always true actually */
1037 *(out++) = c^ctx->EKi.c[n];
1042 if (n==0) GCM_MUL (ctx,Xi);
1048 #if defined(STRICT_ALIGNMENT)
1049 if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
1052 #if defined(GHASH) && defined(GHASH_CHUNK)
1053 while (len>=GHASH_CHUNK) {
1054 size_t j=GHASH_CHUNK;
1056 GHASH(ctx,in,GHASH_CHUNK);
1058 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1060 if (is_endian.little)
1061 PUTU32(ctx->Yi.c+12,ctr);
1064 for (i=0; i<16; i+=sizeof(size_t))
1065 *(size_t *)(out+i) =
1066 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1073 if ((i = (len&(size_t)-16))) {
1076 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1078 if (is_endian.little)
1079 PUTU32(ctx->Yi.c+12,ctr);
1082 for (i=0; i<16; i+=sizeof(size_t))
1083 *(size_t *)(out+i) =
1084 *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
1092 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1094 if (is_endian.little)
1095 PUTU32(ctx->Yi.c+12,ctr);
1098 for (i=0; i<16; i+=sizeof(size_t)) {
1099 size_t c = *(size_t *)(in+i);
1100 *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
1101 *(size_t *)(ctx->Xi.c+i) ^= c;
1110 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1112 if (is_endian.little)
1113 PUTU32(ctx->Yi.c+12,ctr);
1119 out[n] = c^ctx->EKi.c[n];
1128 for (i=0;i<len;++i) {
1131 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1133 if (is_endian.little)
1134 PUTU32(ctx->Yi.c+12,ctr);
1139 out[i] = c^ctx->EKi.c[n];
1150 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1151 const unsigned char *in, unsigned char *out,
1152 size_t len, ctr128_f stream)
1154 const union { long one; char little; } is_endian = {1};
1155 unsigned int n, ctr;
1157 u64 mlen = ctx->len.u[1];
1158 #ifdef GCM_FUNCREF_4BIT
1159 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1161 void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1162 const u8 *inp,size_t len) = ctx->ghash;
1167 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1169 ctx->len.u[1] = mlen;
1172 /* First call to encrypt finalizes GHASH(AAD) */
1177 if (is_endian.little)
1178 ctr = GETU32(ctx->Yi.c+12);
1185 ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
1189 if (n==0) GCM_MUL(ctx,Xi);
1195 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1196 while (len>=GHASH_CHUNK) {
1197 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1198 ctr += GHASH_CHUNK/16;
1199 if (is_endian.little)
1200 PUTU32(ctx->Yi.c+12,ctr);
1203 GHASH(ctx,out,GHASH_CHUNK);
1209 if ((i = (len&(size_t)-16))) {
1212 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1213 ctr += (unsigned int)j;
1214 if (is_endian.little)
1215 PUTU32(ctx->Yi.c+12,ctr);
1225 for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
1232 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1234 if (is_endian.little)
1235 PUTU32(ctx->Yi.c+12,ctr);
1239 ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
1248 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1249 const unsigned char *in, unsigned char *out,
1250 size_t len,ctr128_f stream)
1252 const union { long one; char little; } is_endian = {1};
1253 unsigned int n, ctr;
1255 u64 mlen = ctx->len.u[1];
1256 #ifdef GCM_FUNCREF_4BIT
1257 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1259 void (*gcm_ghash_4bit)(u64 Xi[2],const u128 Htable[16],
1260 const u8 *inp,size_t len) = ctx->ghash;
1265 if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
1267 ctx->len.u[1] = mlen;
1270 /* First call to decrypt finalizes GHASH(AAD) */
1275 if (is_endian.little)
1276 ctr = GETU32(ctx->Yi.c+12);
1284 *(out++) = c^ctx->EKi.c[n];
1289 if (n==0) GCM_MUL (ctx,Xi);
1295 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1296 while (len>=GHASH_CHUNK) {
1297 GHASH(ctx,in,GHASH_CHUNK);
1298 (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
1299 ctr += GHASH_CHUNK/16;
1300 if (is_endian.little)
1301 PUTU32(ctx->Yi.c+12,ctr);
1309 if ((i = (len&(size_t)-16))) {
1317 for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
1324 (*stream)(in,out,j,ctx->key,ctx->Yi.c);
1325 ctr += (unsigned int)j;
1326 if (is_endian.little)
1327 PUTU32(ctx->Yi.c+12,ctr);
1335 (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
1337 if (is_endian.little)
1338 PUTU32(ctx->Yi.c+12,ctr);
1344 out[n] = c^ctx->EKi.c[n];
1353 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
1356 const union { long one; char little; } is_endian = {1};
1357 u64 alen = ctx->len.u[0]<<3;
1358 u64 clen = ctx->len.u[1]<<3;
1359 #ifdef GCM_FUNCREF_4BIT
1360 void (*gcm_gmult_4bit)(u64 Xi[2],const u128 Htable[16]) = ctx->gmult;
1366 if (is_endian.little) {
1368 alen = BSWAP8(alen);
1369 clen = BSWAP8(clen);
1373 ctx->len.u[0] = alen;
1374 ctx->len.u[1] = clen;
1376 alen = (u64)GETU32(p) <<32|GETU32(p+4);
1377 clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
1381 ctx->Xi.u[0] ^= alen;
1382 ctx->Xi.u[1] ^= clen;
1385 ctx->Xi.u[0] ^= ctx->EK0.u[0];
1386 ctx->Xi.u[1] ^= ctx->EK0.u[1];
1388 if (tag && len<=sizeof(ctx->Xi))
1389 return memcmp(ctx->Xi.c,tag,len);
1394 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1396 CRYPTO_gcm128_finish(ctx, NULL, 0);
1397 memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
1400 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1402 GCM128_CONTEXT *ret;
1404 if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
1405 CRYPTO_gcm128_init(ret,key,block);
1410 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1413 OPENSSL_cleanse(ctx,sizeof(*ctx));
1418 #if defined(SELFTEST)
1420 #include <openssl/aes.h>
1423 static const u8 K1[16],
1428 T1[]= {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
1434 static const u8 P2[16],
1435 C2[]= {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
1436 T2[]= {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
1440 static const u8 K3[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1441 P3[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1442 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1443 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1444 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1445 IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1446 C3[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1447 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1448 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1449 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
1450 T3[]= {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
1455 static const u8 P4[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1456 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1457 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1458 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1459 A4[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1460 0xab,0xad,0xda,0xd2},
1461 C4[]= {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
1462 0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
1463 0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
1464 0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
1465 T4[]= {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
1471 static const u8 IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1472 C5[]= {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
1473 0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
1474 0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
1475 0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
1476 T5[]= {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
1482 static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1483 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1484 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1485 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1486 C6[]= {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
1487 0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
1488 0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
1489 0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
1490 T6[]= {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
1493 static const u8 K7[24],
1498 T7[]= {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
1504 static const u8 P8[16],
1505 C8[]= {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
1506 T8[]= {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
1510 static const u8 K9[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1511 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
1512 P9[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1513 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1514 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1515 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1516 IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1517 C9[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1518 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1519 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1520 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
1521 T9[]= {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
1526 static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1527 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1528 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1529 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1530 A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1531 0xab,0xad,0xda,0xd2},
1532 C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
1533 0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
1534 0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
1535 0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
1536 T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
1542 static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1543 C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
1544 0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
1545 0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
1546 0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
1547 T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
1553 static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1554 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1555 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1556 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1557 C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
1558 0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
1559 0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
1560 0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
1561 T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
1564 static const u8 K13[32],
1569 T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
1574 static const u8 P14[16],
1576 C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
1577 T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
1581 static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
1582 0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
1583 P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1584 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1585 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1586 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
1587 IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
1588 C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1589 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1590 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1591 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
1592 T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
1597 static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
1598 0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
1599 0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
1600 0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
1601 A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
1602 0xab,0xad,0xda,0xd2},
1603 C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
1604 0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
1605 0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
1606 0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
1607 T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
1613 static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
1614 C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
1615 0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
1616 0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
1617 0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
1618 T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
1624 static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
1625 0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
1626 0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
1627 0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
1628 C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
1629 0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
1630 0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
1631 0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
1632 T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
1634 #define TEST_CASE(n) do { \
1635 u8 out[sizeof(P##n)]; \
1636 AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key); \
1637 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt); \
1638 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1639 memset(out,0,sizeof(out)); \
1640 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1641 if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out)); \
1642 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1643 (C##n && memcmp(out,C##n,sizeof(out)))) \
1644 ret++, printf ("encrypt test#%d failed.\n",n); \
1645 CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n)); \
1646 memset(out,0,sizeof(out)); \
1647 if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n)); \
1648 if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out)); \
1649 if (CRYPTO_gcm128_finish(&ctx,T##n,16) || \
1650 (P##n && memcmp(out,P##n,sizeof(out)))) \
1651 ret++, printf ("decrypt test#%d failed.\n",n); \
1679 #ifdef OPENSSL_CPUID_OBJ
1681 size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
1682 union { u64 u; u8 c[1024]; } buf;
1685 AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
1686 CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
1687 CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));
1689 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1690 start = OPENSSL_rdtsc();
1691 CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
1692 gcm_t = OPENSSL_rdtsc() - start;
1694 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1695 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1696 (block128_f)AES_encrypt);
1697 start = OPENSSL_rdtsc();
1698 CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
1699 &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
1700 (block128_f)AES_encrypt);
1701 ctr_t = OPENSSL_rdtsc() - start;
1703 printf("%.2f-%.2f=%.2f\n",
1704 gcm_t/(double)sizeof(buf),
1705 ctr_t/(double)sizeof(buf),
1706 (gcm_t-ctr_t)/(double)sizeof(buf));
1708 GHASH(&ctx,buf.c,sizeof(buf));
1709 start = OPENSSL_rdtsc();
1710 for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
1711 gcm_t = OPENSSL_rdtsc() - start;
1712 printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);