/* ====================================================================
 * Copyright (c) 2010 The OpenSSL Project.  All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 *
 * 3. All advertising materials mentioning features or use of this
 *    software must display the following acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit. (http://www.openssl.org/)"
 *
 * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to
 *    endorse or promote products derived from this software without
 *    prior written permission. For written permission, please contact
 *    openssl-core@openssl.org.
 *
 * 5. Products derived from this software may not be called "OpenSSL"
 *    nor may "OpenSSL" appear in their names without prior written
 *    permission of the OpenSSL Project.
 *
 * 6. Redistributions of any form whatsoever must retain the following
 *    acknowledgment:
 *    "This product includes software developed by the OpenSSL Project
 *    for use in the OpenSSL Toolkit (http://www.openssl.org/)"
 *
 * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY
 * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR
 * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 * ====================================================================
 */
#define OPENSSL_FIPSAPI

#include <openssl/crypto.h>
#include "modes_lcl.h"
#include <string.h>

typedef struct { u64 hi,lo; } u128;
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
#undef  GETU32
#define GETU32(p)       BSWAP4(*(const u32 *)(p))
#undef  PUTU32
#define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif

#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
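/*
 * Added note: PACK() parks a 16-bit reduction constant in the top bits
 * of a size_t so that the same remainder tables serve both 32- and
 * 64-bit builds. Worked example: with sizeof(size_t)==8 the shift is
 * 8*8-16 = 48, so PACK(0xE100) == 0xE100000000000000; with
 * sizeof(size_t)==4 the shift is 16 and PACK(0xE100) == 0xE1000000.
 */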
#define REDUCE1BIT(V)   do { \
    if (sizeof(size_t)==8) { \
        u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
        V.lo  = (V.hi<<63)|(V.lo>>1); \
        V.hi  = (V.hi>>1 )^T; \
    } \
    else { \
        u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
        V.lo  = (V.hi<<63)|(V.lo>>1); \
        V.hi  = (V.hi>>1 )^((u64)T<<32); \
    } \
} while(0)
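/*
 * Added illustration (not part of the original source): REDUCE1BIT
 * divides a bit-reflected GF(2^128) element by x modulo the GCM
 * polynomial x^128+x^7+x^2+x+1. In this representation division by x
 * is a right shift of the 128-bit value, and whenever the shifted-out
 * bit was set, the reflected polynomial (0xE1 in the top byte) is
 * folded back in. A minimal sketch of the 64-bit branch on explicit
 * halves:
 */
#if 0   /* example only */
static void reduce1bit_sketch(u64 *hi, u64 *lo)
{
    u64 mask = 0 - (*lo & 1);       /* all-ones iff a bit falls off */
    *lo = (*hi << 63) | (*lo >> 1); /* 128-bit right shift by one   */
    *hi = (*hi >> 1) ^ (U64(0xe100000000000000) & mask);
}
#endif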
/*
 * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
 * never be set to 8. 8 is effectively reserved for testing purposes.
 * TABLE_BITS>1 are lookup-table-driven implementations referred to as
 * "Shoup's" in the GCM specification. In other words OpenSSL does not
 * cover the whole spectrum of possible table-driven implementations.
 * Why? In the non-"Shoup's" case the memory access pattern is segmented
 * in such a manner that it's trivial to see that cache-timing
 * information can reveal a fair portion of the intermediate hash value.
 * Given that the ciphertext is always available to an attacker, it's
 * possible to attempt to deduce the secret parameter H and, if
 * successful, tamper with messages [which is nothing but trivial in
 * CTR mode]. In the "Shoup's" case it's not as trivial, but there is
 * no reason to believe that it's resistant to cache-timing attacks
 * either. The thing about the "8-bit" implementation is that it
 * consumes 16 (sixteen) times more memory, 4KB per individual key +
 * 1KB shared. On the pro side, it should be twice as fast as the
 * "4-bit" version, and for gcc-generated x86[_64] code the "8-bit"
 * version was observed to run ~75% faster, closer to 100% for
 * commercial compilers... Yet the "4-bit" procedure is preferred,
 * because it's believed to provide a better security-performance
 * balance and adequate all-round performance. "All-round" refers to
 * things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working-set trimming, meaning that a consequent
 *   malloc would immediately incur working-set expansion);
 * - a larger table has a larger cache footprint, which can affect
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * Value of 1 is not appropriate for performance reasons.
 */
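/*
 * Added worked arithmetic for the memory claims above: a "4-bit" table
 * holds 2^4 = 16 u128 entries, i.e. 16*16 = 256 bytes per key, while
 * an "8-bit" table holds 2^8 = 256 entries, i.e. 256*16 = 4096 bytes
 * per key -- the "16 times more memory" trade-off -- on top of the
 * shared rem_8bit remainder table.
 */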
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
{
    int  i, j;
    u128 V;

    Htable[0].hi = 0;   Htable[0].lo = 0;
    V.hi = H[0];        V.lo = H[1];

    for (Htable[128]=V, i=64; i>0; i>>=1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i=2; i<256; i<<=1) {
        u128 *Hi = Htable+i, H0 = *Hi;
        for (j=1; j<i; ++j) {
            Hi[j].hi = H0.hi^Htable[j].hi;
            Hi[j].lo = H0.lo^Htable[j].lo;
        }
    }
}
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
    u128 Z = { 0, 0 };
    const u8 *xi = (const u8 *)Xi+15;
    size_t rem, n = *xi;
    const union { long one; char little; } is_endian = {1};
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };
    while (1) {
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        if ((u8 *)Xi==xi)   break;

        n = *(--xi);

        rem  = (size_t)Z.lo&0xff;
        Z.lo = (Z.hi<<56)|(Z.lo>>8);
        Z.hi = (Z.hi>>8);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem]<<32;
    }

    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32);    PUTU32(p,v);
        v = (u32)(Z.hi);        PUTU32(p+4,v);
        v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
        v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)

#elif TABLE_BITS==4
static void gcm_init_4bit(u128 Htable[16], u64 H[2])
{
    u128 V;
#if defined(OPENSSL_SMALL_FOOTPRINT)
    int  i;
#endif

    Htable[0].hi = 0;   Htable[0].lo = 0;
    V.hi = H[0];        V.lo = H[1];

#if defined(OPENSSL_SMALL_FOOTPRINT)
    for (Htable[8]=V, i=4; i>0; i>>=1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i=2; i<16; i<<=1) {
        u128 *Hi = Htable+i;
        int   j;
        for (V=*Hi, j=1; j<i; ++j) {
            Hi[j].hi = V.hi^Htable[j].hi;
            Hi[j].lo = V.lo^Htable[j].lo;
        }
    }
#else
    Htable[8] = V;
    REDUCE1BIT(V);
    Htable[4] = V;
    REDUCE1BIT(V);
    Htable[2] = V;
    REDUCE1BIT(V);
    Htable[1] = V;
    Htable[3].hi  = V.hi^Htable[2].hi,  Htable[3].lo  = V.lo^Htable[2].lo;
    V=Htable[4];
    Htable[5].hi  = V.hi^Htable[1].hi,  Htable[5].lo  = V.lo^Htable[1].lo;
    Htable[6].hi  = V.hi^Htable[2].hi,  Htable[6].lo  = V.lo^Htable[2].lo;
    Htable[7].hi  = V.hi^Htable[3].hi,  Htable[7].lo  = V.lo^Htable[3].lo;
    V=Htable[8];
    Htable[9].hi  = V.hi^Htable[1].hi,  Htable[9].lo  = V.lo^Htable[1].lo;
    Htable[10].hi = V.hi^Htable[2].hi,  Htable[10].lo = V.lo^Htable[2].lo;
    Htable[11].hi = V.hi^Htable[3].hi,  Htable[11].lo = V.lo^Htable[3].lo;
    Htable[12].hi = V.hi^Htable[4].hi,  Htable[12].lo = V.lo^Htable[4].lo;
    Htable[13].hi = V.hi^Htable[5].hi,  Htable[13].lo = V.lo^Htable[5].lo;
    Htable[14].hi = V.hi^Htable[6].hi,  Htable[14].lo = V.lo^Htable[6].lo;
    Htable[15].hi = V.hi^Htable[7].hi,  Htable[15].lo = V.lo^Htable[7].lo;
#endif
#if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /*
     * ARM assembler expects specific dword order in Htable.
     */
    {
    int j;
    const union { long one; char little; } is_endian = {1};

    if (is_endian.little)
        for (j=0;j<16;++j) {
            V = Htable[j];
            Htable[j].hi = V.lo;
            Htable[j].lo = V.hi;
        }
    else
        for (j=0;j<16;++j) {
            V = Htable[j];
            Htable[j].hi = V.lo<<32|V.lo>>32;
            Htable[j].lo = V.hi<<32|V.hi>>32;
        }
    }
#endif
}
#if !defined(GHASH_ASM) || defined(OPENSSL_SMALL_FOOTPRINT)
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0) };
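/*
 * Added illustration (hypothetical helper, not part of the original
 * source): each rem_4bit[i] is the 16-bit remainder contributed by a
 * nibble i that falls off the low end during a 4-bit shift -- the XOR
 * of (0x1C20 << b) over the set bits b of i, 0x1C20 being 0xE100
 * pre-shifted right by 3. A generator sketch:
 */
#if 0   /* example only */
static void build_rem_4bit(size_t tbl[16])
{
    int i, b;
    for (i = 0; i < 16; ++i) {
        unsigned int r = 0;
        for (b = 0; b < 4; ++b)
            if (i & (1 << b))
                r ^= 0x1C20u << b;  /* carry-less multiple of 0x1C20 */
        tbl[i] = PACK(r);
    }
}
#endif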
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
{
    u128 Z;
    int cnt = 15;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

    nlo  = ((const u8 *)Xi)[15];
    nhi  = nlo>>4;
    nlo &= 0xf;

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

    while (1) {
        rem  = (size_t)Z.lo&0xf;
        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem]<<32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        if (--cnt<0)    break;

        nlo  = ((const u8 *)Xi)[cnt];
        nhi  = nlo>>4;
        nlo &= 0xf;

        rem  = (size_t)Z.lo&0xf;
        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);
        if (sizeof(size_t)==8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem]<<32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    }

    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32);    PUTU32(p,v);
        v = (u32)(Z.hi);        PUTU32(p+4,v);
        v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
        v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}
#if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],
                const u8 *inp,size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union { long one; char little; } is_endian = {1};

#if 1
    do {
        cnt  = 15;
        nlo  = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            rem  = (size_t)Z.lo&0xf;
            Z.lo = (Z.hi<<60)|(Z.lo>>4);
            Z.hi = (Z.hi>>4);
            if (sizeof(size_t)==8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem]<<32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            if (--cnt<0)    break;

            nlo  = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi  = nlo>>4;
            nlo &= 0xf;

            rem  = (size_t)Z.lo&0xf;
            Z.lo = (Z.hi<<60)|(Z.lo>>4);
            Z.hi = (Z.hi>>4);
            if (sizeof(size_t)==8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem]<<32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];     /* Htable shifted right by 4 bits */
    u8   Hshl4[16];     /* Htable shifted left  by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE };
    /*
     * This pre-processing phase slows down the procedure by
     * approximately the same amount of time as it makes each loop
     * spin faster. In other words, single-block performance is
     * approximately the same as for the straightforward "4-bit"
     * implementation, and from there it only gets faster...
     */
    for (cnt=0; cnt<16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi<<60)|(Z.lo>>4);
        Hshr4[cnt].hi = (Z.hi>>4);
        Hshl4[cnt]    = (u8)(Z.lo<<4);
    }
    do {
        for (Z.lo=0, Z.hi=0, cnt=15; cnt; --cnt) {
            nlo  = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi  = nlo>>4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo&0xff;

            Z.lo = (Z.hi<<56)|(Z.lo>>8);
            Z.hi = (Z.hi>>8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem^Hshl4[nhi]]<<48;
        }

        nlo  = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi  = nlo>>4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo&0xf;

        Z.lo = (Z.hi<<60)|(Z.lo>>4);
        Z.hi = (Z.hi>>4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem<<4])<<48;
#endif
        if (is_endian.little) {
#ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi>>32);    PUTU32(p,v);
            v = (u32)(Z.hi);        PUTU32(p+4,v);
            v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
            v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
        }
        else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
    } while (inp+=16, len-=16);
}
#endif
#else
void gcm_gmult_4bit(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
#endif

#define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
#if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" meant to mitigate the
 * cache-trashing effect. In other words, the idea is to hash data
 * while it's still in L1 cache after the encryption pass...
 */
#define GHASH_CHUNK       (3*1024)
#endif
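/*
 * Added sketch of the chunking idea (illustrative only; the real
 * loops live in CRYPTO_gcm128_encrypt below): CTR-encrypt one
 * GHASH_CHUNK worth of data, then hash it while the output is still
 * resident in L1.
 */
#if 0   /* example only */
    while (len >= GHASH_CHUNK) {
        /* ... CTR-encrypt GHASH_CHUNK bytes from in to out ... */
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK; in += GHASH_CHUNK; len -= GHASH_CHUNK;
    }
#endif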
#else   /* TABLE_BITS */

static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
{
    u128 V,Z = { 0,0 };
    long X;
    int  i,j;
    const long *xi = (const long *)Xi;
    const union { long one; char little; } is_endian = {1};

    V.hi = H[0];    /* H is in host byte order, no byte swapping */
    V.lo = H[1];
    for (j=0; j<16/sizeof(long); ++j) {
        if (is_endian.little) {
            if (sizeof(long)==8) {
#ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
#else
                const u8 *p = (const u8 *)(xi+j);
                X = (long)((u64)GETU32(p)<<32|GETU32(p+4));
#endif
            }
            else {
                const u8 *p = (const u8 *)(xi+j);
                X = (long)GETU32(p);
            }
        }
        else
            X = xi[j];

        for (i=0; i<8*sizeof(long); ++i, X<<=1) {
            u64 M = (u64)(X>>(8*sizeof(long)-1));
            Z.hi ^= V.hi&M;
            Z.lo ^= V.lo&M;

            REDUCE1BIT(V);
        }
    }
    if (is_endian.little) {
#ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
#else
        u8 *p = (u8 *)Xi;
        u32 v;
        v = (u32)(Z.hi>>32);    PUTU32(p,v);
        v = (u32)(Z.hi);        PUTU32(p+4,v);
        v = (u32)(Z.lo>>32);    PUTU32(p+8,v);
        v = (u32)(Z.lo);        PUTU32(p+12,v);
#endif
    }
    else {
        Xi[0] = Z.hi;
        Xi[1] = Z.lo;
    }
}

#define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif
struct gcm128_context {
    /* Following 6 names follow names in GCM specification */
    union { u64 u[2]; u32 d[4]; u8 c[16]; } Yi,EKi,EK0,
                                            Xi,H,len;
    /* Pre-computed table used by gcm_gmult_* */
    u128 Htable[16];
    void (*gmult)(u64 Xi[2],const u128 Htable[16]);
    void (*ghash)(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
    unsigned int mres, ares;
    block128_f block;
    void *key;
};
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
    (defined(__i386)    || defined(__i386__)    || \
     defined(__x86_64)  || defined(__x86_64__)  || \
     defined(_M_IX86)   || defined(_M_AMD64)    || defined(_M_X64))
# define GHASH_ASM_IAX
extern unsigned int OPENSSL_ia32cap_P[2];

void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#  define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif

# undef  GCM_MUL
# define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
# undef  GHASH
# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
{
    const union { long one; char little; } is_endian = {1};

    memset(ctx,0,sizeof(*ctx));
    ctx->block = block;
    ctx->key   = key;

    (*block)(ctx->H.c,ctx->H.c,key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi,lo;
        hi = (u64)GETU32(p)  <<32|GETU32(p+4);
        lo = (u64)GETU32(p+8)<<32|GETU32(p+12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }

#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable,ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH_ASM_IAX)     /* both x86 and x86_64 */
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1]&(1<<1)) {  /* check PCLMULQDQ bit */
        gcm_init_clmul(ctx->Htable,ctx->H.u);
        ctx->gmult = gcm_gmult_clmul;
        ctx->ghash = gcm_ghash_clmul;
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable,ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
    if (OPENSSL_ia32cap_P[0]&(1<<23)) { /* check MMX bit */
        ctx->gmult = gcm_gmult_4bit_mmx;
        ctx->ghash = gcm_ghash_4bit_mmx;
    }
    else {
        ctx->gmult = gcm_gmult_4bit_x86;
        ctx->ghash = gcm_ghash_4bit_x86;
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    ctx->ghash = gcm_ghash_4bit;
#  endif
# else
    gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif
#endif
}
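/*
 * Added note: per the GCM specification the hash key is the block
 * cipher applied to the all-zero block, H = E(K, 0^128). The memset()
 * above zeroes ctx->H.c, so the single (*block)(ctx->H.c,ctx->H.c,key)
 * call computes exactly that before H is byte-swapped into host order.
 */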
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int ctr;

    ctx->Yi.u[0]  = 0;
    ctx->Yi.u[1]  = 0;
    ctx->Xi.u[0]  = 0;
    ctx->Xi.u[1]  = 0;
    ctx->len.u[0] = 0;  /* AAD length */
    ctx->len.u[1] = 0;  /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len==12) {
        memcpy(ctx->Yi.c,iv,12);
        ctx->Yi.c[15]=1;
        ctr=1;
    }
    else {
        size_t i;
        u64 len0 = len;

        while (len>=16) {
            for (i=0; i<16; ++i) ctx->Yi.c[i] ^= iv[i];
            GCM_MUL(ctx,Yi);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i=0; i<len; ++i) ctx->Yi.c[i] ^= iv[i];
            GCM_MUL(ctx,Yi);
        }
        len0 <<= 3;
        if (is_endian.little) {
#ifdef BSWAP8
            ctx->Yi.u[1]  ^= BSWAP8(len0);
#else
            ctx->Yi.c[8]  ^= (u8)(len0>>56);
            ctx->Yi.c[9]  ^= (u8)(len0>>48);
            ctx->Yi.c[10] ^= (u8)(len0>>40);
            ctx->Yi.c[11] ^= (u8)(len0>>32);
            ctx->Yi.c[12] ^= (u8)(len0>>24);
            ctx->Yi.c[13] ^= (u8)(len0>>16);
            ctx->Yi.c[14] ^= (u8)(len0>>8);
            ctx->Yi.c[15] ^= (u8)(len0);
#endif
        }
        else
            ctx->Yi.u[1]  ^= len0;

        GCM_MUL(ctx,Yi);

        if (is_endian.little)
            ctr = GETU32(ctx->Yi.c+12);
        else
            ctr = ctx->Yi.d[3];
    }

    (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
    ++ctr;
    if (is_endian.little)
        PUTU32(ctx->Yi.c+12,ctr);
    else
        ctx->Yi.d[3] = ctr;
}
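/*
 * Added note: the two IV cases above follow the GCM specification. A
 * 96-bit IV is used directly, Y0 = IV || 0x00000001, which is why the
 * 12-byte path is the fast path; any other length is first compressed
 * as Y0 = GHASH(IV padded to a 16-byte boundary || 0^64 || [bit length
 * of IV]_64). EK0 = E(K, Y0) is saved to mask the final tag in
 * CRYPTO_gcm128_finish.
 */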
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
{
    size_t i;
    unsigned int n;
    u64 alen = ctx->len.u[0];

    if (ctx->len.u[1]) return -2;

    alen += len;
    if (alen>(U64(1)<<61) || (sizeof(len)==8 && alen<len))
        return -1;
    ctx->len.u[0] = alen;

    n = ctx->ares;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(aad++);
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL(ctx,Xi);
        else {
            ctx->ares = n;
            return 0;
        }
    }

#ifdef GHASH
    if ((i = (len&(size_t)-16))) {
        GHASH(ctx,aad,i);
        aad += i;
        len -= i;
    }
#else
    while (len>=16) {
        for (i=0; i<16; ++i) ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx,Xi);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i=0; i<len; ++i) ctx->Xi.c[i] ^= aad[i];
    }

    ctx->ares = n;
    return 0;
}
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

#if 0
    n = (unsigned int)mlen%16; /* alternative to ctx->mres */
#endif
    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16%sizeof(size_t) == 0) do {    /* always true actually */
        if (n) {
            while (n && len) {
                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
                --len;
                n = (n+1)%16;
            }
            if (n==0) GCM_MUL(ctx,Xi);
            else {
                ctx->mres = n;
                return 0;
            }
        }
#if defined(STRICT_ALIGNMENT)
        if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
            break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
        while (len>=GHASH_CHUNK) {
            size_t j=GHASH_CHUNK;

            while (j) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                j   -= 16;
            }
            GHASH(ctx,out-GHASH_CHUNK,GHASH_CHUNK);
            len -= GHASH_CHUNK;
        }
        if ((i = (len&(size_t)-16))) {
            size_t j=i;

            while (len>=16) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                len -= 16;
            }
            GHASH(ctx,out-j,j);
        }
#else
        while (len>=16) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            for (i=0; i<16; i+=sizeof(size_t))
                *(size_t *)(ctx->Xi.c+i) ^=
                *(size_t *)(out+i) =
                *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
            GCM_MUL(ctx,Xi);
            out += 16;
            in  += 16;
            len -= 16;
        }
#endif
        if (len) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            while (len--) {
                ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
                ++n;
            }
        }

        ctx->mres = n;
        return 0;
    } while(0);
#endif
    for (i=0;i<len;++i) {
        if (n==0) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
        }
        ctx->Xi.c[n] ^= out[i] = in[i]^ctx->EKi.c[n];
        n = (n+1)%16;
        if (n==0)
            GCM_MUL(ctx,Xi);
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16%sizeof(size_t) == 0) do {    /* always true actually */
        if (n) {
            while (n && len) {
                u8 c = *(in++);
                *(out++) = c^ctx->EKi.c[n];
                ctx->Xi.c[n] ^= c;
                --len;
                n = (n+1)%16;
            }
            if (n==0) GCM_MUL (ctx,Xi);
            else {
                ctx->mres = n;
                return 0;
            }
        }
#if defined(STRICT_ALIGNMENT)
        if (((size_t)in|(size_t)out)%sizeof(size_t) != 0)
            break;
#endif
#if defined(GHASH) && defined(GHASH_CHUNK)
        while (len>=GHASH_CHUNK) {
            size_t j=GHASH_CHUNK;

            GHASH(ctx,in,GHASH_CHUNK);
            while (j) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                j   -= 16;
            }
            len -= GHASH_CHUNK;
        }
        if ((i = (len&(size_t)-16))) {
            GHASH(ctx,in,i);
            while (len>=16) {
                (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
                ++ctr;
                if (is_endian.little)
                    PUTU32(ctx->Yi.c+12,ctr);
                else
                    ctx->Yi.d[3] = ctr;
                for (i=0; i<16; i+=sizeof(size_t))
                    *(size_t *)(out+i) =
                    *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
                out += 16;
                in  += 16;
                len -= 16;
            }
        }
#else
        while (len>=16) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            for (i=0; i<16; i+=sizeof(size_t)) {
                size_t c = *(size_t *)(in+i);
                *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
                *(size_t *)(ctx->Xi.c+i) ^= c;
            }
            GCM_MUL(ctx,Xi);
            out += 16;
            in  += 16;
            len -= 16;
        }
#endif
        if (len) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
            while (len--) {
                u8 c = in[n];
                ctx->Xi.c[n] ^= c;
                out[n] = c^ctx->EKi.c[n];
                ++n;
            }
        }

        ctx->mres = n;
        return 0;
    } while(0);
#endif
    for (i=0;i<len;++i) {
        u8 c;
        if (n==0) {
            (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
            ++ctr;
            if (is_endian.little)
                PUTU32(ctx->Yi.c+12,ctr);
            else
                ctx->Yi.d[3] = ctr;
        }
        c = in[i];
        out[i] = c^ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        n = (n+1)%16;
        if (n==0)
            GCM_MUL(ctx,Xi);
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len, ctr128_f stream)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
    if (n) {
        while (n && len) {
            ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL(ctx,Xi);
        else {
            ctx->mres = n;
            return 0;
        }
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    while (len>=GHASH_CHUNK) {
        (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
        ctr += GHASH_CHUNK/16;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx,out,GHASH_CHUNK);
        out += GHASH_CHUNK;
        in  += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#endif
    if ((i = (len&(size_t)-16))) {
        size_t j=i/16;

        (*stream)(in,out,j,ctx->key,ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        in  += i;
        len -= i;
#if defined(GHASH)
        GHASH(ctx,out,i);
        out += i;
#else
        while (j--) {
            for (i=0;i<16;++i) ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx,Xi);
            out += 16;
        }
#endif
    }
    if (len) {
        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
        ++ctr;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
            ctx->Xi.c[n] ^= out[n] = in[n]^ctx->EKi.c[n];
            ++n;
        }
    }

    ctx->mres = n;
    return 0;
}
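/*
 * Added note: the `stream' argument above is a ctr128_f as declared in
 * modes.h, i.e. a counter-mode kernel that processes whole blocks:
 *
 *     void (*ctr128_f)(const unsigned char *in, unsigned char *out,
 *                      size_t blocks, const void *key,
 *                      const unsigned char ivec[16]);
 *
 * This lets a vectorized AES-CTR implementation be dropped in while
 * this file keeps the GHASH bookkeeping and the sub-block leftovers.
 */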
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        const unsigned char *in, unsigned char *out,
        size_t len,ctr128_f stream)
{
    const union { long one; char little; } is_endian = {1};
    unsigned int n, ctr;
    size_t i;
    u64 mlen = ctx->len.u[1];

    mlen += len;
    if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
        GCM_MUL(ctx,Xi);
        ctx->ares = 0;
    }

    if (is_endian.little)
        ctr = GETU32(ctx->Yi.c+12);
    else
        ctr = ctx->Yi.d[3];

    n = ctx->mres;
    if (n) {
        while (n && len) {
            u8 c = *(in++);
            *(out++) = c^ctx->EKi.c[n];
            ctx->Xi.c[n] ^= c;
            --len;
            n = (n+1)%16;
        }
        if (n==0) GCM_MUL (ctx,Xi);
        else {
            ctx->mres = n;
            return 0;
        }
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    while (len>=GHASH_CHUNK) {
        GHASH(ctx,in,GHASH_CHUNK);
        (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
        ctr += GHASH_CHUNK/16;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in  += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
#endif
    if ((i = (len&(size_t)-16))) {
        size_t j=i/16;

#if defined(GHASH)
        GHASH(ctx,in,i);
#else
        while (j--) {
            size_t k;
            for (k=0;k<16;++k) ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx,Xi);
            in += 16;
        }
        j   = i/16;
        in -= i;
#endif
        (*stream)(in,out,j,ctx->key,ctx->Yi.c);
        ctr += (unsigned int)j;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in  += i;
        len -= i;
    }
    if (len) {
        (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
        ++ctr;
        if (is_endian.little)
            PUTU32(ctx->Yi.c+12,ctr);
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
            u8 c = in[n];
            ctx->Xi.c[n] ^= c;
            out[n] = c^ctx->EKi.c[n];
            ++n;
        }
    }

    ctx->mres = n;
    return 0;
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
            size_t len)
{
    const union { long one; char little; } is_endian = {1};
    u64 alen = ctx->len.u[0]<<3;
    u64 clen = ctx->len.u[1]<<3;

    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx,Xi);

    if (is_endian.little) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p)  <<32|GETU32(p+4);
        clen = (u64)GETU32(p+8)<<32|GETU32(p+12);
#endif
    }

    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx,Xi);

    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len<=sizeof(ctx->Xi))
        return memcmp(ctx->Xi.c,tag,len);
    else
        return -1;
}
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c, len<=sizeof(ctx->Xi.c)?len:sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = (GCM128_CONTEXT *)OPENSSL_malloc(sizeof(GCM128_CONTEXT))))
        CRYPTO_gcm128_init(ret,key,block);

    return ret;
}
void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    if (ctx) {
        OPENSSL_cleanse(ctx,sizeof(*ctx));
        OPENSSL_free(ctx);
    }
}
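/*
 * Added usage sketch (not part of the original file): one-shot
 * AES-128-GCM sealing with the API above. Error handling and the AES
 * include are elided; `key', `iv', `aad' and friends are illustrative
 * names.
 */
#if 0   /* example only */
    AES_KEY aes;
    GCM128_CONTEXT ctx;
    unsigned char tag[16];

    AES_set_encrypt_key(key, 128, &aes);
    CRYPTO_gcm128_init(&ctx, &aes, (block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&ctx, iv, 12);      /* 12-byte IV is the fast path */
    CRYPTO_gcm128_aad(&ctx, aad, aad_len);  /* all AAD before any data     */
    CRYPTO_gcm128_encrypt(&ctx, pt, ct, pt_len);
    CRYPTO_gcm128_tag(&ctx, tag, sizeof(tag));
    /* receiver side: CRYPTO_gcm128_finish(&ctx, tag, 16) == 0 on success */
#endif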
#if defined(SELFTEST)
#include <stdio.h>
#include <openssl/aes.h>
/* Test Case 1 */
static const u8 K1[16],
        *P1=NULL,
        *A1=NULL,
        IV1[12],
        *C1=NULL,
        T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
/* Test Case 2 */
#define K2 K1
#define A2 A1
#define IV2 IV1
static const u8 P2[16],
        C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
        T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
/* Test Case 3 */
#define A3 A2
static const u8 K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
        P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
        T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
/* Test Case 4 */
#define K4 K3
#define IV4 IV3
static const u8 P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                0xab,0xad,0xda,0xd2},
        C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
                0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
                0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
                0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
        T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
/* Test Case 5 */
#define K5 K4
#define P5 P4
static const u8 A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                0xab,0xad,0xda,0xd2},
        IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
                0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
                0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
                0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
        T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
/* Test Case 6 */
#define K6 K5
#define P6 P5
#define A6 A5
static const u8 IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
                0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
                0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
                0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
        T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
/* Test Case 7 */
static const u8 K7[24],
        *P7=NULL,
        *A7=NULL,
        IV7[12],
        *C7=NULL,
        T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
/* Test Case 8 */
#define K8 K7
#define IV8 IV7
#define A8 A7
static const u8 P8[16],
        C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
        T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
/* Test Case 9 */
#define A9 A8
static const u8 K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
        P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
        T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
/* Test Case 10 */
#define K10 K9
#define IV10 IV9
static const u8 P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                0xab,0xad,0xda,0xd2},
        C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
                0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
                0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
                0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
        T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
/* Test Case 11 */
#define K11 K10
#define P11 P10
#define A11 A10
static const u8 IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
                0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
                0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
                0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
        T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
/* Test Case 12 */
#define K12 K11
#define P12 P11
#define A12 A11
static const u8 IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
                0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
                0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
                0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
        T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
/* Test Case 13 */
static const u8 K13[32],
        *P13=NULL,
        *A13=NULL,
        IV13[12],
        *C13=NULL,
        T13[]= {0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
/* Test Case 14 */
#define K14 K13
#define A14 A13
static const u8 P14[16],
        IV14[12],
        C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
        T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
/* Test Case 15 */
#define A15 A14
static const u8 K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
                0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
        P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
        IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
        C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
        T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
/* Test Case 16 */
#define K16 K15
#define IV16 IV15
static const u8 P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
                0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
                0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
                0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
        A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
                0xab,0xad,0xda,0xd2},
        C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
                0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
                0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
                0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
        T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
/* Test Case 17 */
#define K17 K16
#define P17 P16
#define A17 A16
static const u8 IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
        C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
                0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
                0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
                0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
        T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
/* Test Case 18 */
#define K18 K17
#define P18 P17
#define A18 A17
static const u8 IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
                0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
                0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
                0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
        C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
                0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
                0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
                0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
        T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
#define TEST_CASE(n)    do {                                        \
    u8 out[sizeof(P##n)];                                           \
    AES_set_encrypt_key(K##n,sizeof(K##n)*8,&key);                  \
    CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);          \
    CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
    memset(out,0,sizeof(out));                                      \
    if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));            \
    if (P##n) CRYPTO_gcm128_encrypt(&ctx,P##n,out,sizeof(out));     \
    if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
        (C##n && memcmp(out,C##n,sizeof(out))))                     \
        ret++, printf ("encrypt test#%d failed.\n",n);              \
    CRYPTO_gcm128_setiv(&ctx,IV##n,sizeof(IV##n));                  \
    memset(out,0,sizeof(out));                                      \
    if (A##n) CRYPTO_gcm128_aad(&ctx,A##n,sizeof(A##n));            \
    if (C##n) CRYPTO_gcm128_decrypt(&ctx,C##n,out,sizeof(out));     \
    if (CRYPTO_gcm128_finish(&ctx,T##n,16) ||                       \
        (P##n && memcmp(out,P##n,sizeof(out))))                     \
        ret++, printf ("decrypt test#%d failed.\n",n);              \
    } while(0)
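/*
 * Reconstructed driver (the original main() between the TEST_CASE
 * macro and the benchmark block below is elided in this copy; this
 * sketch follows what the macro and the benchmark code expect):
 */
int main()
{
    GCM128_CONTEXT ctx;
    AES_KEY key;
    int ret=0;

    TEST_CASE(1);   TEST_CASE(2);   TEST_CASE(3);
    TEST_CASE(4);   TEST_CASE(5);   TEST_CASE(6);
    TEST_CASE(7);   TEST_CASE(8);   TEST_CASE(9);
    TEST_CASE(10);  TEST_CASE(11);  TEST_CASE(12);
    TEST_CASE(13);  TEST_CASE(14);  TEST_CASE(15);
    TEST_CASE(16);  TEST_CASE(17);  TEST_CASE(18);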
#ifdef OPENSSL_CPUID_OBJ
    {
    size_t start,stop,gcm_t,ctr_t,OPENSSL_rdtsc();
    union { u64 u; u8 c[1024]; } buf;
    int i;

    AES_set_encrypt_key(K1,sizeof(K1)*8,&key);
    CRYPTO_gcm128_init(&ctx,&key,(block128_f)AES_encrypt);
    CRYPTO_gcm128_setiv(&ctx,IV1,sizeof(IV1));

    CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
    start = OPENSSL_rdtsc();
    CRYPTO_gcm128_encrypt(&ctx,buf.c,buf.c,sizeof(buf));
    gcm_t = OPENSSL_rdtsc() - start;

    CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
            &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
            (block128_f)AES_encrypt);
    start = OPENSSL_rdtsc();
    CRYPTO_ctr128_encrypt(buf.c,buf.c,sizeof(buf),
            &key,ctx.Yi.c,ctx.EKi.c,&ctx.mres,
            (block128_f)AES_encrypt);
    ctr_t = OPENSSL_rdtsc() - start;

    printf("%.2f-%.2f=%.2f\n",
            gcm_t/(double)sizeof(buf),
            ctr_t/(double)sizeof(buf),
            (gcm_t-ctr_t)/(double)sizeof(buf));

#ifdef GHASH
    GHASH(&ctx,buf.c,sizeof(buf));
    start = OPENSSL_rdtsc();
    for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
    gcm_t = OPENSSL_rdtsc() - start;
    printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
#endif
    }
#endif

    return ret;
}
#endif