/*
 * Copyright 2010-2021 The OpenSSL Project Authors. All Rights Reserved.
 *
 * Licensed under the Apache License 2.0 (the "License").  You may not use
 * this file except in compliance with the License.  You can obtain a copy
 * in the file LICENSE in the source distribution or at
 * https://www.openssl.org/source/license.html
 */
#include <openssl/crypto.h>
#include "internal/cryptlib.h"
#include "internal/endian.h"
#include "crypto/modes.h"
#if defined(__GNUC__) && !defined(STRICT_ALIGNMENT)
typedef size_t size_t_aX __attribute((__aligned__(1)));
#else
typedef size_t size_t_aX;
#endif
#if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
/* redefine, because alignment is ensured */
# undef  GETU32
# define GETU32(p)       BSWAP4(*(const u32 *)(p))
# undef  PUTU32
# define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
#endif
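
/*
 * Added note (not part of the original build): GETU32/PUTU32 read and
 * write a 32-bit value in big-endian byte order.  The generic fallbacks
 * in the modes headers are byte-by-byte, roughly equivalent to this
 * sketch:
 *
 *     u32 getu32(const u8 *p)
 *     {
 *         return (u32)p[0] << 24 | (u32)p[1] << 16 |
 *                (u32)p[2] << 8  | (u32)p[3];
 *     }
 *
 * The BSWAP4 form above is a faster single-load equivalent that is only
 * safe when unaligned (or suitably aligned) access is guaranteed.
 */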
#define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
#define REDUCE1BIT(V)   do { \
        if (sizeof(size_t)==8) { \
                u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^T; \
        } else { \
                u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
                V.lo  = (V.hi<<63)|(V.lo>>1); \
                V.hi  = (V.hi>>1 )^((u64)T<<32); \
        } \
} while(0)
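
/*
 * Added background sketch (not part of the original build): REDUCE1BIT is
 * one right-shift step of a carry-less multiplication in GCM's reflected
 * bit order.  When the bit shifted out of V.lo is 1, the value is reduced
 * modulo the GCM polynomial x^128 + x^7 + x^2 + x + 1, whose low terms
 * appear as the constant 0xE1 in the top byte.  A branchy equivalent,
 * using the hypothetical local type u128_t:
 *
 *     typedef struct { u64 hi, lo; } u128_t;
 *
 *     static void reduce1bit(u128_t *V)
 *     {
 *         u64 dropped = V->lo & 1;               // bit about to fall off
 *         V->lo = (V->hi << 63) | (V->lo >> 1);  // 128-bit shift right
 *         V->hi = (V->hi >> 1) ^ (dropped ? U64(0xe100000000000000) : 0);
 *     }
 *
 * The macro avoids the branch by using (0 - bit) as an all-ones or
 * all-zeroes mask, which keeps the reduction constant-time.
 */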
/*
 * Even though the permitted values for TABLE_BITS are 8, 4 and 1, it
 * should never be set to 8: 8 is effectively reserved for testing
 * purposes.  TABLE_BITS>1 selects the lookup-table-driven implementations
 * referred to as "Shoup's" in the GCM specification; in other words
 * OpenSSL does not cover the whole spectrum of possible table-driven
 * implementations.  Why?  In the non-"Shoup's" case the memory access
 * pattern is segmented in such a manner that it is trivial to see that
 * cache-timing information can reveal a fair portion of the intermediate
 * hash value.  Given that the ciphertext is always available to an
 * attacker, the attacker can attempt to deduce the secret parameter H
 * and, if successful, tamper with messages [which is nothing but trivial
 * in CTR mode].  In the "Shoup's" case it is not as trivial, but there is
 * no reason to believe the approach is resistant to cache-timing attacks
 * either.  The thing about the "8-bit" implementation is that it consumes
 * 16 (sixteen) times more memory, 4KB per individual key + 1KB shared.
 * On the plus side it should be about twice as fast as the "4-bit"
 * version, and for gcc-generated x86[_64] code the "8-bit" version was
 * observed to run ~75% faster, closer to 100% for commercial
 * compilers...  Yet the "4-bit" procedure is preferred, because it is
 * believed to provide a better security-performance balance and adequate
 * all-round performance.  "All-round" refers to things like:
 *
 * - shorter setup time effectively improves overall timing for
 *   handling short messages;
 * - larger table allocation can become unbearable because of VM
 *   subsystem penalties (for example on Windows a large enough free
 *   results in VM working set trimming, meaning that a subsequent
 *   malloc would immediately incur working set expansion);
 * - a larger table has a larger cache footprint, which can affect the
 *   performance of other code paths (not necessarily even from the
 *   same thread in a Hyper-Threading world);
 *
 * A value of 1 is not appropriate, for performance reasons.
 */
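
/*
 * Added sizing note: the "16 times more memory" above is straightforward
 * arithmetic.  An 8-bit table is u128 Htable[256], i.e. 256 * 16 = 4096
 * bytes per key, plus a shared rem_8bit[256] of size_t entries (1KB on
 * 32-bit targets, 2KB on 64-bit ones).  The 4-bit variant needs only
 * u128 Htable[16] = 256 bytes per key plus a 16-entry rem_4bit table.
 */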
#if TABLE_BITS==8

static void gcm_init_8bit(u128 Htable[256], u64 H[2])
    /* populate the power-of-two entries by repeated REDUCE1BIT "halving" */
    for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    /* every other entry is an XOR combination of power-of-two entries */
    for (i = 2; i < 256; i <<= 1) {
        u128 *Hi = Htable + i, H0 = *Hi;

        for (j = 1; j < i; ++j) {
            Hi[j].hi = H0.hi ^ Htable[j].hi;
            Hi[j].lo = H0.lo ^ Htable[j].lo;
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
    const u8 *xi = (const u8 *)Xi + 15;
    static const size_t rem_8bit[256] = {
        PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
        PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
        PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
        PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
        PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
        PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
        PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
        PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
        PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
        PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
        PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
        PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
        PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
        PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
        PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
        PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
        PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
        PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
        PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
        PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
        PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
        PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
        PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
        PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
        PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
        PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
        PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
        PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
        PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
        PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
        PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
        PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
        PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
        PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
        PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
        PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
        PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
        PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
        PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
        PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
        PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
        PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
        PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
        PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
        PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
        PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
        PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
        PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
        PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
        PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
        PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
        PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
        PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
        PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
        PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
        PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
        PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
        PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
        PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
        PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
        PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
        PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
        PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
        PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
    };
        Z.hi ^= Htable[n].hi;
        Z.lo ^= Htable[n].lo;

        rem = (size_t)Z.lo & 0xff;
        Z.lo = (Z.hi << 56) | (Z.lo >> 8);
        Z.hi = (Z.hi >> 8);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_8bit[rem];
        else
            Z.hi ^= (u64)rem_8bit[rem] << 32;

    if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# endif
# define GCM_MUL(ctx)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
#elif TABLE_BITS==4

static void gcm_init_4bit(u128 Htable[16], u64 H[2])
# if defined(OPENSSL_SMALL_FOOTPRINT)
    int i;
# endif

# if defined(OPENSSL_SMALL_FOOTPRINT)
    /* populate the power-of-two entries by repeated REDUCE1BIT "halving" */
    for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
        REDUCE1BIT(V);
        Htable[i] = V;
    }

    for (i = 2; i < 16; i <<= 1) {
        u128 *Hi = Htable + i;

        for (V = *Hi, j = 1; j < i; ++j) {
            Hi[j].hi = V.hi ^ Htable[j].hi;
            Hi[j].lo = V.lo ^ Htable[j].lo;
        }
    }
# else
    /* fully unrolled variant of the same combination scheme */
    Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
    V = Htable[4];
    Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
    Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
    Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
    V = Htable[8];
    Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
    Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
    Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
    Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
    Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
    Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
    Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
# endif
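
    /*
     * Added note: after initialization Htable[n] holds the GF(2^128)
     * product n * H, with the 4-bit index n taken in GCM's reflected bit
     * order (Htable[8] is H itself, Htable[4] is H*x, and so on).  That
     * is what lets gcm_gmult_4bit() below process Xi one nibble at a
     * time with a single table lookup per nibble.
     */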
# if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
    /* ARM assembler expects a specific dword order in Htable. */
    if (IS_LITTLE_ENDIAN)
        for (j = 0; j < 16; ++j) {
            V = Htable[j];
            Htable[j].hi = V.lo;
            Htable[j].lo = V.hi;
    } else
        for (j = 0; j < 16; ++j) {
            V = Htable[j];
            Htable[j].hi = V.lo << 32 | V.lo >> 32;
            Htable[j].lo = V.hi << 32 | V.hi >> 32;
        }
# endif
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
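
/*
 * Added note: rem_4bit[n] is the reduction contribution of the four bits
 * that fall off the low end when Z is shifted right by 4.  The table is
 * XOR-linear in n: rem_4bit[1] = 0x1C20, rem_4bit[2] = 0x3840,
 * rem_4bit[4] = 0x7080, rem_4bit[8] = 0xE100, and every other entry is
 * the XOR of those basis values (e.g. rem_4bit[3] = 0x1C20 ^ 0x3840 =
 * 0x2460).  PACK() positions the 16-bit value at the top of a size_t so
 * it can be XORed directly into the high word of Z.
 */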
static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
    size_t rem, nlo, nhi;

    nlo = ((const u8 *)Xi)[15];

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        nlo = ((const u8 *)Xi)[cnt];

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

    if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# endif
# if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_mult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details...  Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64].  It's here
 * mostly as a reference and a placeholder for possible future
 * non-trivial optimization[s]...
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
    size_t rem, nlo, nhi;

    nlo = ((const u8 *)Xi)[15];

    Z.hi = Htable[nlo].hi;
    Z.lo = Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;

        nlo = ((const u8 *)Xi)[cnt];

        rem = (size_t)Z.lo & 0xf;
        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);
        if (sizeof(size_t) == 8)
            Z.hi ^= rem_4bit[rem];
        else
            Z.hi ^= (u64)rem_4bit[rem] << 32;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;
    /*
     * Extra 256+16 bytes per key plus 512 bytes of shared tables
     * [should] give a ~50% improvement...  One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize the
     * cache footprint.
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
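
    /*
     * Added sizing note: the "256+16 bytes per key" above is Hshr4
     * (16 * 16 = 256 bytes) plus Hshl4 (16 bytes); the "512 bytes
     * shared" is rem_8bit, 256 entries of 2 bytes each.  Keeping
     * rem_8bit as unsigned short rather than PACK()-ed size_t halves
     * (or quarters) its cache footprint.
     */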
    /*
     * This pre-processing phase slows the procedure down by approximately
     * the same amount of time as it makes each loop spin faster.  In
     * other words single-block performance is approximately the same as
     * for the straightforward "4-bit" implementation, and from there it
     * only gets faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
        if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
# else
            v = (u32)(Z.hi >> 32);
            v = (u32)(Z.lo >> 32);
# endif
        }
    } while (inp += 16, len -= 16);
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);

# define GCM_MUL(ctx)      gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
# if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
#  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
/*
 * GHASH_CHUNK is a "stride parameter" whose mission is to mitigate
 * cache-thrashing effects.  In other words, the idea is to hash data
 * while it's still in L1 cache after the encryption pass...
 */
#  define GHASH_CHUNK       (3*1024)
# endif
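
/*
 * Added note: 3KB is small enough that a chunk of freshly produced
 * ciphertext, the GHASH tables and the working set all fit in a typical
 * L1 data cache.  The bulk loops below therefore run in roughly this
 * pattern (hypothetical outline only):
 *
 *     while (len >= GHASH_CHUNK) {
 *         // CTR-encrypt GHASH_CHUNK bytes into out, then hash them
 *         // while they are still cache-hot:
 *         GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
 *         len -= GHASH_CHUNK;
 *     }
 */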
#else                           /* TABLE_BITS */
static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
    u128 V, Z = { 0, 0 };
    const long *xi = (const long *)Xi;

    V.hi = H[0];                /* H is in host byte order, no byte swapping */
    V.lo = H[1];

    for (j = 0; j < 16 / sizeof(long); ++j) {
        if (IS_LITTLE_ENDIAN) {
            if (sizeof(long) == 8) {
# ifdef BSWAP8
                X = (long)(BSWAP8(xi[j]));
# else
                const u8 *p = (const u8 *)(xi + j);
                X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
# endif
            } else {
                const u8 *p = (const u8 *)(xi + j);
                X = (long)GETU32(p);
            }
        } else
            X = xi[j];

        /* bit-by-bit multiply-accumulate: Z ^= V for each set bit of X */
        for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
            u64 M = (u64)(X >> (8 * sizeof(long) - 1));
            Z.hi ^= V.hi & M;
            Z.lo ^= V.lo & M;

            REDUCE1BIT(V);
        }
    }

    if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
        Xi[0] = BSWAP8(Z.hi);
        Xi[1] = BSWAP8(Z.lo);
# else
        v = (u32)(Z.hi >> 32);
        v = (u32)(Z.lo >> 32);
# endif
    }

# define GCM_MUL(ctx)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)

#endif                          /* TABLE_BITS */
#if TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if !defined(I386_ONLY) && \
    (defined(__i386) || defined(__i386__) || \
     defined(__x86_64) || defined(__x86_64__) || \
     defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "crypto/sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "crypto/ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
#ifdef GCM_FUNCREF_4BIT
# undef  GCM_MUL
# define GCM_MUL(ctx)           (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
# ifdef GHASH
#  undef  GHASH
#  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
# endif
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
    memset(ctx, 0, sizeof(*ctx));

    /* H = E(K, 0^128): encrypt the all-zero block with the raw cipher */
    (*block) (ctx->H.c, ctx->H.c, key);

    if (IS_LITTLE_ENDIAN) {
        /* H is stored in host byte order */
# ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
# else
        u8 *p = ctx->H.c;
        u64 hi, lo;

        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
# endif
    }

#if TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif TABLE_BITS==4
# if defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if defined(GHASH_ASM_X86_OR_64)
#  if !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if defined(GHASH_ASM_X86)    /* x86 only */
#   if defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif defined(GHASH_ASM_ARM)
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
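
/*
 * Added usage sketch (not part of the original file): CRYPTO_gcm128_init()
 * only needs a raw block cipher.  With AES it is typically wired up like
 * this (error handling omitted):
 *
 *     #include <openssl/aes.h>
 *
 *     AES_KEY aes;
 *     GCM128_CONTEXT gcm;
 *
 *     AES_set_encrypt_key(key, 128, &aes);
 *     CRYPTO_gcm128_init(&gcm, &aes, (block128_f)AES_encrypt);
 *
 * The context keeps the key pointer, derives H = E(K, 0^128) and
 * precomputes the GHASH tables once per key.
 */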
void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
                         size_t len)
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#endif

    ctx->len.u[0] = 0;          /* AAD length */
    ctx->len.u[1] = 0;          /* message length */
    ctx->ares = 0;
    ctx->mres = 0;

    if (len == 12) {
        /* a 96-bit IV forms the counter block directly */
        memcpy(ctx->Yi.c, iv, 12);
    } else {
        /* Borrow ctx->Xi to calculate initial Yi */
        while (len >= 16) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
            iv += 16;
            len -= 16;
        }
        if (len) {
            for (i = 0; i < len; ++i)
                ctx->Xi.c[i] ^= iv[i];
            GCM_MUL(ctx);
        }
        len0 <<= 3;
        if (IS_LITTLE_ENDIAN) {
# ifdef BSWAP8
            ctx->Xi.u[1] ^= BSWAP8(len0);
# else
            ctx->Xi.c[8] ^= (u8)(len0 >> 56);
            ctx->Xi.c[9] ^= (u8)(len0 >> 48);
            ctx->Xi.c[10] ^= (u8)(len0 >> 40);
            ctx->Xi.c[11] ^= (u8)(len0 >> 32);
            ctx->Xi.c[12] ^= (u8)(len0 >> 24);
            ctx->Xi.c[13] ^= (u8)(len0 >> 16);
            ctx->Xi.c[14] ^= (u8)(len0 >> 8);
            ctx->Xi.c[15] ^= (u8)(len0);
# endif
        } else {
            ctx->Xi.u[1] ^= len0;
        }

        GCM_MUL(ctx);
        if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
            ctr = BSWAP4(ctx->Xi.d[3]);
# else
            ctr = GETU32(ctx->Xi.c + 12);
# endif
        else
            ctr = ctx->Xi.d[3];

        /* Copy borrowed Xi to Yi */
        ctx->Yi.u[0] = ctx->Xi.u[0];
        ctx->Yi.u[1] = ctx->Xi.u[1];
    }

    /* EK0 = E(K, Y0) masks the final GHASH value in the tag */
    (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
    ++ctr;
    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctx->Yi.d[3] = BSWAP4(ctr);
# else
        PUTU32(ctx->Yi.c + 12, ctr);
# endif
    else
        ctx->Yi.d[3] = ctr;
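
/*
 * Added note: the IV handling above follows the GCM specification.  For
 * the common 96-bit IV the initial counter block is simply
 *
 *     Y0 = IV || 0^31 || 1
 *
 * while any other IV length is compressed through the hash:
 *
 *     Y0 = GHASH_H(IV || 0* || [len(IV)]_64)
 *
 * which is why ctx->Xi is borrowed and multiplied by H above.
 */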
int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
                      size_t len)
    u64 alen = ctx->len.u[0];
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    /* the AAD cap of 2^61 bytes corresponds to 2^64 bits */
    alen += len;
    if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
        return -1;
    ctx->len.u[0] = alen;

    /* complete a partial block left over from a previous call */
    while (n && len) {
        ctx->Xi.c[n] ^= *(aad++);
        --len;
        n = (n + 1) % 16;
    }

#ifdef GHASH
    if ((i = (len & (size_t)-16))) {
        GHASH(ctx, aad, i);
        aad += i;
        len -= i;
    }
#else
    while (len >= 16) {
        for (i = 0; i < 16; ++i)
            ctx->Xi.c[i] ^= aad[i];
        GCM_MUL(ctx);
        aad += 16;
        len -= 16;
    }
#endif
    if (len) {
        n = (unsigned int)len;
        for (i = 0; i < len; ++i)
            ctx->Xi.c[i] ^= aad[i];
    }
int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    /* message length is capped at 2^36 - 32 bytes, per the GCM spec */
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            /*
             * Carry-over handling: with GHASH the bytes are collected in
             * the ctx->Xn window and hashed once complete, otherwise they
             * are folded into Xi immediately:
             */
                ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
                GHASH(ctx, ctx->Xn, mres);
                ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
            /* misaligned buffers fall through to the byte loop below */
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                /* hash the chunk while it is still hot in L1 cache */
                GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                size_t j = i;

                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
                GHASH(ctx, out - j, j);
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                /* no streamed GHASH: fold ciphertext into Xi as we go */
                for (i = 0; i < 16 / sizeof(size_t); ++i)
                    ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
                while (len--) {
# if defined(GHASH)
                    ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
                    ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
                    ++n;
                }
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
# else
                PUTU32(ctx->Yi.c + 12, ctr);
# endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        ctx->Xn[mres++] = out[i] = in[i] ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
        mres = n = (n + 1) % 16;
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                          const unsigned char *in, unsigned char *out,
                          size_t len)
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    block128_f block = ctx->block;
    void *key = ctx->key;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

    /* message length is capped at 2^36 - 32 bytes, per the GCM spec */
    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;
    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
#else
        GCM_MUL(ctx);
#endif
        ctx->ares = 0;
    }

    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    n = mres % 16;
#if !defined(OPENSSL_SMALL_FOOTPRINT)
    if (16 % sizeof(size_t) == 0) { /* always true actually */
        do {
            /*
             * Carry-over handling: for decryption the ciphertext byte
             * itself is what gets hashed, either via the ctx->Xn window
             * or directly into Xi:
             */
                *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
                GHASH(ctx, ctx->Xn, mres);
                *(out++) = c ^ ctx->EKi.c[n];
# if defined(STRICT_ALIGNMENT)
            /* misaligned buffers fall through to the byte loop below */
            if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
                break;
# endif
# if defined(GHASH)
            if (len >= 16 && mres) {
                GHASH(ctx, ctx->Xn, mres);
                mres = 0;
            }
#  if defined(GHASH_CHUNK)
            while (len >= GHASH_CHUNK) {
                size_t j = GHASH_CHUNK;

                /* hash the ciphertext before it is consumed */
                GHASH(ctx, in, GHASH_CHUNK);
                while (j) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    j -= 16;
                }
                len -= GHASH_CHUNK;
            }
#  endif
            if ((i = (len & (size_t)-16))) {
                GHASH(ctx, in, i);
                while (len >= 16) {
                    size_t_aX *out_t = (size_t_aX *)out;
                    const size_t_aX *in_t = (const size_t_aX *)in;

                    (*block) (ctx->Yi.c, ctx->EKi.c, key);
                    ++ctr;
                    if (IS_LITTLE_ENDIAN)
#   ifdef BSWAP4
                        ctx->Yi.d[3] = BSWAP4(ctr);
#   else
                        PUTU32(ctx->Yi.c + 12, ctr);
#   endif
                    else
                        ctx->Yi.d[3] = ctr;
                    for (i = 0; i < 16 / sizeof(size_t); ++i)
                        out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                    out += 16;
                    in += 16;
                    len -= 16;
                }
            }
# else
            while (len >= 16) {
                size_t *out_t = (size_t *)out;
                const size_t *in_t = (const size_t *)in;

                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
#  else
                    PUTU32(ctx->Yi.c + 12, ctr);
#  endif
                else
                    ctx->Yi.d[3] = ctr;
                /* hash the ciphertext word, then decrypt it */
                for (i = 0; i < 16 / sizeof(size_t); ++i) {
                    size_t c = in_t[i];
                    out_t[i] = c ^ ctx->EKi.t[i];
                    ctx->Xi.t[i] ^= c;
                }
                GCM_MUL(ctx);
                out += 16;
                in += 16;
                len -= 16;
            }
# endif
            if (len) {
                (*block) (ctx->Yi.c, ctx->EKi.c, key);
                ++ctr;
                if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                    ctx->Yi.d[3] = BSWAP4(ctr);
# else
                    PUTU32(ctx->Yi.c + 12, ctr);
# endif
                else
                    ctx->Yi.d[3] = ctr;
                while (len--) {
# if defined(GHASH)
                    out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
                    u8 c = in[n];
                    ctx->Xi.c[n] ^= c;
                    out[n] = c ^ ctx->EKi.c[n];
# endif
                    ++n;
                }
            }

            ctx->mres = mres;
            return 0;
        } while (0);
    }
#endif
    for (i = 0; i < len; ++i) {
        u8 c;

        if (n == 0) {
            (*block) (ctx->Yi.c, ctx->EKi.c, key);
            ++ctr;
            if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
                ctx->Yi.d[3] = BSWAP4(ctr);
# else
                PUTU32(ctx->Yi.c + 12, ctr);
# endif
            else
                ctx->Yi.d[3] = ctr;
        }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        out[i] = (ctx->Xn[mres++] = c = in[i]) ^ ctx->EKi.c[n];
        n = (n + 1) % 16;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, sizeof(ctx->Xn));
            mres = 0;
        }
#else
        c = in[i];
        out[i] = c ^ ctx->EKi.c[n];
        ctx->Xi.c[n] ^= c;
        mres = n = (n + 1) % 16;
#endif
    }

    ctx->mres = mres;
    return 0;
}
int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_encrypt(ctx, in, out, len);
#else
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to encrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    /* flush a carried-over partial block first */
            ctx->Xn[mres++] = *(out++) = *(in++) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
            ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        /* the stream routine advances the counter block internally */
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        GHASH(ctx, out, GHASH_CHUNK);
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        in += i;
        len -= i;
# if defined(GHASH)
        GHASH(ctx, out, i);
        out += i;
# else
        while (j--) {
            for (i = 0; i < 16; ++i)
                ctx->Xi.c[i] ^= out[i];
            GCM_MUL(ctx);
            out += 16;
        }
# endif
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            ctx->Xn[mres++] = out[n] = in[n] ^ ctx->EKi.c[n];
# else
            ctx->Xi.c[mres++] ^= out[n] = in[n] ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                                const unsigned char *in, unsigned char *out,
                                size_t len, ctr128_f stream)
{
#if defined(OPENSSL_SMALL_FOOTPRINT)
    return CRYPTO_gcm128_decrypt(ctx, in, out, len);
#else
    unsigned int n, ctr, mres;
    u64 mlen = ctx->len.u[1];
    void *key = ctx->key;
# ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
#  ifdef GHASH
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
#  endif
# endif

    mlen += len;
    if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
        return -1;
    ctx->len.u[1] = mlen;

    if (ctx->ares) {
        /* First call to decrypt finalizes GHASH(AAD) */
# if defined(GHASH)
        memcpy(ctx->Xn, ctx->Xi.c, sizeof(ctx->Xi));
        ctx->Xi.u[0] = 0;
        ctx->Xi.u[1] = 0;
        mres = sizeof(ctx->Xi);
# else
        GCM_MUL(ctx);
# endif
        ctx->ares = 0;
    }
    if (IS_LITTLE_ENDIAN)
# ifdef BSWAP4
        ctr = BSWAP4(ctx->Yi.d[3]);
# else
        ctr = GETU32(ctx->Yi.c + 12);
# endif
    else
        ctr = ctx->Yi.d[3];

    /* flush a carried-over partial block first */
            *(out++) = (ctx->Xn[mres++] = *(in++)) ^ ctx->EKi.c[n];
            GHASH(ctx, ctx->Xn, mres);
            *(out++) = c ^ ctx->EKi.c[n];
    if (len >= 16 && mres) {
        GHASH(ctx, ctx->Xn, mres);
        mres = 0;
    }
# if defined(GHASH_CHUNK)
    while (len >= GHASH_CHUNK) {
        /* hash the ciphertext first, then decrypt it */
        GHASH(ctx, in, GHASH_CHUNK);
        (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
        ctr += GHASH_CHUNK / 16;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        out += GHASH_CHUNK;
        in += GHASH_CHUNK;
        len -= GHASH_CHUNK;
    }
# endif
    if ((i = (len & (size_t)-16))) {
        size_t j = i / 16;

# if defined(GHASH)
        GHASH(ctx, in, i);
# else
        while (j--) {
            size_t k;

            for (k = 0; k < 16; ++k)
                ctx->Xi.c[k] ^= in[k];
            GCM_MUL(ctx);
            in += 16;
        }
        j = i / 16;
        in -= i;
# endif
        (*stream) (in, out, j, key, ctx->Yi.c);
        ctr += (unsigned int)j;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        out += i;
        in += i;
        len -= i;
    }
    if (len) {
        (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
        ++ctr;
        if (IS_LITTLE_ENDIAN)
#  ifdef BSWAP4
            ctx->Yi.d[3] = BSWAP4(ctr);
#  else
            PUTU32(ctx->Yi.c + 12, ctr);
#  endif
        else
            ctx->Yi.d[3] = ctr;
        while (len--) {
# if defined(GHASH)
            out[n] = (ctx->Xn[mres++] = in[n]) ^ ctx->EKi.c[n];
# else
            u8 c = in[n];
            ctx->Xi.c[mres++] ^= c;
            out[n] = c ^ ctx->EKi.c[n];
# endif
            ++n;
        }
    }

    ctx->mres = mres;
    return 0;
#endif
}
int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
                         size_t len)
    u64 alen = ctx->len.u[0] << 3;
    u64 clen = ctx->len.u[1] << 3;
#ifdef GCM_FUNCREF_4BIT
    void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
# if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
                         const u8 *inp, size_t len) = ctx->ghash;
# endif
#endif

#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    u128 bitlen;
    unsigned int mres = ctx->mres;

    if (mres) {
        unsigned blocks = (mres + 15) & -16;

        /* zero-pad the pending bytes and flush complete blocks */
        memset(ctx->Xn + mres, 0, blocks - mres);
        mres = blocks;
        if (mres == sizeof(ctx->Xn)) {
            GHASH(ctx, ctx->Xn, mres);
            mres = 0;
        }
    } else if (ctx->ares) {
        GCM_MUL(ctx);
        ctx->ares = 0;
    }
#else
    if (ctx->mres || ctx->ares)
        GCM_MUL(ctx);
#endif

    if (IS_LITTLE_ENDIAN) {
#ifdef BSWAP8
        alen = BSWAP8(alen);
        clen = BSWAP8(clen);
#else
        u8 *p = ctx->len.c;

        ctx->len.u[0] = alen;
        ctx->len.u[1] = clen;

        alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
        clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
#endif
    }
#if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
    bitlen.hi = alen;
    bitlen.lo = clen;
    memcpy(ctx->Xn + mres, &bitlen, sizeof(bitlen));
    mres += sizeof(bitlen);
    GHASH(ctx, ctx->Xn, mres);
#else
    ctx->Xi.u[0] ^= alen;
    ctx->Xi.u[1] ^= clen;
    GCM_MUL(ctx);
#endif

    /* T = GHASH(H, AAD, C) ^ E(K, Y0) */
    ctx->Xi.u[0] ^= ctx->EK0.u[0];
    ctx->Xi.u[1] ^= ctx->EK0.u[1];

    if (tag && len <= sizeof(ctx->Xi))
        return CRYPTO_memcmp(ctx->Xi.c, tag, len);
    else
        return -1;
}
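
/*
 * Added note: the final tag is T = MSB_len(GHASH(H, AAD, C) XOR E(K, Y0)).
 * CRYPTO_memcmp() performs the comparison in constant time, and callers
 * should treat any non-zero return as authentication failure without
 * releasing the decrypted data.
 */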
void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
{
    CRYPTO_gcm128_finish(ctx, NULL, 0);
    memcpy(tag, ctx->Xi.c,
           len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
}
GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
{
    GCM128_CONTEXT *ret;

    if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
        CRYPTO_gcm128_init(ret, key, block);

    return ret;
}

void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
{
    OPENSSL_clear_free(ctx, sizeof(*ctx));
}
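
/*
 * Added end-to-end usage sketch (not part of the original file); all
 * names are standard OpenSSL APIs, error handling omitted for brevity:
 *
 *     AES_KEY aes;
 *     GCM128_CONTEXT *gcm;
 *     unsigned char tag[16];
 *
 *     AES_set_encrypt_key(key, 128, &aes);
 *     gcm = CRYPTO_gcm128_new(&aes, (block128_f)AES_encrypt);
 *     CRYPTO_gcm128_setiv(gcm, iv, iv_len);
 *     CRYPTO_gcm128_aad(gcm, aad, aad_len);
 *     CRYPTO_gcm128_encrypt(gcm, plaintext, ciphertext, pt_len);
 *     CRYPTO_gcm128_tag(gcm, tag, sizeof(tag));
 *     CRYPTO_gcm128_release(gcm);
 *
 * Decryption mirrors this with CRYPTO_gcm128_decrypt() and a final
 * CRYPTO_gcm128_finish(gcm, expected_tag, 16), which returns 0 only if
 * the tag verifies.
 */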