crypto/modes/gcm128.c

   1 /*
   2  * Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
   3  *
   4  * Licensed under the OpenSSL license (the "License").  You may not use
   5  * this file except in compliance with the License.  You can obtain a copy
   6  * in the file LICENSE in the source distribution or at
   7  * https://www.openssl.org/source/license.html
   8  */
   9
  10 #include <openssl/crypto.h>
  11 #include "modes_lcl.h"
  12 #include <string.h>
  13
  14 #if defined(BSWAP4) && defined(STRICT_ALIGNMENT)
  15 /* redefine, because alignment is ensured */
  16 # undef  GETU32
  17 # define GETU32(p)       BSWAP4(*(const u32 *)(p))
  18 # undef  PUTU32
  19 # define PUTU32(p,v)     *(u32 *)(p) = BSWAP4(v)
  20 #endif
  21
  22 #define PACK(s)         ((size_t)(s)<<(sizeof(size_t)*8-16))
  23 #define REDUCE1BIT(V)   do { \
  24         if (sizeof(size_t)==8) { \
  25                 u64 T = U64(0xe100000000000000) & (0-(V.lo&1)); \
  26                 V.lo  = (V.hi<<63)|(V.lo>>1); \
  27                 V.hi  = (V.hi>>1 )^T; \
  28         } \
  29         else { \
  30                 u32 T = 0xe1000000U & (0-(u32)(V.lo&1)); \
  31                 V.lo  = (V.hi<<63)|(V.lo>>1); \
  32                 V.hi  = (V.hi>>1 )^((u64)T<<32); \
  33         } \
  34 } while(0)
  35
  36 /*-
  37  * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
  38  * never be set to 8. 8 is effectively reserved for testing purposes.
  39  * TABLE_BITS>1 are lookup-table-driven implementations referred to as
  40  * "Shoup's" in GCM specification. In other words OpenSSL does not cover
  41  * whole spectrum of possible table driven implementations. Why? In
  42  * non-"Shoup's" case memory access pattern is segmented in such manner,
  43  * that it's trivial to see that cache timing information can reveal
  44  * fair portion of intermediate hash value. Given that ciphertext is
  45  * always available to attacker, it's possible for him to attempt to
  46  * deduce secret parameter H and if successful, tamper with messages
  47  * [which is nothing but trivial in CTR mode]. In "Shoup's" case it's
  48  * not as trivial, but there is no reason to believe that it's resistant
  49  * to cache-timing attack. And the thing about "8-bit" implementation is
  50  * that it consumes 16 (sixteen) times more memory, 4KB per individual
  51  * key + 1KB shared. Well, on pros side it should be twice as fast as
  52  * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
  53  * was observed to run ~75% faster, closer to 100% for commercial
  54  * compilers... Yet "4-bit" procedure is preferred, because it's
  55  * believed to provide better security-performance balance and adequate
  56  * all-round performance. "All-round" refers to things like:
  57  *
  58  * - shorter setup time effectively improves overall timing for
  59  *   handling short messages;
  60  * - larger table allocation can become unbearable because of VM
  61  *   subsystem penalties (for example on Windows large enough free
 *   results in VM working set trimming, meaning that subsequent
 *   malloc would immediately incur working set expansion);
  64  * - larger table has larger cache footprint, which can affect
  65  *   performance of other code paths (not necessarily even from same
  66  *   thread in Hyper-Threading world);
  67  *
  68  * Value of 1 is not appropriate for performance reasons.
  69  */
  70 #if     TABLE_BITS==8
  71
  72 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
  73 {
  74     int i, j;
  75     u128 V;
  76
  77     Htable[0].hi = 0;
  78     Htable[0].lo = 0;
  79     V.hi = H[0];
  80     V.lo = H[1];
  81
  82     for (Htable[128] = V, i = 64; i > 0; i >>= 1) {
  83         REDUCE1BIT(V);
  84         Htable[i] = V;
  85     }
  86
  87     for (i = 2; i < 256; i <<= 1) {
  88         u128 *Hi = Htable + i, H0 = *Hi;
  89         for (j = 1; j < i; ++j) {
  90             Hi[j].hi = H0.hi ^ Htable[j].hi;
  91             Hi[j].lo = H0.lo ^ Htable[j].lo;
  92         }
  93     }
  94 }
  95
  96 static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
  97 {
  98     u128 Z = { 0, 0 };
  99     const u8 *xi = (const u8 *)Xi + 15;
 100     size_t rem, n = *xi;
 101     const union {
 102         long one;
 103         char little;
 104     } is_endian = { 1 };
 105     static const size_t rem_8bit[256] = {
 106         PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
 107         PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
 108         PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
 109         PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
 110         PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
 111         PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
 112         PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
 113         PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
 114         PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
 115         PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
 116         PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
 117         PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
 118         PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
 119         PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
 120         PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
 121         PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
 122         PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
 123         PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
 124         PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
 125         PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
 126         PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
 127         PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
 128         PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
 129         PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
 130         PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
 131         PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
 132         PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
 133         PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
 134         PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
 135         PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
 136         PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
 137         PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
 138         PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
 139         PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
 140         PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
 141         PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
 142         PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
 143         PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
 144         PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
 145         PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
 146         PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
 147         PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
 148         PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
 149         PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
 150         PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
 151         PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
 152         PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
 153         PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
 154         PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
 155         PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
 156         PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
 157         PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
 158         PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
 159         PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
 160         PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
 161         PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
 162         PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
 163         PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
 164         PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
 165         PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
 166         PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
 167         PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
 168         PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
 169         PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE)
 170     };
 171
 172     while (1) {
 173         Z.hi ^= Htable[n].hi;
 174         Z.lo ^= Htable[n].lo;
 175
 176         if ((u8 *)Xi == xi)
 177             break;
 178
 179         n = *(--xi);
 180
 181         rem = (size_t)Z.lo & 0xff;
 182         Z.lo = (Z.hi << 56) | (Z.lo >> 8);
 183         Z.hi = (Z.hi >> 8);
 184         if (sizeof(size_t) == 8)
 185             Z.hi ^= rem_8bit[rem];
 186         else
 187             Z.hi ^= (u64)rem_8bit[rem] << 32;
 188     }
 189
 190     if (is_endian.little) {
 191 # ifdef BSWAP8
 192         Xi[0] = BSWAP8(Z.hi);
 193         Xi[1] = BSWAP8(Z.lo);
 194 # else
 195         u8 *p = (u8 *)Xi;
 196         u32 v;
 197         v = (u32)(Z.hi >> 32);
 198         PUTU32(p, v);
 199         v = (u32)(Z.hi);
 200         PUTU32(p + 4, v);
 201         v = (u32)(Z.lo >> 32);
 202         PUTU32(p + 8, v);
 203         v = (u32)(Z.lo);
 204         PUTU32(p + 12, v);
 205 # endif
 206     } else {
 207         Xi[0] = Z.hi;
 208         Xi[1] = Z.lo;
 209     }
 210 }
 211
 212 # define GCM_MUL(ctx,Xi)   gcm_gmult_8bit(ctx->Xi.u,ctx->Htable)
 213
 214 #elif   TABLE_BITS==4
 215
 216 static void gcm_init_4bit(u128 Htable[16], u64 H[2])
 217 {
 218     u128 V;
 219 # if defined(OPENSSL_SMALL_FOOTPRINT)
 220     int i;
 221 # endif
 222
 223     Htable[0].hi = 0;
 224     Htable[0].lo = 0;
 225     V.hi = H[0];
 226     V.lo = H[1];
 227
 228 # if defined(OPENSSL_SMALL_FOOTPRINT)
 229     for (Htable[8] = V, i = 4; i > 0; i >>= 1) {
 230         REDUCE1BIT(V);
 231         Htable[i] = V;
 232     }
 233
 234     for (i = 2; i < 16; i <<= 1) {
 235         u128 *Hi = Htable + i;
 236         int j;
 237         for (V = *Hi, j = 1; j < i; ++j) {
 238             Hi[j].hi = V.hi ^ Htable[j].hi;
 239             Hi[j].lo = V.lo ^ Htable[j].lo;
 240         }
 241     }
 242 # else
 243     Htable[8] = V;
 244     REDUCE1BIT(V);
 245     Htable[4] = V;
 246     REDUCE1BIT(V);
 247     Htable[2] = V;
 248     REDUCE1BIT(V);
 249     Htable[1] = V;
 250     Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo;
 251     V = Htable[4];
 252     Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo;
 253     Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo;
 254     Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo;
 255     V = Htable[8];
 256     Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo;
 257     Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo;
 258     Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo;
 259     Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo;
 260     Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo;
 261     Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo;
 262     Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo;
 263 # endif
 264 # if defined(GHASH_ASM) && (defined(__arm__) || defined(__arm))
 265     /*
 266      * ARM assembler expects specific dword order in Htable.
 267      */
 268     {
 269         int j;
 270         const union {
 271             long one;
 272             char little;
 273         } is_endian = { 1 };
 274
 275         if (is_endian.little)
 276             for (j = 0; j < 16; ++j) {
 277                 V = Htable[j];
 278                 Htable[j].hi = V.lo;
 279                 Htable[j].lo = V.hi;
 280         } else
 281             for (j = 0; j < 16; ++j) {
 282                 V = Htable[j];
 283                 Htable[j].hi = V.lo << 32 | V.lo >> 32;
 284                 Htable[j].lo = V.hi << 32 | V.hi >> 32;
 285             }
 286     }
 287 # endif
 288 }
 289
 290 # ifndef GHASH_ASM
/*
 * Reduction constants for the 4-bit routines, indexed by the four bits
 * shifted out of the low end of Z.lo.  PACK() keeps each 16-bit constant in
 * the top 16 bits of a size_t; users XOR the entry into the top of Z.hi
 * (shifting it up by 32 first when size_t is not 8 bytes wide).
 * The entries are XOR-linear in the index: entry i is the XOR of the
 * entries for the individual set bits of i (1, 2, 4, 8).
 */
static const size_t rem_4bit[16] = {
    PACK(0x0000), PACK(0x1C20), PACK(0x3840), PACK(0x2460),
    PACK(0x7080), PACK(0x6CA0), PACK(0x48C0), PACK(0x54E0),
    PACK(0xE100), PACK(0xFD20), PACK(0xD940), PACK(0xC560),
    PACK(0x9180), PACK(0x8DA0), PACK(0xA9C0), PACK(0xB5E0)
};
 297
 298 static void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16])
 299 {
 300     u128 Z;
 301     int cnt = 15;
 302     size_t rem, nlo, nhi;
 303     const union {
 304         long one;
 305         char little;
 306     } is_endian = { 1 };
 307
 308     nlo = ((const u8 *)Xi)[15];
 309     nhi = nlo >> 4;
 310     nlo &= 0xf;
 311
 312     Z.hi = Htable[nlo].hi;
 313     Z.lo = Htable[nlo].lo;
 314
 315     while (1) {
 316         rem = (size_t)Z.lo & 0xf;
 317         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
 318         Z.hi = (Z.hi >> 4);
 319         if (sizeof(size_t) == 8)
 320             Z.hi ^= rem_4bit[rem];
 321         else
 322             Z.hi ^= (u64)rem_4bit[rem] << 32;
 323
 324         Z.hi ^= Htable[nhi].hi;
 325         Z.lo ^= Htable[nhi].lo;
 326
 327         if (--cnt < 0)
 328             break;
 329
 330         nlo = ((const u8 *)Xi)[cnt];
 331         nhi = nlo >> 4;
 332         nlo &= 0xf;
 333
 334         rem = (size_t)Z.lo & 0xf;
 335         Z.lo = (Z.hi << 60) | (Z.lo >> 4);
 336         Z.hi = (Z.hi >> 4);
 337         if (sizeof(size_t) == 8)
 338             Z.hi ^= rem_4bit[rem];
 339         else
 340             Z.hi ^= (u64)rem_4bit[rem] << 32;
 341
 342         Z.hi ^= Htable[nlo].hi;
 343         Z.lo ^= Htable[nlo].lo;
 344     }
 345
 346     if (is_endian.little) {
 347 #  ifdef BSWAP8
 348         Xi[0] = BSWAP8(Z.hi);
 349         Xi[1] = BSWAP8(Z.lo);
 350 #  else
 351         u8 *p = (u8 *)Xi;
 352         u32 v;
 353         v = (u32)(Z.hi >> 32);
 354         PUTU32(p, v);
 355         v = (u32)(Z.hi);
 356         PUTU32(p + 4, v);
 357         v = (u32)(Z.lo >> 32);
 358         PUTU32(p + 8, v);
 359         v = (u32)(Z.lo);
 360         PUTU32(p + 12, v);
 361 #  endif
 362     } else {
 363         Xi[0] = Z.hi;
 364         Xi[1] = Z.lo;
 365     }
 366 }
 367
 368 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
/*
 * Streamed gcm_gmult_4bit, see CRYPTO_gcm128_[en|de]crypt for
 * details... Compiler-generated code doesn't seem to give any
 * performance improvement, at least not on x86[_64]. It's here
 * mostly as reference and a placeholder for possible future
 * non-trivial optimization[s]...
 *
 * For every 16-byte block of inp, the block is XORed byte by byte with the
 * current Xi, the result is multiplied via Htable exactly as in
 * gcm_gmult_4bit, and the product is stored back to Xi.
 *
 * The outer loop is a do/while that subtracts 16 from len after each block
 * and stops only when len becomes exactly zero, so len must be a non-zero
 * multiple of 16.
 *
 * The "#if 1" branch is the code that is compiled; the "#else" branch is an
 * alternative implementation that is currently disabled.
 */
static void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16],
                           const u8 *inp, size_t len)
{
    u128 Z;
    int cnt;
    size_t rem, nlo, nhi;
    const union {
        long one;
        char little;
    } is_endian = { 1 };

#   if 1
    /* One iteration of this do/while per 16-byte input block. */
    do {
        cnt = 15;
        /* Combine the Xi byte with the input byte on the fly. */
        nlo = ((const u8 *)Xi)[15];
        nlo ^= inp[15];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi = Htable[nlo].hi;
        Z.lo = Htable[nlo].lo;

        while (1) {
            /* Shift Z right by 4 bits, fold the dropped nibble back in. */
            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nhi].hi;
            Z.lo ^= Htable[nhi].lo;

            /* Done once the high nibble of byte 0 has been processed. */
            if (--cnt < 0)
                break;

            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            rem = (size_t)Z.lo & 0xf;
            Z.lo = (Z.hi << 60) | (Z.lo >> 4);
            Z.hi = (Z.hi >> 4);
            if (sizeof(size_t) == 8)
                Z.hi ^= rem_4bit[rem];
            else
                Z.hi ^= (u64)rem_4bit[rem] << 32;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;
        }
#   else
    /*
     * Extra 256+16 bytes per-key plus 512 bytes shared tables
     * [should] give ~50% improvement... One could have PACK()-ed
     * the rem_8bit even here, but the priority is to minimize
     * cache footprint...
     */
    u128 Hshr4[16];             /* Htable shifted right by 4 bits */
    u8 Hshl4[16];               /* Htable shifted left by 4 bits */
    static const unsigned short rem_8bit[256] = {
        0x0000, 0x01C2, 0x0384, 0x0246, 0x0708, 0x06CA, 0x048C, 0x054E,
        0x0E10, 0x0FD2, 0x0D94, 0x0C56, 0x0918, 0x08DA, 0x0A9C, 0x0B5E,
        0x1C20, 0x1DE2, 0x1FA4, 0x1E66, 0x1B28, 0x1AEA, 0x18AC, 0x196E,
        0x1230, 0x13F2, 0x11B4, 0x1076, 0x1538, 0x14FA, 0x16BC, 0x177E,
        0x3840, 0x3982, 0x3BC4, 0x3A06, 0x3F48, 0x3E8A, 0x3CCC, 0x3D0E,
        0x3650, 0x3792, 0x35D4, 0x3416, 0x3158, 0x309A, 0x32DC, 0x331E,
        0x2460, 0x25A2, 0x27E4, 0x2626, 0x2368, 0x22AA, 0x20EC, 0x212E,
        0x2A70, 0x2BB2, 0x29F4, 0x2836, 0x2D78, 0x2CBA, 0x2EFC, 0x2F3E,
        0x7080, 0x7142, 0x7304, 0x72C6, 0x7788, 0x764A, 0x740C, 0x75CE,
        0x7E90, 0x7F52, 0x7D14, 0x7CD6, 0x7998, 0x785A, 0x7A1C, 0x7BDE,
        0x6CA0, 0x6D62, 0x6F24, 0x6EE6, 0x6BA8, 0x6A6A, 0x682C, 0x69EE,
        0x62B0, 0x6372, 0x6134, 0x60F6, 0x65B8, 0x647A, 0x663C, 0x67FE,
        0x48C0, 0x4902, 0x4B44, 0x4A86, 0x4FC8, 0x4E0A, 0x4C4C, 0x4D8E,
        0x46D0, 0x4712, 0x4554, 0x4496, 0x41D8, 0x401A, 0x425C, 0x439E,
        0x54E0, 0x5522, 0x5764, 0x56A6, 0x53E8, 0x522A, 0x506C, 0x51AE,
        0x5AF0, 0x5B32, 0x5974, 0x58B6, 0x5DF8, 0x5C3A, 0x5E7C, 0x5FBE,
        0xE100, 0xE0C2, 0xE284, 0xE346, 0xE608, 0xE7CA, 0xE58C, 0xE44E,
        0xEF10, 0xEED2, 0xEC94, 0xED56, 0xE818, 0xE9DA, 0xEB9C, 0xEA5E,
        0xFD20, 0xFCE2, 0xFEA4, 0xFF66, 0xFA28, 0xFBEA, 0xF9AC, 0xF86E,
        0xF330, 0xF2F2, 0xF0B4, 0xF176, 0xF438, 0xF5FA, 0xF7BC, 0xF67E,
        0xD940, 0xD882, 0xDAC4, 0xDB06, 0xDE48, 0xDF8A, 0xDDCC, 0xDC0E,
        0xD750, 0xD692, 0xD4D4, 0xD516, 0xD058, 0xD19A, 0xD3DC, 0xD21E,
        0xC560, 0xC4A2, 0xC6E4, 0xC726, 0xC268, 0xC3AA, 0xC1EC, 0xC02E,
        0xCB70, 0xCAB2, 0xC8F4, 0xC936, 0xCC78, 0xCDBA, 0xCFFC, 0xCE3E,
        0x9180, 0x9042, 0x9204, 0x93C6, 0x9688, 0x974A, 0x950C, 0x94CE,
        0x9F90, 0x9E52, 0x9C14, 0x9DD6, 0x9898, 0x995A, 0x9B1C, 0x9ADE,
        0x8DA0, 0x8C62, 0x8E24, 0x8FE6, 0x8AA8, 0x8B6A, 0x892C, 0x88EE,
        0x83B0, 0x8272, 0x8034, 0x81F6, 0x84B8, 0x857A, 0x873C, 0x86FE,
        0xA9C0, 0xA802, 0xAA44, 0xAB86, 0xAEC8, 0xAF0A, 0xAD4C, 0xAC8E,
        0xA7D0, 0xA612, 0xA454, 0xA596, 0xA0D8, 0xA11A, 0xA35C, 0xA29E,
        0xB5E0, 0xB422, 0xB664, 0xB7A6, 0xB2E8, 0xB32A, 0xB16C, 0xB0AE,
        0xBBF0, 0xBA32, 0xB874, 0xB9B6, 0xBCF8, 0xBD3A, 0xBF7C, 0xBEBE
    };
    /*
     * This pre-processing phase slows down procedure by approximately
     * same time as it makes each loop spin faster. In other words
     * single block performance is approximately same as straightforward
     * "4-bit" implementation, and then it goes only faster...
     */
    for (cnt = 0; cnt < 16; ++cnt) {
        Z.hi = Htable[cnt].hi;
        Z.lo = Htable[cnt].lo;
        Hshr4[cnt].lo = (Z.hi << 60) | (Z.lo >> 4);
        Hshr4[cnt].hi = (Z.hi >> 4);
        Hshl4[cnt] = (u8)(Z.lo << 4);
    }

    do {
        for (Z.lo = 0, Z.hi = 0, cnt = 15; cnt; --cnt) {
            nlo = ((const u8 *)Xi)[cnt];
            nlo ^= inp[cnt];
            nhi = nlo >> 4;
            nlo &= 0xf;

            Z.hi ^= Htable[nlo].hi;
            Z.lo ^= Htable[nlo].lo;

            rem = (size_t)Z.lo & 0xff;

            Z.lo = (Z.hi << 56) | (Z.lo >> 8);
            Z.hi = (Z.hi >> 8);

            Z.hi ^= Hshr4[nhi].hi;
            Z.lo ^= Hshr4[nhi].lo;
            Z.hi ^= (u64)rem_8bit[rem ^ Hshl4[nhi]] << 48;
        }

        nlo = ((const u8 *)Xi)[0];
        nlo ^= inp[0];
        nhi = nlo >> 4;
        nlo &= 0xf;

        Z.hi ^= Htable[nlo].hi;
        Z.lo ^= Htable[nlo].lo;

        rem = (size_t)Z.lo & 0xf;

        Z.lo = (Z.hi << 60) | (Z.lo >> 4);
        Z.hi = (Z.hi >> 4);

        Z.hi ^= Htable[nhi].hi;
        Z.lo ^= Htable[nhi].lo;
        Z.hi ^= ((u64)rem_8bit[rem << 4]) << 48;
#   endif

        /* Store the product back to Xi in big-endian byte order. */
        if (is_endian.little) {
#   ifdef BSWAP8
            Xi[0] = BSWAP8(Z.hi);
            Xi[1] = BSWAP8(Z.lo);
#   else
            u8 *p = (u8 *)Xi;
            u32 v;
            v = (u32)(Z.hi >> 32);
            PUTU32(p, v);
            v = (u32)(Z.hi);
            PUTU32(p + 4, v);
            v = (u32)(Z.lo >> 32);
            PUTU32(p + 8, v);
            v = (u32)(Z.lo);
            PUTU32(p + 12, v);
#   endif
        } else {
            Xi[0] = Z.hi;
            Xi[1] = Z.lo;
        }
        /* Advance to the next block; ends only when len reaches exactly 0. */
    } while (inp += 16, len -= 16);
}
 546 #  endif
 547 # else
/*
 * With GHASH_ASM defined the 4-bit routines are not compiled from the C
 * code above; only their prototypes are declared here and the definitions
 * must come from elsewhere (presumably the platform's assembler module).
 */
void gcm_gmult_4bit(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
 551 # endif
 552
 553 # define GCM_MUL(ctx,Xi)   gcm_gmult_4bit(ctx->Xi.u,ctx->Htable)
 554 # if defined(GHASH_ASM) || !defined(OPENSSL_SMALL_FOOTPRINT)
 555 #  define GHASH(ctx,in,len) gcm_ghash_4bit((ctx)->Xi.u,(ctx)->Htable,in,len)
 556 /*
 557  * GHASH_CHUNK is "stride parameter" missioned to mitigate cache trashing
 558  * effect. In other words idea is to hash data while it's still in L1 cache
 559  * after encryption pass...
 560  */
 561 #  define GHASH_CHUNK       (3*1024)
 562 # endif
 563
 564 #else                           /* TABLE_BITS */
 565
 566 static void gcm_gmult_1bit(u64 Xi[2], const u64 H[2])
 567 {
 568     u128 V, Z = { 0, 0 };
 569     long X;
 570     int i, j;
 571     const long *xi = (const long *)Xi;
 572     const union {
 573         long one;
 574         char little;
 575     } is_endian = { 1 };
 576
 577     V.hi = H[0];                /* H is in host byte order, no byte swapping */
 578     V.lo = H[1];
 579
 580     for (j = 0; j < 16 / sizeof(long); ++j) {
 581         if (is_endian.little) {
 582             if (sizeof(long) == 8) {
 583 # ifdef BSWAP8
 584                 X = (long)(BSWAP8(xi[j]));
 585 # else
 586                 const u8 *p = (const u8 *)(xi + j);
 587                 X = (long)((u64)GETU32(p) << 32 | GETU32(p + 4));
 588 # endif
 589             } else {
 590                 const u8 *p = (const u8 *)(xi + j);
 591                 X = (long)GETU32(p);
 592             }
 593         } else
 594             X = xi[j];
 595
 596         for (i = 0; i < 8 * sizeof(long); ++i, X <<= 1) {
 597             u64 M = (u64)(X >> (8 * sizeof(long) - 1));
 598             Z.hi ^= V.hi & M;
 599             Z.lo ^= V.lo & M;
 600
 601             REDUCE1BIT(V);
 602         }
 603     }
 604
 605     if (is_endian.little) {
 606 # ifdef BSWAP8
 607         Xi[0] = BSWAP8(Z.hi);
 608         Xi[1] = BSWAP8(Z.lo);
 609 # else
 610         u8 *p = (u8 *)Xi;
 611         u32 v;
 612         v = (u32)(Z.hi >> 32);
 613         PUTU32(p, v);
 614         v = (u32)(Z.hi);
 615         PUTU32(p + 4, v);
 616         v = (u32)(Z.lo >> 32);
 617         PUTU32(p + 8, v);
 618         v = (u32)(Z.lo);
 619         PUTU32(p + 12, v);
 620 # endif
 621     } else {
 622         Xi[0] = Z.hi;
 623         Xi[1] = Z.lo;
 624     }
 625 }
 626
 627 # define GCM_MUL(ctx,Xi)   gcm_gmult_1bit(ctx->Xi.u,ctx->H.u)
 628
 629 #endif
 630
/*
 * Declarations of externally implemented GHASH routines, chosen by target
 * architecture.  Every branch that declares such routines also defines
 * GCM_FUNCREF_4BIT and one GHASH_ASM_* marker; CRYPTO_gcm128_init uses the
 * markers to pick an implementation at run time.
 */
#if     TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
# if    !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
#  define GHASH_ASM_X86_OR_64
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[];

void gcm_init_clmul(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                     size_t len);

/* On 32-bit x86 the "avx" names are plain aliases for the clmul routines. */
#  if defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define gcm_init_avx   gcm_init_clmul
#   define gcm_gmult_avx  gcm_gmult_clmul
#   define gcm_ghash_avx  gcm_ghash_clmul
#  else
void gcm_init_avx(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_avx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_avx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                   size_t len);
#  endif

/* 32-bit x86 only: MMX and plain-x86 variants of the 4-bit routines. */
#  if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
#   define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);

void gcm_gmult_4bit_x86(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                        size_t len);
#  endif
# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
#  include "arm_arch.h"
#  if __ARM_MAX_ARCH__>=7
#   define GHASH_ASM_ARM
#   define GCM_FUNCREF_4BIT
#   define PMULL_CAPABLE        (OPENSSL_armcap_P & ARMV8_PMULL)
/* NEON_CAPABLE is only defined for 32-bit ARM, not for __aarch64__. */
#   if defined(__arm__) || defined(__arm)
#    define NEON_CAPABLE        (OPENSSL_armcap_P & ARMV7_NEON)
#   endif
void gcm_init_neon(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_neon(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
void gcm_init_v8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_v8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_v8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
#  endif
# elif defined(__sparc__) || defined(__sparc)
#  include "sparc_arch.h"
#  define GHASH_ASM_SPARC
#  define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_sparcv9cap_P[];
void gcm_init_vis3(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_vis3(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_vis3(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                    size_t len);
# elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
#  include "ppc_arch.h"
#  define GHASH_ASM_PPC
#  define GCM_FUNCREF_4BIT
void gcm_init_p8(u128 Htable[16], const u64 Xi[2]);
void gcm_gmult_p8(u64 Xi[2], const u128 Htable[16]);
void gcm_ghash_p8(u64 Xi[2], const u128 Htable[16], const u8 *inp,
                  size_t len);
# endif
#endif
 703
 704 #ifdef GCM_FUNCREF_4BIT
 705 # undef  GCM_MUL
 706 # define GCM_MUL(ctx,Xi)        (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
 707 # ifdef GHASH
 708 #  undef  GHASH
 709 #  define GHASH(ctx,in,len)     (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
 710 # endif
 711 #endif
 712
/*
 * Initialize a GCM context for the given key and block function.
 *
 * The context is zeroed, block and key are recorded, and the block function
 * is applied in place to the (zeroed) H field to produce the hash key
 * (presumably block is the cipher's encrypt routine; this file does not
 * show it).  On little-endian hosts H is then converted to host byte order.
 *
 * With TABLE_BITS==8 or 4 the multiplication table is built.  For
 * TABLE_BITS==4 with run-time dispatch, ctx->gmult and ctx->ghash are set
 * to the implementation chosen from the CPU capability words; ctx->ghash
 * is set to NULL instead when GHASH is not defined.  For any other
 * TABLE_BITS value no table is built.
 *
 * ctx:   context to initialize; all previous contents are overwritten.
 * key:   opaque key handle, stored in ctx and passed to block.
 * block: block function, called once here as block(in, out, key).
 */
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx, void *key, block128_f block)
{
    const union {
        long one;
        char little;
    } is_endian = { 1 };

    memset(ctx, 0, sizeof(*ctx));
    ctx->block = block;
    ctx->key = key;

    /* H.c is all zero after the memset; transform it in place. */
    (*block) (ctx->H.c, ctx->H.c, key);

    if (is_endian.little) {
        /* H is stored in host byte order */
#ifdef BSWAP8
        ctx->H.u[0] = BSWAP8(ctx->H.u[0]);
        ctx->H.u[1] = BSWAP8(ctx->H.u[1]);
#else
        u8 *p = ctx->H.c;
        u64 hi, lo;
        hi = (u64)GETU32(p) << 32 | GETU32(p + 4);
        lo = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
        ctx->H.u[0] = hi;
        ctx->H.u[1] = lo;
#endif
    }
#if     TABLE_BITS==8
    gcm_init_8bit(ctx->Htable, ctx->H.u);
#elif   TABLE_BITS==4
    /* CTX__GHASH records the streaming routine, or NULL without GHASH. */
# if    defined(GHASH)
#  define CTX__GHASH(f) (ctx->ghash = (f))
# else
#  define CTX__GHASH(f) (ctx->ghash = NULL)
# endif
# if    defined(GHASH_ASM_X86_OR_64)
#  if   !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 24) && /* check FXSR bit */
        OPENSSL_ia32cap_P[1] & (1 << 1)) { /* check PCLMULQDQ bit */
        if (((OPENSSL_ia32cap_P[1] >> 22) & 0x41) == 0x41) { /* AVX+MOVBE */
            gcm_init_avx(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_avx;
            CTX__GHASH(gcm_ghash_avx);
        } else {
            gcm_init_clmul(ctx->Htable, ctx->H.u);
            ctx->gmult = gcm_gmult_clmul;
            CTX__GHASH(gcm_ghash_clmul);
        }
        /* Carry-less multiply path chosen; the 4-bit table is not built. */
        return;
    }
#  endif
    gcm_init_4bit(ctx->Htable, ctx->H.u);
#  if   defined(GHASH_ASM_X86)  /* x86 only */
#   if  defined(OPENSSL_IA32_SSE2)
    if (OPENSSL_ia32cap_P[0] & (1 << 25)) { /* check SSE bit */
#   else
    if (OPENSSL_ia32cap_P[0] & (1 << 23)) { /* check MMX bit */
#   endif
        ctx->gmult = gcm_gmult_4bit_mmx;
        CTX__GHASH(gcm_ghash_4bit_mmx);
    } else {
        ctx->gmult = gcm_gmult_4bit_x86;
        CTX__GHASH(gcm_ghash_4bit_x86);
    }
#  else
    ctx->gmult = gcm_gmult_4bit;
    CTX__GHASH(gcm_ghash_4bit);
#  endif
# elif  defined(GHASH_ASM_ARM)
    /* Preference order: PMULL, then NEON (if defined), then generic 4-bit. */
#  ifdef PMULL_CAPABLE
    if (PMULL_CAPABLE) {
        gcm_init_v8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_v8;
        CTX__GHASH(gcm_ghash_v8);
    } else
#  endif
#  ifdef NEON_CAPABLE
    if (NEON_CAPABLE) {
        gcm_init_neon(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_neon;
        CTX__GHASH(gcm_ghash_neon);
    } else
#  endif
    {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_SPARC)
    if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
        gcm_init_vis3(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_vis3;
        CTX__GHASH(gcm_ghash_vis3);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# elif  defined(GHASH_ASM_PPC)
    if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
        gcm_init_p8(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_p8;
        CTX__GHASH(gcm_ghash_p8);
    } else {
        gcm_init_4bit(ctx->Htable, ctx->H.u);
        ctx->gmult = gcm_gmult_4bit;
        CTX__GHASH(gcm_ghash_4bit);
    }
# else
    /* No run-time dispatch: only the table is built, no pointers are set. */
    gcm_init_4bit(ctx->Htable, ctx->H.u);
# endif
# undef CTX__GHASH
#endif
}
 827
 828 void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const unsigned char *iv,
 829                          size_t len)
 830 {
 831     const union {
 832         long one;
 833         char little;
 834     } is_endian = { 1 };
 835     unsigned int ctr;
 836 #ifdef GCM_FUNCREF_4BIT
 837     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 838 #endif
 839
 840     ctx->Yi.u[0] = 0;
 841     ctx->Yi.u[1] = 0;
 842     ctx->Xi.u[0] = 0;
 843     ctx->Xi.u[1] = 0;
 844     ctx->len.u[0] = 0;          /* AAD length */
 845     ctx->len.u[1] = 0;          /* message length */
 846     ctx->ares = 0;
 847     ctx->mres = 0;
 848
 849     if (len == 12) {
 850         memcpy(ctx->Yi.c, iv, 12);
 851         ctx->Yi.c[15] = 1;
 852         ctr = 1;
 853     } else {
 854         size_t i;
 855         u64 len0 = len;
 856
 857         while (len >= 16) {
 858             for (i = 0; i < 16; ++i)
 859                 ctx->Yi.c[i] ^= iv[i];
 860             GCM_MUL(ctx, Yi);
 861             iv += 16;
 862             len -= 16;
 863         }
 864         if (len) {
 865             for (i = 0; i < len; ++i)
 866                 ctx->Yi.c[i] ^= iv[i];
 867             GCM_MUL(ctx, Yi);
 868         }
 869         len0 <<= 3;
 870         if (is_endian.little) {
 871 #ifdef BSWAP8
 872             ctx->Yi.u[1] ^= BSWAP8(len0);
 873 #else
 874             ctx->Yi.c[8] ^= (u8)(len0 >> 56);
 875             ctx->Yi.c[9] ^= (u8)(len0 >> 48);
 876             ctx->Yi.c[10] ^= (u8)(len0 >> 40);
 877             ctx->Yi.c[11] ^= (u8)(len0 >> 32);
 878             ctx->Yi.c[12] ^= (u8)(len0 >> 24);
 879             ctx->Yi.c[13] ^= (u8)(len0 >> 16);
 880             ctx->Yi.c[14] ^= (u8)(len0 >> 8);
 881             ctx->Yi.c[15] ^= (u8)(len0);
 882 #endif
 883         } else
 884             ctx->Yi.u[1] ^= len0;
 885
 886         GCM_MUL(ctx, Yi);
 887
 888         if (is_endian.little)
 889 #ifdef BSWAP4
 890             ctr = BSWAP4(ctx->Yi.d[3]);
 891 #else
 892             ctr = GETU32(ctx->Yi.c + 12);
 893 #endif
 894         else
 895             ctr = ctx->Yi.d[3];
 896     }
 897
 898     (*ctx->block) (ctx->Yi.c, ctx->EK0.c, ctx->key);
 899     ++ctr;
 900     if (is_endian.little)
 901 #ifdef BSWAP4
 902         ctx->Yi.d[3] = BSWAP4(ctr);
 903 #else
 904         PUTU32(ctx->Yi.c + 12, ctr);
 905 #endif
 906     else
 907         ctx->Yi.d[3] = ctr;
 908 }
 909
 910 int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const unsigned char *aad,
 911                       size_t len)
 912 {
 913     size_t i;
 914     unsigned int n;
 915     u64 alen = ctx->len.u[0];
 916 #ifdef GCM_FUNCREF_4BIT
 917     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 918 # ifdef GHASH
 919     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
 920                          const u8 *inp, size_t len) = ctx->ghash;
 921 # endif
 922 #endif
 923
 924     if (ctx->len.u[1])
 925         return -2;
 926
 927     alen += len;
 928     if (alen > (U64(1) << 61) || (sizeof(len) == 8 && alen < len))
 929         return -1;
 930     ctx->len.u[0] = alen;
 931
 932     n = ctx->ares;
 933     if (n) {
 934         while (n && len) {
 935             ctx->Xi.c[n] ^= *(aad++);
 936             --len;
 937             n = (n + 1) % 16;
 938         }
 939         if (n == 0)
 940             GCM_MUL(ctx, Xi);
 941         else {
 942             ctx->ares = n;
 943             return 0;
 944         }
 945     }
 946 #ifdef GHASH
 947     if ((i = (len & (size_t)-16))) {
 948         GHASH(ctx, aad, i);
 949         aad += i;
 950         len -= i;
 951     }
 952 #else
 953     while (len >= 16) {
 954         for (i = 0; i < 16; ++i)
 955             ctx->Xi.c[i] ^= aad[i];
 956         GCM_MUL(ctx, Xi);
 957         aad += 16;
 958         len -= 16;
 959     }
 960 #endif
 961     if (len) {
 962         n = (unsigned int)len;
 963         for (i = 0; i < len; ++i)
 964             ctx->Xi.c[i] ^= aad[i];
 965     }
 966
 967     ctx->ares = n;
 968     return 0;
 969 }
 970
 971 int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
 972                           const unsigned char *in, unsigned char *out,
 973                           size_t len)
 974 {
 975     const union {
 976         long one;
 977         char little;
 978     } is_endian = { 1 };
 979     unsigned int n, ctr;
 980     size_t i;
 981     u64 mlen = ctx->len.u[1];
 982     block128_f block = ctx->block;
 983     void *key = ctx->key;
 984 #ifdef GCM_FUNCREF_4BIT
 985     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
 986 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
 987     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
 988                          const u8 *inp, size_t len) = ctx->ghash;
 989 # endif
 990 #endif
 991
 992     mlen += len;
 993     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
 994         return -1;
 995     ctx->len.u[1] = mlen;
 996
 997     if (ctx->ares) {
 998         /* First call to encrypt finalizes GHASH(AAD) */
 999         GCM_MUL(ctx, Xi);
1000         ctx->ares = 0;
1001     }
1002
1003     if (is_endian.little)
1004 #ifdef BSWAP4
1005         ctr = BSWAP4(ctx->Yi.d[3]);
1006 #else
1007         ctr = GETU32(ctx->Yi.c + 12);
1008 #endif
1009     else
1010         ctr = ctx->Yi.d[3];
1011
1012     n = ctx->mres;
1013 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1014     if (16 % sizeof(size_t) == 0) { /* always true actually */
1015         do {
1016             if (n) {
1017                 while (n && len) {
1018                     ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1019                     --len;
1020                     n = (n + 1) % 16;
1021                 }
1022                 if (n == 0)
1023                     GCM_MUL(ctx, Xi);
1024                 else {
1025                     ctx->mres = n;
1026                     return 0;
1027                 }
1028             }
1029 # if defined(STRICT_ALIGNMENT)
1030             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1031                 break;
1032 # endif
1033 # if defined(GHASH)
1034 #  if defined(GHASH_CHUNK)
1035             while (len >= GHASH_CHUNK) {
1036                 size_t j = GHASH_CHUNK;
1037
1038                 while (j) {
1039                     size_t *out_t = (size_t *)out;
1040                     const size_t *in_t = (const size_t *)in;
1041
1042                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1043                     ++ctr;
1044                     if (is_endian.little)
1045 #   ifdef BSWAP4
1046                         ctx->Yi.d[3] = BSWAP4(ctr);
1047 #   else
1048                         PUTU32(ctx->Yi.c + 12, ctr);
1049 #   endif
1050                     else
1051                         ctx->Yi.d[3] = ctr;
1052                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1053                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1054                     out += 16;
1055                     in += 16;
1056                     j -= 16;
1057                 }
1058                 GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK);
1059                 len -= GHASH_CHUNK;
1060             }
1061 #  endif
1062             if ((i = (len & (size_t)-16))) {
1063                 size_t j = i;
1064
1065                 while (len >= 16) {
1066                     size_t *out_t = (size_t *)out;
1067                     const size_t *in_t = (const size_t *)in;
1068
1069                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1070                     ++ctr;
1071                     if (is_endian.little)
1072 #  ifdef BSWAP4
1073                         ctx->Yi.d[3] = BSWAP4(ctr);
1074 #  else
1075                         PUTU32(ctx->Yi.c + 12, ctr);
1076 #  endif
1077                     else
1078                         ctx->Yi.d[3] = ctr;
1079                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1080                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1081                     out += 16;
1082                     in += 16;
1083                     len -= 16;
1084                 }
1085                 GHASH(ctx, out - j, j);
1086             }
1087 # else
1088             while (len >= 16) {
1089                 size_t *out_t = (size_t *)out;
1090                 const size_t *in_t = (const size_t *)in;
1091
1092                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1093                 ++ctr;
1094                 if (is_endian.little)
1095 #  ifdef BSWAP4
1096                     ctx->Yi.d[3] = BSWAP4(ctr);
1097 #  else
1098                     PUTU32(ctx->Yi.c + 12, ctr);
1099 #  endif
1100                 else
1101                     ctx->Yi.d[3] = ctr;
1102                 for (i = 0; i < 16 / sizeof(size_t); ++i)
1103                     ctx->Xi.t[i] ^= out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1104                 GCM_MUL(ctx, Xi);
1105                 out += 16;
1106                 in += 16;
1107                 len -= 16;
1108             }
1109 # endif
1110             if (len) {
1111                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1112                 ++ctr;
1113                 if (is_endian.little)
1114 # ifdef BSWAP4
1115                     ctx->Yi.d[3] = BSWAP4(ctr);
1116 # else
1117                     PUTU32(ctx->Yi.c + 12, ctr);
1118 # endif
1119                 else
1120                     ctx->Yi.d[3] = ctr;
1121                 while (len--) {
1122                     ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1123                     ++n;
1124                 }
1125             }
1126
1127             ctx->mres = n;
1128             return 0;
1129         } while (0);
1130     }
1131 #endif
1132     for (i = 0; i < len; ++i) {
1133         if (n == 0) {
1134             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1135             ++ctr;
1136             if (is_endian.little)
1137 #ifdef BSWAP4
1138                 ctx->Yi.d[3] = BSWAP4(ctr);
1139 #else
1140                 PUTU32(ctx->Yi.c + 12, ctr);
1141 #endif
1142             else
1143                 ctx->Yi.d[3] = ctr;
1144         }
1145         ctx->Xi.c[n] ^= out[i] = in[i] ^ ctx->EKi.c[n];
1146         n = (n + 1) % 16;
1147         if (n == 0)
1148             GCM_MUL(ctx, Xi);
1149     }
1150
1151     ctx->mres = n;
1152     return 0;
1153 }
1154
1155 int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
1156                           const unsigned char *in, unsigned char *out,
1157                           size_t len)
1158 {
1159     const union {
1160         long one;
1161         char little;
1162     } is_endian = { 1 };
1163     unsigned int n, ctr;
1164     size_t i;
1165     u64 mlen = ctx->len.u[1];
1166     block128_f block = ctx->block;
1167     void *key = ctx->key;
1168 #ifdef GCM_FUNCREF_4BIT
1169     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1170 # if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
1171     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1172                          const u8 *inp, size_t len) = ctx->ghash;
1173 # endif
1174 #endif
1175
1176     mlen += len;
1177     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1178         return -1;
1179     ctx->len.u[1] = mlen;
1180
1181     if (ctx->ares) {
1182         /* First call to decrypt finalizes GHASH(AAD) */
1183         GCM_MUL(ctx, Xi);
1184         ctx->ares = 0;
1185     }
1186
1187     if (is_endian.little)
1188 #ifdef BSWAP4
1189         ctr = BSWAP4(ctx->Yi.d[3]);
1190 #else
1191         ctr = GETU32(ctx->Yi.c + 12);
1192 #endif
1193     else
1194         ctr = ctx->Yi.d[3];
1195
1196     n = ctx->mres;
1197 #if !defined(OPENSSL_SMALL_FOOTPRINT)
1198     if (16 % sizeof(size_t) == 0) { /* always true actually */
1199         do {
1200             if (n) {
1201                 while (n && len) {
1202                     u8 c = *(in++);
1203                     *(out++) = c ^ ctx->EKi.c[n];
1204                     ctx->Xi.c[n] ^= c;
1205                     --len;
1206                     n = (n + 1) % 16;
1207                 }
1208                 if (n == 0)
1209                     GCM_MUL(ctx, Xi);
1210                 else {
1211                     ctx->mres = n;
1212                     return 0;
1213                 }
1214             }
1215 # if defined(STRICT_ALIGNMENT)
1216             if (((size_t)in | (size_t)out) % sizeof(size_t) != 0)
1217                 break;
1218 # endif
1219 # if defined(GHASH)
1220 #  if defined(GHASH_CHUNK)
1221             while (len >= GHASH_CHUNK) {
1222                 size_t j = GHASH_CHUNK;
1223
1224                 GHASH(ctx, in, GHASH_CHUNK);
1225                 while (j) {
1226                     size_t *out_t = (size_t *)out;
1227                     const size_t *in_t = (const size_t *)in;
1228
1229                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1230                     ++ctr;
1231                     if (is_endian.little)
1232 #   ifdef BSWAP4
1233                         ctx->Yi.d[3] = BSWAP4(ctr);
1234 #   else
1235                         PUTU32(ctx->Yi.c + 12, ctr);
1236 #   endif
1237                     else
1238                         ctx->Yi.d[3] = ctr;
1239                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1240                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1241                     out += 16;
1242                     in += 16;
1243                     j -= 16;
1244                 }
1245                 len -= GHASH_CHUNK;
1246             }
1247 #  endif
1248             if ((i = (len & (size_t)-16))) {
1249                 GHASH(ctx, in, i);
1250                 while (len >= 16) {
1251                     size_t *out_t = (size_t *)out;
1252                     const size_t *in_t = (const size_t *)in;
1253
1254                     (*block) (ctx->Yi.c, ctx->EKi.c, key);
1255                     ++ctr;
1256                     if (is_endian.little)
1257 #  ifdef BSWAP4
1258                         ctx->Yi.d[3] = BSWAP4(ctr);
1259 #  else
1260                         PUTU32(ctx->Yi.c + 12, ctr);
1261 #  endif
1262                     else
1263                         ctx->Yi.d[3] = ctr;
1264                     for (i = 0; i < 16 / sizeof(size_t); ++i)
1265                         out_t[i] = in_t[i] ^ ctx->EKi.t[i];
1266                     out += 16;
1267                     in += 16;
1268                     len -= 16;
1269                 }
1270             }
1271 # else
1272             while (len >= 16) {
1273                 size_t *out_t = (size_t *)out;
1274                 const size_t *in_t = (const size_t *)in;
1275
1276                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1277                 ++ctr;
1278                 if (is_endian.little)
1279 #  ifdef BSWAP4
1280                     ctx->Yi.d[3] = BSWAP4(ctr);
1281 #  else
1282                     PUTU32(ctx->Yi.c + 12, ctr);
1283 #  endif
1284                 else
1285                     ctx->Yi.d[3] = ctr;
1286                 for (i = 0; i < 16 / sizeof(size_t); ++i) {
1287                     size_t c = in[i];
1288                     out[i] = c ^ ctx->EKi.t[i];
1289                     ctx->Xi.t[i] ^= c;
1290                 }
1291                 GCM_MUL(ctx, Xi);
1292                 out += 16;
1293                 in += 16;
1294                 len -= 16;
1295             }
1296 # endif
1297             if (len) {
1298                 (*block) (ctx->Yi.c, ctx->EKi.c, key);
1299                 ++ctr;
1300                 if (is_endian.little)
1301 # ifdef BSWAP4
1302                     ctx->Yi.d[3] = BSWAP4(ctr);
1303 # else
1304                     PUTU32(ctx->Yi.c + 12, ctr);
1305 # endif
1306                 else
1307                     ctx->Yi.d[3] = ctr;
1308                 while (len--) {
1309                     u8 c = in[n];
1310                     ctx->Xi.c[n] ^= c;
1311                     out[n] = c ^ ctx->EKi.c[n];
1312                     ++n;
1313                 }
1314             }
1315
1316             ctx->mres = n;
1317             return 0;
1318         } while (0);
1319     }
1320 #endif
1321     for (i = 0; i < len; ++i) {
1322         u8 c;
1323         if (n == 0) {
1324             (*block) (ctx->Yi.c, ctx->EKi.c, key);
1325             ++ctr;
1326             if (is_endian.little)
1327 #ifdef BSWAP4
1328                 ctx->Yi.d[3] = BSWAP4(ctr);
1329 #else
1330                 PUTU32(ctx->Yi.c + 12, ctr);
1331 #endif
1332             else
1333                 ctx->Yi.d[3] = ctr;
1334         }
1335         c = in[i];
1336         out[i] = c ^ ctx->EKi.c[n];
1337         ctx->Xi.c[n] ^= c;
1338         n = (n + 1) % 16;
1339         if (n == 0)
1340             GCM_MUL(ctx, Xi);
1341     }
1342
1343     ctx->mres = n;
1344     return 0;
1345 }
1346
1347 int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
1348                                 const unsigned char *in, unsigned char *out,
1349                                 size_t len, ctr128_f stream)
1350 {
1351 #if defined(OPENSSL_SMALL_FOOTPRINT)
1352     return CRYPTO_gcm128_encrypt(ctx, in, out, len);
1353 #else
1354     const union {
1355         long one;
1356         char little;
1357     } is_endian = { 1 };
1358     unsigned int n, ctr;
1359     size_t i;
1360     u64 mlen = ctx->len.u[1];
1361     void *key = ctx->key;
1362 # ifdef GCM_FUNCREF_4BIT
1363     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1364 #  ifdef GHASH
1365     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1366                          const u8 *inp, size_t len) = ctx->ghash;
1367 #  endif
1368 # endif
1369
1370     mlen += len;
1371     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1372         return -1;
1373     ctx->len.u[1] = mlen;
1374
1375     if (ctx->ares) {
1376         /* First call to encrypt finalizes GHASH(AAD) */
1377         GCM_MUL(ctx, Xi);
1378         ctx->ares = 0;
1379     }
1380
1381     if (is_endian.little)
1382 # ifdef BSWAP4
1383         ctr = BSWAP4(ctx->Yi.d[3]);
1384 # else
1385         ctr = GETU32(ctx->Yi.c + 12);
1386 # endif
1387     else
1388         ctr = ctx->Yi.d[3];
1389
1390     n = ctx->mres;
1391     if (n) {
1392         while (n && len) {
1393             ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n];
1394             --len;
1395             n = (n + 1) % 16;
1396         }
1397         if (n == 0)
1398             GCM_MUL(ctx, Xi);
1399         else {
1400             ctx->mres = n;
1401             return 0;
1402         }
1403     }
1404 # if defined(GHASH) && defined(GHASH_CHUNK)
1405     while (len >= GHASH_CHUNK) {
1406         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1407         ctr += GHASH_CHUNK / 16;
1408         if (is_endian.little)
1409 #  ifdef BSWAP4
1410             ctx->Yi.d[3] = BSWAP4(ctr);
1411 #  else
1412             PUTU32(ctx->Yi.c + 12, ctr);
1413 #  endif
1414         else
1415             ctx->Yi.d[3] = ctr;
1416         GHASH(ctx, out, GHASH_CHUNK);
1417         out += GHASH_CHUNK;
1418         in += GHASH_CHUNK;
1419         len -= GHASH_CHUNK;
1420     }
1421 # endif
1422     if ((i = (len & (size_t)-16))) {
1423         size_t j = i / 16;
1424
1425         (*stream) (in, out, j, key, ctx->Yi.c);
1426         ctr += (unsigned int)j;
1427         if (is_endian.little)
1428 # ifdef BSWAP4
1429             ctx->Yi.d[3] = BSWAP4(ctr);
1430 # else
1431             PUTU32(ctx->Yi.c + 12, ctr);
1432 # endif
1433         else
1434             ctx->Yi.d[3] = ctr;
1435         in += i;
1436         len -= i;
1437 # if defined(GHASH)
1438         GHASH(ctx, out, i);
1439         out += i;
1440 # else
1441         while (j--) {
1442             for (i = 0; i < 16; ++i)
1443                 ctx->Xi.c[i] ^= out[i];
1444             GCM_MUL(ctx, Xi);
1445             out += 16;
1446         }
1447 # endif
1448     }
1449     if (len) {
1450         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1451         ++ctr;
1452         if (is_endian.little)
1453 # ifdef BSWAP4
1454             ctx->Yi.d[3] = BSWAP4(ctr);
1455 # else
1456             PUTU32(ctx->Yi.c + 12, ctr);
1457 # endif
1458         else
1459             ctx->Yi.d[3] = ctr;
1460         while (len--) {
1461             ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n];
1462             ++n;
1463         }
1464     }
1465
1466     ctx->mres = n;
1467     return 0;
1468 #endif
1469 }
1470
1471 int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
1472                                 const unsigned char *in, unsigned char *out,
1473                                 size_t len, ctr128_f stream)
1474 {
1475 #if defined(OPENSSL_SMALL_FOOTPRINT)
1476     return CRYPTO_gcm128_decrypt(ctx, in, out, len);
1477 #else
1478     const union {
1479         long one;
1480         char little;
1481     } is_endian = { 1 };
1482     unsigned int n, ctr;
1483     size_t i;
1484     u64 mlen = ctx->len.u[1];
1485     void *key = ctx->key;
1486 # ifdef GCM_FUNCREF_4BIT
1487     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1488 #  ifdef GHASH
1489     void (*gcm_ghash_p) (u64 Xi[2], const u128 Htable[16],
1490                          const u8 *inp, size_t len) = ctx->ghash;
1491 #  endif
1492 # endif
1493
1494     mlen += len;
1495     if (mlen > ((U64(1) << 36) - 32) || (sizeof(len) == 8 && mlen < len))
1496         return -1;
1497     ctx->len.u[1] = mlen;
1498
1499     if (ctx->ares) {
1500         /* First call to decrypt finalizes GHASH(AAD) */
1501         GCM_MUL(ctx, Xi);
1502         ctx->ares = 0;
1503     }
1504
1505     if (is_endian.little)
1506 # ifdef BSWAP4
1507         ctr = BSWAP4(ctx->Yi.d[3]);
1508 # else
1509         ctr = GETU32(ctx->Yi.c + 12);
1510 # endif
1511     else
1512         ctr = ctx->Yi.d[3];
1513
1514     n = ctx->mres;
1515     if (n) {
1516         while (n && len) {
1517             u8 c = *(in++);
1518             *(out++) = c ^ ctx->EKi.c[n];
1519             ctx->Xi.c[n] ^= c;
1520             --len;
1521             n = (n + 1) % 16;
1522         }
1523         if (n == 0)
1524             GCM_MUL(ctx, Xi);
1525         else {
1526             ctx->mres = n;
1527             return 0;
1528         }
1529     }
1530 # if defined(GHASH) && defined(GHASH_CHUNK)
1531     while (len >= GHASH_CHUNK) {
1532         GHASH(ctx, in, GHASH_CHUNK);
1533         (*stream) (in, out, GHASH_CHUNK / 16, key, ctx->Yi.c);
1534         ctr += GHASH_CHUNK / 16;
1535         if (is_endian.little)
1536 #  ifdef BSWAP4
1537             ctx->Yi.d[3] = BSWAP4(ctr);
1538 #  else
1539             PUTU32(ctx->Yi.c + 12, ctr);
1540 #  endif
1541         else
1542             ctx->Yi.d[3] = ctr;
1543         out += GHASH_CHUNK;
1544         in += GHASH_CHUNK;
1545         len -= GHASH_CHUNK;
1546     }
1547 # endif
1548     if ((i = (len & (size_t)-16))) {
1549         size_t j = i / 16;
1550
1551 # if defined(GHASH)
1552         GHASH(ctx, in, i);
1553 # else
1554         while (j--) {
1555             size_t k;
1556             for (k = 0; k < 16; ++k)
1557                 ctx->Xi.c[k] ^= in[k];
1558             GCM_MUL(ctx, Xi);
1559             in += 16;
1560         }
1561         j = i / 16;
1562         in -= i;
1563 # endif
1564         (*stream) (in, out, j, key, ctx->Yi.c);
1565         ctr += (unsigned int)j;
1566         if (is_endian.little)
1567 # ifdef BSWAP4
1568             ctx->Yi.d[3] = BSWAP4(ctr);
1569 # else
1570             PUTU32(ctx->Yi.c + 12, ctr);
1571 # endif
1572         else
1573             ctx->Yi.d[3] = ctr;
1574         out += i;
1575         in += i;
1576         len -= i;
1577     }
1578     if (len) {
1579         (*ctx->block) (ctx->Yi.c, ctx->EKi.c, key);
1580         ++ctr;
1581         if (is_endian.little)
1582 # ifdef BSWAP4
1583             ctx->Yi.d[3] = BSWAP4(ctr);
1584 # else
1585             PUTU32(ctx->Yi.c + 12, ctr);
1586 # endif
1587         else
1588             ctx->Yi.d[3] = ctr;
1589         while (len--) {
1590             u8 c = in[n];
1591             ctx->Xi.c[n] ^= c;
1592             out[n] = c ^ ctx->EKi.c[n];
1593             ++n;
1594         }
1595     }
1596
1597     ctx->mres = n;
1598     return 0;
1599 #endif
1600 }
1601
1602 int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const unsigned char *tag,
1603                          size_t len)
1604 {
1605     const union {
1606         long one;
1607         char little;
1608     } is_endian = { 1 };
1609     u64 alen = ctx->len.u[0] << 3;
1610     u64 clen = ctx->len.u[1] << 3;
1611 #ifdef GCM_FUNCREF_4BIT
1612     void (*gcm_gmult_p) (u64 Xi[2], const u128 Htable[16]) = ctx->gmult;
1613 #endif
1614
1615     if (ctx->mres || ctx->ares)
1616         GCM_MUL(ctx, Xi);
1617
1618     if (is_endian.little) {
1619 #ifdef BSWAP8
1620         alen = BSWAP8(alen);
1621         clen = BSWAP8(clen);
1622 #else
1623         u8 *p = ctx->len.c;
1624
1625         ctx->len.u[0] = alen;
1626         ctx->len.u[1] = clen;
1627
1628         alen = (u64)GETU32(p) << 32 | GETU32(p + 4);
1629         clen = (u64)GETU32(p + 8) << 32 | GETU32(p + 12);
1630 #endif
1631     }
1632
1633     ctx->Xi.u[0] ^= alen;
1634     ctx->Xi.u[1] ^= clen;
1635     GCM_MUL(ctx, Xi);
1636
1637     ctx->Xi.u[0] ^= ctx->EK0.u[0];
1638     ctx->Xi.u[1] ^= ctx->EK0.u[1];
1639
1640     if (tag && len <= sizeof(ctx->Xi))
1641         return CRYPTO_memcmp(ctx->Xi.c, tag, len);
1642     else
1643         return -1;
1644 }
1645
1646 void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len)
1647 {
1648     CRYPTO_gcm128_finish(ctx, NULL, 0);
1649     memcpy(tag, ctx->Xi.c,
1650            len <= sizeof(ctx->Xi.c) ? len : sizeof(ctx->Xi.c));
1651 }
1652
1653 GCM128_CONTEXT *CRYPTO_gcm128_new(void *key, block128_f block)
1654 {
1655     GCM128_CONTEXT *ret;
1656
1657     if ((ret = OPENSSL_malloc(sizeof(*ret))) != NULL)
1658         CRYPTO_gcm128_init(ret, key, block);
1659
1660     return ret;
1661 }
1662
1663 void CRYPTO_gcm128_release(GCM128_CONTEXT *ctx)
1664 {
1665     OPENSSL_clear_free(ctx, sizeof(*ctx));
1666 }