crypto/ofb128.c: make it indent-friendly.
[openssl.git] / crypto / modes / gcm128.c
index 8a48e90ac548d408b2e50f4cd327fe31afe3ce43..7e856b54894a36561d2256251aade30269e36583 100644 (file)
@@ -47,7 +47,7 @@
  * ====================================================================
  */
 
-#define OPENSSL_FIPSAPI
+
 
 #include <openssl/crypto.h>
 #include "modes_lcl.h"
        } \
 } while(0)
 
+/*-
+ * Even though permitted values for TABLE_BITS are 8, 4 and 1, it should
+ * never be set to 8. 8 is effectively reserved for testing purposes.
+ * TABLE_BITS>1 are lookup-table-driven implementations referred to as
+ * "Shoup's" in GCM specification. In other words OpenSSL does not cover
+ * whole spectrum of possible table driven implementations. Why? In
+ * non-"Shoup's" case memory access pattern is segmented in such manner,
+ * that it's trivial to see that cache timing information can reveal
+ * fair portion of intermediate hash value. Given that ciphertext is
+ * always available to attacker, it's possible for him to attempt to
+ * deduce secret parameter H and, if successful, tamper with messages
+ * [which is trivial to do in CTR mode]. In "Shoup's" case it's
+ * not as trivial, but there is no reason to believe that it's resistant
+ * to cache-timing attack. And the thing about "8-bit" implementation is
+ * that it consumes 16 (sixteen) times more memory, 4KB per individual
+ * key + 1KB shared. On the plus side, it should be twice as fast as
+ * "4-bit" version. And for gcc-generated x86[_64] code, "8-bit" version
+ * was observed to run ~75% faster, closer to 100% for commercial
+ * compilers... Yet "4-bit" procedure is preferred, because it's
+ * believed to provide better security-performance balance and adequate
+ * all-round performance. "All-round" refers to things like:
+ *
+ * - shorter setup time effectively improves overall timing for
+ *   handling short messages;
+ * - larger table allocation can become unbearable because of VM
+ *   subsystem penalties (for example, on Windows a large enough free
+ *   results in VM working set trimming, meaning that a subsequent
+ *   malloc would immediately incur working set expansion);
+ * - larger table has larger cache footprint, which can affect
+ *   performance of other code paths (not necessarily even from same
+ *   thread in Hyper-Threading world);
+ *
+ * Value of 1 is not appropriate for performance reasons.
+ */
 #if    TABLE_BITS==8
 
 static void gcm_init_8bit(u128 Htable[256], u64 H[2])
@@ -108,7 +142,7 @@ static void gcm_init_8bit(u128 Htable[256], u64 H[2])
        }
 }
 
-static void gcm_gmult_8bit(u64 Xi[2], u128 Htable[256])
+static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
 {
        u128 Z = { 0, 0};
        const u8 *xi = (const u8 *)Xi+15;
@@ -608,30 +642,78 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
 
 #endif
 
-#if    TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
+#if    TABLE_BITS==4 && (defined(GHASH_ASM) || defined(OPENSSL_CPUID_OBJ))
+# if   !defined(I386_ONLY) && \
        (defined(__i386)        || defined(__i386__)    || \
         defined(__x86_64)      || defined(__x86_64__)  || \
         defined(_M_IX86)       || defined(_M_AMD64)    || defined(_M_X64))
-# define GHASH_ASM_IAX
+#  define GHASH_ASM_X86_OR_64
+#  define GCM_FUNCREF_4BIT
 extern unsigned int OPENSSL_ia32cap_P[2];
 
 void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
 void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 
-# if   defined(__i386) || defined(__i386__) || defined(_M_IX86)
-#  define GHASH_ASM_X86
+#if defined(__i386) || defined(__i386__) || defined(_M_IX86)
+# define gcm_init_avx  gcm_init_clmul
+# define gcm_gmult_avx gcm_gmult_clmul
+# define gcm_ghash_avx gcm_ghash_clmul
+#else
+void gcm_init_avx(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_avx(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_avx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#endif
+
+#  if  defined(__i386) || defined(__i386__) || defined(_M_IX86)
+#   define GHASH_ASM_X86
 void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 
 void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
 void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# elif defined(__arm__) || defined(__arm) || defined(__aarch64__)
+#  include "arm_arch.h"
+#  if __ARM_MAX_ARCH__>=7
+#   define GHASH_ASM_ARM
+#   define GCM_FUNCREF_4BIT
+#   define PMULL_CAPABLE       (OPENSSL_armcap_P & ARMV8_PMULL)
+#   if defined(__arm__) || defined(__arm)
+#    define NEON_CAPABLE       (OPENSSL_armcap_P & ARMV7_NEON)
+#   endif
+void gcm_init_neon(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+void gcm_init_v8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_v8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_v8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#  endif
+# elif defined(__sparc__) || defined(__sparc)
+#  include "sparc_arch.h"
+#  define GHASH_ASM_SPARC
+#  define GCM_FUNCREF_4BIT
+extern unsigned int OPENSSL_sparcv9cap_P[];
+void gcm_init_vis3(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_vis3(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_vis3(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
+#elif defined(OPENSSL_CPUID_OBJ) && (defined(__powerpc__) || defined(__ppc__) || defined(_ARCH_PPC))
+#  include "ppc_arch.h"
+#  define GHASH_ASM_PPC
+#  define GCM_FUNCREF_4BIT
+void gcm_init_p8(u128 Htable[16],const u64 Xi[2]);
+void gcm_gmult_p8(u64 Xi[2],const u128 Htable[16]);
+void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
 # endif
+#endif
 
+#ifdef GCM_FUNCREF_4BIT
 # undef  GCM_MUL
-# define GCM_MUL(ctx,Xi)   (*((ctx)->gmult))(ctx->Xi.u,ctx->Htable)
-# undef  GHASH
-# define GHASH(ctx,in,len) (*((ctx)->ghash))((ctx)->Xi.u,(ctx)->Htable,in,len)
+# define GCM_MUL(ctx,Xi)       (*gcm_gmult_p)(ctx->Xi.u,ctx->Htable)
+# ifdef GHASH
+#  undef  GHASH
+#  define GHASH(ctx,in,len)    (*gcm_ghash_p)(ctx->Xi.u,ctx->Htable,in,len)
+# endif
 #endif
 
 void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
@@ -662,18 +744,29 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
 #if    TABLE_BITS==8
        gcm_init_8bit(ctx->Htable,ctx->H.u);
 #elif  TABLE_BITS==4
-# if   defined(GHASH_ASM_IAX)                  /* both x86 and x86_64 */
+# if   defined(GHASH_ASM_X86_OR_64)
 #  if  !defined(GHASH_ASM_X86) || defined(OPENSSL_IA32_SSE2)
-       if (OPENSSL_ia32cap_P[1]&(1<<1)) {
-               gcm_init_clmul(ctx->Htable,ctx->H.u);
-               ctx->gmult = gcm_gmult_clmul;
-               ctx->ghash = gcm_ghash_clmul;
+       if (OPENSSL_ia32cap_P[0]&(1<<24) &&     /* check FXSR bit */
+           OPENSSL_ia32cap_P[1]&(1<<1) ) {     /* check PCLMULQDQ bit */
+               if (((OPENSSL_ia32cap_P[1]>>22)&0x41)==0x41) {  /* AVX+MOVBE */
+                       gcm_init_avx(ctx->Htable,ctx->H.u);
+                       ctx->gmult = gcm_gmult_avx;
+                       ctx->ghash = gcm_ghash_avx;
+               } else {
+                       gcm_init_clmul(ctx->Htable,ctx->H.u);
+                       ctx->gmult = gcm_gmult_clmul;
+                       ctx->ghash = gcm_ghash_clmul;
+               }
                return;
        }
 #  endif
        gcm_init_4bit(ctx->Htable,ctx->H.u);
 #  if  defined(GHASH_ASM_X86)                  /* x86 only */
-       if (OPENSSL_ia32cap_P[0]&(1<<23)) {
+#   if defined(OPENSSL_IA32_SSE2)
+       if (OPENSSL_ia32cap_P[0]&(1<<25)) {     /* check SSE bit */
+#   else
+       if (OPENSSL_ia32cap_P[0]&(1<<23)) {     /* check MMX bit */
+#   endif
                ctx->gmult = gcm_gmult_4bit_mmx;
                ctx->ghash = gcm_ghash_4bit_mmx;
        } else {
@@ -684,6 +777,46 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
        ctx->gmult = gcm_gmult_4bit;
        ctx->ghash = gcm_ghash_4bit;
 #  endif
+# elif defined(GHASH_ASM_ARM)
+#  ifdef PMULL_CAPABLE
+       if (PMULL_CAPABLE) {
+               gcm_init_v8(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_v8;
+               ctx->ghash = gcm_ghash_v8;
+       } else
+#  endif
+#  ifdef NEON_CAPABLE
+       if (NEON_CAPABLE) {
+               gcm_init_neon(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_neon;
+               ctx->ghash = gcm_ghash_neon;
+       } else
+#  endif
+       {
+               gcm_init_4bit(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_4bit;
+               ctx->ghash = gcm_ghash_4bit;
+       }
+# elif defined(GHASH_ASM_SPARC)
+       if (OPENSSL_sparcv9cap_P[0] & SPARCV9_VIS3) {
+               gcm_init_vis3(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_vis3;
+               ctx->ghash = gcm_ghash_vis3;
+       } else {
+               gcm_init_4bit(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_4bit;
+               ctx->ghash = gcm_ghash_4bit;
+       }
+# elif defined(GHASH_ASM_PPC)
+       if (OPENSSL_ppccap_P & PPC_CRYPTO207) {
+               gcm_init_p8(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_p8;
+               ctx->ghash = gcm_ghash_p8;
+       } else {
+               gcm_init_4bit(ctx->Htable,ctx->H.u);
+               ctx->gmult = gcm_gmult_4bit;
+               ctx->ghash = gcm_ghash_4bit;
+       }
 # else
        gcm_init_4bit(ctx->Htable,ctx->H.u);
 # endif
@@ -694,6 +827,9 @@ void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
 {
        const union { long one; char little; } is_endian = {1};
        unsigned int ctr;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+#endif
 
        ctx->Yi.u[0]  = 0;
        ctx->Yi.u[1]  = 0;
@@ -744,7 +880,11 @@ void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
                GCM_MUL(ctx,Yi);
 
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctr = BSWAP4(ctx->Yi.d[3]);
+#else
                        ctr = GETU32(ctx->Yi.c+12);
+#endif
                else
                        ctr = ctx->Yi.d[3];
        }
@@ -752,7 +892,11 @@ void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx,const unsigned char *iv,size_t len)
        (*ctx->block)(ctx->Yi.c,ctx->EK0.c,ctx->key);
        ++ctr;
        if (is_endian.little)
+#ifdef BSWAP4
+               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                PUTU32(ctx->Yi.c+12,ctr);
+#endif
        else
                ctx->Yi.d[3] = ctr;
 }
@@ -762,6 +906,13 @@ int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx,const unsigned char *aad,size_t len)
        size_t i;
        unsigned int n;
        u64 alen = ctx->len.u[0];
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+# ifdef GHASH
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx->ghash;
+# endif
+#endif
 
        if (ctx->len.u[1]) return -2;
 
@@ -814,7 +965,16 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
-       u64 mlen = ctx->len.u[1];
+       u64        mlen  = ctx->len.u[1];
+       block128_f block = ctx->block;
+       void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+# ifdef GHASH
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx->ghash;
+# endif
+#endif
 
 #if 0
        n = (unsigned int)mlen%16; /* alternative to ctx->mres */
@@ -831,13 +991,18 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
        }
 
        if (is_endian.little)
+#ifdef BSWAP4
+               ctr = BSWAP4(ctx->Yi.d[3]);
+#else
                ctr = GETU32(ctx->Yi.c+12);
+#endif
        else
                ctr = ctx->Yi.d[3];
 
        n = ctx->mres;
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-       if (16%sizeof(size_t) == 0) do {        /* always true actually */
+       if (16%sizeof(size_t) == 0) {   /* always true actually */
+           do {
                if (n) {
                        while (n && len) {
                                ctx->Xi.c[n] ^= *(out++) = *(in++)^ctx->EKi.c[n];
@@ -859,15 +1024,21 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                    size_t j=GHASH_CHUNK;
 
                    while (j) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t))
-                               *(size_t *)(out+i) =
-                               *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+                       for (i=0; i<16/sizeof(size_t); ++i)
+                               out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        j   -= 16;
@@ -879,15 +1050,21 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                    size_t j=i;
 
                    while (len>=16) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t))
-                               *(size_t *)(out+i) =
-                               *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+                       for (i=0; i<16/sizeof(size_t); ++i)
+                               out_t[i] = in_t[i] ^ ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        len -= 16;
@@ -896,16 +1073,22 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                }
 #else
                while (len>=16) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t))
-                               *(size_t *)(ctx->Xi.c+i) ^=
-                               *(size_t *)(out+i) =
-                               *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+                       for (i=0; i<16/sizeof(size_t); ++i)
+                               ctx->Xi.t[i] ^=
+                               out_t[i] = in_t[i]^ctx->EKi.t[i];
                        GCM_MUL(ctx,Xi);
                        out += 16;
                        in  += 16;
@@ -913,10 +1096,14 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
                }
 #endif
                if (len) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        while (len--) {
@@ -927,14 +1114,19 @@ int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx,
 
                ctx->mres = n;
                return 0;
-       } while(0);
+           } while(0);
+       }
 #endif
        for (i=0;i<len;++i) {
                if (n==0) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
                }
@@ -955,7 +1147,16 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
-       u64 mlen = ctx->len.u[1];
+       u64        mlen  = ctx->len.u[1];
+       block128_f block = ctx->block;
+       void      *key   = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+# ifdef GHASH
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx->ghash;
+# endif
+#endif
 
        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
@@ -969,13 +1170,18 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
        }
 
        if (is_endian.little)
+#ifdef BSWAP4
+               ctr = BSWAP4(ctx->Yi.d[3]);
+#else
                ctr = GETU32(ctx->Yi.c+12);
+#endif
        else
                ctr = ctx->Yi.d[3];
 
        n = ctx->mres;
 #if !defined(OPENSSL_SMALL_FOOTPRINT)
-       if (16%sizeof(size_t) == 0) do {        /* always true actually */
+       if (16%sizeof(size_t) == 0) {   /* always true actually */
+           do {
                if (n) {
                        while (n && len) {
                                u8 c = *(in++);
@@ -1000,15 +1206,21 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
 
                    GHASH(ctx,in,GHASH_CHUNK);
                    while (j) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t))
-                               *(size_t *)(out+i) =
-                               *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+                       for (i=0; i<16/sizeof(size_t); ++i)
+                               out_t[i] = in_t[i]^ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        j   -= 16;
@@ -1018,15 +1230,21 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                if ((i = (len&(size_t)-16))) {
                    GHASH(ctx,in,i);
                    while (len>=16) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t))
-                               *(size_t *)(out+i) =
-                               *(size_t *)(in+i)^*(size_t *)(ctx->EKi.c+i);
+                       for (i=0; i<16/sizeof(size_t); ++i)
+                               out_t[i] = in_t[i]^ctx->EKi.t[i];
                        out += 16;
                        in  += 16;
                        len -= 16;
@@ -1034,16 +1252,23 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                }
 #else
                while (len>=16) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       size_t *out_t=(size_t *)out;
+                       const size_t *in_t=(const size_t *)in;
+
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
-                       for (i=0; i<16; i+=sizeof(size_t)) {
-                               size_t c = *(size_t *)(in+i);
-                               *(size_t *)(out+i) = c^*(size_t *)(ctx->EKi.c+i);
-                               *(size_t *)(ctx->Xi.c+i) ^= c;
+                       for (i=0; i<16/sizeof(size_t); ++i) {
+                               size_t c = in[i];
+                               out[i] = c^ctx->EKi.t[i];
+                               ctx->Xi.t[i] ^= c;
                        }
                        GCM_MUL(ctx,Xi);
                        out += 16;
@@ -1052,10 +1277,14 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
                }
 #endif
                if (len) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
                        while (len--) {
@@ -1068,15 +1297,20 @@ int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx,
 
                ctx->mres = n;
                return 0;
-       } while(0);
+           } while(0);
+       }
 #endif
        for (i=0;i<len;++i) {
                u8 c;
                if (n==0) {
-                       (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+                       (*block)(ctx->Yi.c,ctx->EKi.c,key);
                        ++ctr;
                        if (is_endian.little)
+#ifdef BSWAP4
+                               ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                                PUTU32(ctx->Yi.c+12,ctr);
+#endif
                        else
                                ctx->Yi.d[3] = ctr;
                }
@@ -1099,7 +1333,15 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
-       u64 mlen = ctx->len.u[1];
+       u64   mlen = ctx->len.u[1];
+       void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+# ifdef GHASH
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx->ghash;
+# endif
+#endif
 
        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
@@ -1113,7 +1355,11 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        }
 
        if (is_endian.little)
+#ifdef BSWAP4
+               ctr = BSWAP4(ctx->Yi.d[3]);
+#else
                ctr = GETU32(ctx->Yi.c+12);
+#endif
        else
                ctr = ctx->Yi.d[3];
 
@@ -1132,10 +1378,14 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        }
 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        while (len>=GHASH_CHUNK) {
-               (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
+               (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                GHASH(ctx,out,GHASH_CHUNK);
@@ -1147,10 +1397,14 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
        if ((i = (len&(size_t)-16))) {
                size_t j=i/16;
 
-               (*stream)(in,out,j,ctx->key,ctx->Yi.c);
+               (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                in  += i;
@@ -1167,10 +1421,14 @@ int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx,
 #endif
        }
        if (len) {
-               (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+               (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
@@ -1190,7 +1448,15 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        const union { long one; char little; } is_endian = {1};
        unsigned int n, ctr;
        size_t i;
-       u64 mlen = ctx->len.u[1];
+       u64   mlen = ctx->len.u[1];
+       void *key  = ctx->key;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+# ifdef GHASH
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx->ghash;
+# endif
+#endif
 
        mlen += len;
        if (mlen>((U64(1)<<36)-32) || (sizeof(len)==8 && mlen<len))
@@ -1204,7 +1470,11 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
        }
 
        if (is_endian.little)
+#ifdef BSWAP4
+               ctr = BSWAP4(ctx->Yi.d[3]);
+#else
                ctr = GETU32(ctx->Yi.c+12);
+#endif
        else
                ctr = ctx->Yi.d[3];
 
@@ -1226,10 +1496,14 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
 #if defined(GHASH) && !defined(OPENSSL_SMALL_FOOTPRINT)
        while (len>=GHASH_CHUNK) {
                GHASH(ctx,in,GHASH_CHUNK);
-               (*stream)(in,out,GHASH_CHUNK/16,ctx->key,ctx->Yi.c);
+               (*stream)(in,out,GHASH_CHUNK/16,key,ctx->Yi.c);
                ctr += GHASH_CHUNK/16;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                out += GHASH_CHUNK;
@@ -1252,10 +1526,14 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                j   = i/16;
                in -= i;
 #endif
-               (*stream)(in,out,j,ctx->key,ctx->Yi.c);
+               (*stream)(in,out,j,key,ctx->Yi.c);
                ctr += (unsigned int)j;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                out += i;
@@ -1263,10 +1541,14 @@ int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx,
                len -= i;
        }
        if (len) {
-               (*ctx->block)(ctx->Yi.c,ctx->EKi.c,ctx->key);
+               (*ctx->block)(ctx->Yi.c,ctx->EKi.c,key);
                ++ctr;
                if (is_endian.little)
+#ifdef BSWAP4
+                       ctx->Yi.d[3] = BSWAP4(ctr);
+#else
                        PUTU32(ctx->Yi.c+12,ctr);
+#endif
                else
                        ctx->Yi.d[3] = ctr;
                while (len--) {
@@ -1287,8 +1569,11 @@ int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx,const unsigned char *tag,
        const union { long one; char little; } is_endian = {1};
        u64 alen = ctx->len.u[0]<<3;
        u64 clen = ctx->len.u[1]<<3;
+#ifdef GCM_FUNCREF_4BIT
+       void (*gcm_gmult_p)(u64 Xi[2],const u128 Htable[16])    = ctx->gmult;
+#endif
 
-       if (ctx->mres)
+       if (ctx->mres || ctx->ares)
                GCM_MUL(ctx,Xi);
 
        if (is_endian.little) {
@@ -1352,213 +1637,502 @@ static const u8       K1[16],
                *P1=NULL,
                *A1=NULL,
                IV1[12],
-               *C1=NULL,
-               T1[]=  {0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a};
+               *C1=NULL;
+static const u8        T1[]=  {
+                       0x58,0xe2,0xfc,0xce,0xfa,0x7e,0x30,0x61,
+                       0x36,0x7f,0x1d,0x57,0xa4,0xe7,0x45,0x5a
+                       };
 
 /* Test Case 2 */
 #define K2 K1
 #define A2 A1
 #define IV2 IV1
-static const u8        P2[16],
-               C2[]=  {0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78},
-               T2[]=  {0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf};
+static const u8        P2[16];
+static const u8        C2[]=  {
+                       0x03,0x88,0xda,0xce,0x60,0xb6,0xa3,0x92,
+                       0xf3,0x28,0xc2,0xb9,0x71,0xb2,0xfe,0x78
+                       };
+static const u8        T2[]=  {
+                       0xab,0x6e,0x47,0xd4,0x2c,0xec,0x13,0xbd,
+                       0xf5,0x3a,0x67,0xb2,0x12,0x57,0xbd,0xdf
+                       };
 
 /* Test Case 3 */
 #define A3 A2
-static const u8        K3[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
-               P3[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-               IV3[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-               C3[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
-                       0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
-                       0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
-                       0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85},
-               T3[]=  {0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4};
+static const u8        K3[]=  {
+                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,
+                       0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08
+                       };
+static const u8        P3[]=  {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55
+                       };
+static const u8        IV3[]= {
+                       0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,
+                       0xde,0xca,0xf8,0x88};
+static const u8        C3[]=  {
+                       0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,
+                       0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+                       0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,
+                       0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+                       0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,
+                       0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+                       0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,
+                       0x3d,0x58,0xe0,0x91,0x47,0x3f,0x59,0x85
+                       };
+static const u8        T3[]=  {
+                       0x4d,0x5c,0x2a,0xf3,0x27,0xcd,0x64,0xa6,
+                       0x2c,0xf3,0x5a,0xbd,0x2b,0xa6,0xfa,0xb4
+                       };
 
 /* Test Case 4 */
 #define K4 K3
 #define IV4 IV3
-static const u8        P4[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-               A4[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-                       0xab,0xad,0xda,0xd2},
-               C4[]=  {0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
-                       0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
-                       0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
-                       0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,0x3d,0x58,0xe0,0x91},
-               T4[]=  {0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47};
+static const u8        P4[]=  {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39};
+static const u8        A4[]=  {
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xab,0xad,0xda,0xd2};
+static const u8        C4[]=  {
+                       0x42,0x83,0x1e,0xc2,0x21,0x77,0x74,0x24,
+                       0x4b,0x72,0x21,0xb7,0x84,0xd0,0xd4,0x9c,
+                       0xe3,0xaa,0x21,0x2f,0x2c,0x02,0xa4,0xe0,
+                       0x35,0xc1,0x7e,0x23,0x29,0xac,0xa1,0x2e,
+                       0x21,0xd5,0x14,0xb2,0x54,0x66,0x93,0x1c,
+                       0x7d,0x8f,0x6a,0x5a,0xac,0x84,0xaa,0x05,
+                       0x1b,0xa3,0x0b,0x39,0x6a,0x0a,0xac,0x97,
+                       0x3d,0x58,0xe0,0x91
+                       };
+static const u8        T4[]=  {
+                       0x5b,0xc9,0x4f,0xbc,0x32,0x21,0xa5,0xdb,
+                       0x94,0xfa,0xe9,0x5a,0xe7,0x12,0x1a,0x47
+                       };
 
 /* Test Case 5 */
 #define K5 K4
 #define P5 P4
-static const u8        A5[]=  {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-                       0xab,0xad,0xda,0xd2},
-               IV5[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-               C5[]=  {0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
-                       0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
-                       0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
-                       0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,0xc2,0x3f,0x45,0x98},
-               T5[]=  {0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb};
+#define A5 A4
+static const u8        IV5[]= {
+                       0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad
+                       };
+static const u8        C5[]=  {
+                       0x61,0x35,0x3b,0x4c,0x28,0x06,0x93,0x4a,
+                       0x77,0x7f,0xf5,0x1f,0xa2,0x2a,0x47,0x55,
+                       0x69,0x9b,0x2a,0x71,0x4f,0xcd,0xc6,0xf8,
+                       0x37,0x66,0xe5,0xf9,0x7b,0x6c,0x74,0x23,
+                       0x73,0x80,0x69,0x00,0xe4,0x9f,0x24,0xb2,
+                       0x2b,0x09,0x75,0x44,0xd4,0x89,0x6b,0x42,
+                       0x49,0x89,0xb5,0xe1,0xeb,0xac,0x0f,0x07,
+                       0xc2,0x3f,0x45,0x98};
+static const u8        T5[]=  {
+                       0x36,0x12,0xd2,0xe7,0x9e,0x3b,0x07,0x85,
+                       0x56,0x1b,0xe1,0x4a,0xac,0xa2,0xfc,0xcb
+                       };
 
 /* Test Case 6 */
 #define K6 K5
 #define P6 P5
 #define A6 A5
-static const u8        IV6[]= {0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-               C6[]=  {0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
-                       0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
-                       0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
-                       0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,0x4c,0x34,0xae,0xe5},
-               T6[]=  {0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50};
+static const u8        IV6[]= {
+                       0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,
+                       0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,
+                       0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,
+                       0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,
+                       0xa6,0x37,0xb3,0x9b
+                       };
+static const u8        C6[]=  {
+                       0x8c,0xe2,0x49,0x98,0x62,0x56,0x15,0xb6,
+                       0x03,0xa0,0x33,0xac,0xa1,0x3f,0xb8,0x94,
+                       0xbe,0x91,0x12,0xa5,0xc3,0xa2,0x11,0xa8,
+                       0xba,0x26,0x2a,0x3c,0xca,0x7e,0x2c,0xa7,
+                       0x01,0xe4,0xa9,0xa4,0xfb,0xa4,0x3c,0x90,
+                       0xcc,0xdc,0xb2,0x81,0xd4,0x8c,0x7c,0x6f,
+                       0xd6,0x28,0x75,0xd2,0xac,0xa4,0x17,0x03,
+                       0x4c,0x34,0xae,0xe5
+                       };
+static const u8        T6[]=  {
+                       0x61,0x9c,0xc5,0xae,0xff,0xfe,0x0b,0xfa,
+                       0x46,0x2a,0xf4,0x3c,0x16,0x99,0xd0,0x50
+                       };
 
 /* Test Case 7 */
 static const u8 K7[24],
                *P7=NULL,
                *A7=NULL,
                IV7[12],
-               *C7=NULL,
-               T7[]=  {0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35};
+               *C7=NULL;
+static const u8        T7[]=  {
+                       0xcd,0x33,0xb2,0x8a,0xc7,0x73,0xf7,0x4b,
+                       0xa0,0x0e,0xd1,0xf3,0x12,0x57,0x24,0x35
+                       };
 
 /* Test Case 8 */
 #define K8 K7
 #define IV8 IV7
 #define A8 A7
-static const u8        P8[16],
-               C8[]=  {0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00},
-               T8[]=  {0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb};
+static const u8        P8[16];
+static const u8        C8[]=  {
+                       0x98,0xe7,0x24,0x7c,0x07,0xf0,0xfe,0x41,
+                       0x1c,0x26,0x7e,0x43,0x84,0xb0,0xf6,0x00
+                       };
+static const u8        T8[]=  {
+                       0x2f,0xf5,0x8d,0x80,0x03,0x39,0x27,0xab,
+                       0x8e,0xf4,0xd4,0x58,0x75,0x14,0xf0,0xfb
+                       };
 
 /* Test Case 9 */
 #define A9 A8
-static const u8        K9[]=  {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
-                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c},
-               P9[]=  {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-               IV9[]= {0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-               C9[]=  {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
-                       0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
-                       0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
-                       0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56},
-               T9[]=  {0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14};
+static const u8        K9[]=  {
+                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,
+                       0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c
+                       };
+static const u8        P9[]=  {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55
+                       };
+static const u8        IV9[]= {
+                       0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,
+                       0xde,0xca,0xf8,0x88
+                       };
+static const u8        C9[]=  {
+                       0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,
+                       0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+                       0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,
+                       0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+                       0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,
+                       0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+                       0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,
+                       0xcc,0xda,0x27,0x10,0xac,0xad,0xe2,0x56
+                       };
+static const u8        T9[]=  {
+                       0x99,0x24,0xa7,0xc8,0x58,0x73,0x36,0xbf,
+                       0xb1,0x18,0x02,0x4d,0xb8,0x67,0x4a,0x14
+                       };
 
 /* Test Case 10 */
 #define K10 K9
 #define IV10 IV9
-static const u8        P10[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-               A10[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-                       0xab,0xad,0xda,0xd2},
-               C10[]= {0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
-                       0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
-                       0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
-                       0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,0xcc,0xda,0x27,0x10},
-               T10[]= {0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c};
+static const u8        P10[]= {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39
+                       };
+static const u8        A10[]= {
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xab,0xad,0xda,0xd2
+                       };
+static const u8        C10[]= {
+                       0x39,0x80,0xca,0x0b,0x3c,0x00,0xe8,0x41,
+                       0xeb,0x06,0xfa,0xc4,0x87,0x2a,0x27,0x57,
+                       0x85,0x9e,0x1c,0xea,0xa6,0xef,0xd9,0x84,
+                       0x62,0x85,0x93,0xb4,0x0c,0xa1,0xe1,0x9c,
+                       0x7d,0x77,0x3d,0x00,0xc1,0x44,0xc5,0x25,
+                       0xac,0x61,0x9d,0x18,0xc8,0x4a,0x3f,0x47,
+                       0x18,0xe2,0x44,0x8b,0x2f,0xe3,0x24,0xd9,
+                       0xcc,0xda,0x27,0x10
+                       };
+static const u8        T10[]= {
+                       0x25,0x19,0x49,0x8e,0x80,0xf1,0x47,0x8f,
+                       0x37,0xba,0x55,0xbd,0x6d,0x27,0x61,0x8c
+                       };
 
 /* Test Case 11 */
 #define K11 K10
 #define P11 P10
 #define A11 A10
-static const u8        IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-               C11[]= {0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
-                       0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
-                       0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
-                       0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,0xa0,0xf0,0x62,0xf7},
-               T11[]= {0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8};
+static const u8        IV11[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad};
+static const u8        C11[]= {
+                       0x0f,0x10,0xf5,0x99,0xae,0x14,0xa1,0x54,
+                       0xed,0x24,0xb3,0x6e,0x25,0x32,0x4d,0xb8,
+                       0xc5,0x66,0x63,0x2e,0xf2,0xbb,0xb3,0x4f,
+                       0x83,0x47,0x28,0x0f,0xc4,0x50,0x70,0x57,
+                       0xfd,0xdc,0x29,0xdf,0x9a,0x47,0x1f,0x75,
+                       0xc6,0x65,0x41,0xd4,0xd4,0xda,0xd1,0xc9,
+                       0xe9,0x3a,0x19,0xa5,0x8e,0x8b,0x47,0x3f,
+                       0xa0,0xf0,0x62,0xf7};
+static const u8        T11[]= {
+                       0x65,0xdc,0xc5,0x7f,0xcf,0x62,0x3a,0x24,
+                       0x09,0x4f,0xcc,0xa4,0x0d,0x35,0x33,0xf8
+                       };
 
 /* Test Case 12 */
 #define K12 K11
 #define P12 P11
 #define A12 A11
-static const u8        IV12[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-               C12[]= {0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
-                       0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
-                       0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
-                       0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,0xe9,0xb7,0x37,0x3b},
-               T12[]= {0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9};
+static const u8        IV12[]={
+                       0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,
+                       0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,
+                       0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,
+                       0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,
+                       0xa6,0x37,0xb3,0x9b
+                       };
+static const u8        C12[]= {
+                       0xd2,0x7e,0x88,0x68,0x1c,0xe3,0x24,0x3c,
+                       0x48,0x30,0x16,0x5a,0x8f,0xdc,0xf9,0xff,
+                       0x1d,0xe9,0xa1,0xd8,0xe6,0xb4,0x47,0xef,
+                       0x6e,0xf7,0xb7,0x98,0x28,0x66,0x6e,0x45,
+                       0x81,0xe7,0x90,0x12,0xaf,0x34,0xdd,0xd9,
+                       0xe2,0xf0,0x37,0x58,0x9b,0x29,0x2d,0xb3,
+                       0xe6,0x7c,0x03,0x67,0x45,0xfa,0x22,0xe7,
+                       0xe9,0xb7,0x37,0x3b
+                       };
+static const u8        T12[]= {
+                       0xdc,0xf5,0x66,0xff,0x29,0x1c,0x25,0xbb,
+                       0xb8,0x56,0x8f,0xc3,0xd3,0x76,0xa6,0xd9
+                       };
 
 /* Test Case 13 */
 static const u8        K13[32],
                *P13=NULL,
                *A13=NULL,
                IV13[12],
-               *C13=NULL,
-               T13[]={0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b};
+               *C13=NULL;
+static const u8        T13[]= {
+                       0x53,0x0f,0x8a,0xfb,0xc7,0x45,0x36,0xb9,
+                       0xa9,0x63,0xb4,0xf1,0xc4,0xcb,0x73,0x8b
+                       };
 
 /* Test Case 14 */
 #define K14 K13
 #define A14 A13
 static const u8        P14[16],
-               IV14[12],
-               C14[]= {0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18},
-               T14[]= {0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19};
+               IV14[12];
+static const u8        C14[]= {
+                       0xce,0xa7,0x40,0x3d,0x4d,0x60,0x6b,0x6e,
+                       0x07,0x4e,0xc5,0xd3,0xba,0xf3,0x9d,0x18
+                       };
+static const u8        T14[]= {
+                       0xd0,0xd1,0xc8,0xa7,0x99,0x99,0x6b,0xf0,
+                       0x26,0x5b,0x98,0xb5,0xd4,0x8a,0xb9,0x19
+                       };
 
 /* Test Case 15 */
 #define A15 A14
-static const u8        K15[]= {0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
-                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08},
-               P15[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55},
-               IV15[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,0xde,0xca,0xf8,0x88},
-               C15[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
-                       0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
-                       0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
-                       0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad},
-               T15[]= {0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c};
+static const u8        K15[]= {
+                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,
+                       0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08,
+                       0xfe,0xff,0xe9,0x92,0x86,0x65,0x73,0x1c,
+                       0x6d,0x6a,0x8f,0x94,0x67,0x30,0x83,0x08
+                       };
+static const u8        P15[]= {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55
+                       };
+static const u8        IV15[]={
+                       0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad,
+                       0xde,0xca,0xf8,0x88
+                       };
+static const u8        C15[]= {
+                       0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,
+                       0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+                       0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,
+                       0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+                       0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,
+                       0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+                       0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,
+                       0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad
+                       };
+static const u8        T15[]= {
+                       0xb0,0x94,0xda,0xc5,0xd9,0x34,0x71,0xbd,
+                       0xec,0x1a,0x50,0x22,0x70,0xe3,0xcc,0x6c
+                       };
 
 /* Test Case 16 */
 #define K16 K15
 #define IV16 IV15
-static const u8        P16[]= {0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
-                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
-                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
-                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,0xba,0x63,0x7b,0x39},
-               A16[]= {0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
-                       0xab,0xad,0xda,0xd2},
-               C16[]= {0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
-                       0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
-                       0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
-                       0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,0xbc,0xc9,0xf6,0x62},
-               T16[]= {0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b};
+static const u8        P16[]= {
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39
+                       };
+static const u8        A16[]= {
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xfe,0xed,0xfa,0xce,0xde,0xad,0xbe,0xef,
+                       0xab,0xad,0xda,0xd2
+                       };
+static const u8        C16[]= {
+                       0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,
+                       0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+                       0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,
+                       0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+                       0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,
+                       0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+                       0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,
+                       0xbc,0xc9,0xf6,0x62
+                       };
+static const u8        T16[]= {
+                       0x76,0xfc,0x6e,0xce,0x0f,0x4e,0x17,0x68,
+                       0xcd,0xdf,0x88,0x53,0xbb,0x2d,0x55,0x1b
+                       };
 
 /* Test Case 17 */
 #define K17 K16
 #define P17 P16
 #define A17 A16
-static const u8        IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad},
-               C17[]= {0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
-                       0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
-                       0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
-                       0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,0xf4,0x7c,0x9b,0x1f},
-               T17[]= {0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2};
+static const u8        IV17[]={0xca,0xfe,0xba,0xbe,0xfa,0xce,0xdb,0xad};
+static const u8        C17[]= {
+                       0xc3,0x76,0x2d,0xf1,0xca,0x78,0x7d,0x32,
+                       0xae,0x47,0xc1,0x3b,0xf1,0x98,0x44,0xcb,
+                       0xaf,0x1a,0xe1,0x4d,0x0b,0x97,0x6a,0xfa,
+                       0xc5,0x2f,0xf7,0xd7,0x9b,0xba,0x9d,0xe0,
+                       0xfe,0xb5,0x82,0xd3,0x39,0x34,0xa4,0xf0,
+                       0x95,0x4c,0xc2,0x36,0x3b,0xc7,0x3f,0x78,
+                       0x62,0xac,0x43,0x0e,0x64,0xab,0xe4,0x99,
+                       0xf4,0x7c,0x9b,0x1f
+                       };
+static const u8        T17[]= {
+                       0x3a,0x33,0x7d,0xbf,0x46,0xa7,0x92,0xc4,
+                       0x5e,0x45,0x49,0x13,0xfe,0x2e,0xa8,0xf2
+                       };
 
 /* Test Case 18 */
 #define K18 K17
 #define P18 P17
 #define A18 A17
-static const u8        IV18[]={0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
-                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
-                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
-                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,0xa6,0x37,0xb3,0x9b},
-               C18[]= {0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
-                       0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
-                       0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
-                       0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,0x44,0xae,0x7e,0x3f},
-               T18[]= {0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a};
+static const u8        IV18[]={
+                       0x93,0x13,0x22,0x5d,0xf8,0x84,0x06,0xe5,
+                       0x55,0x90,0x9c,0x5a,0xff,0x52,0x69,0xaa,
+                       0x6a,0x7a,0x95,0x38,0x53,0x4f,0x7d,0xa1,
+                       0xe4,0xc3,0x03,0xd2,0xa3,0x18,0xa7,0x28,
+                       0xc3,0xc0,0xc9,0x51,0x56,0x80,0x95,0x39,
+                       0xfc,0xf0,0xe2,0x42,0x9a,0x6b,0x52,0x54,
+                       0x16,0xae,0xdb,0xf5,0xa0,0xde,0x6a,0x57,
+                       0xa6,0x37,0xb3,0x9b
+                       };
+static const u8        C18[]= {
+                       0x5a,0x8d,0xef,0x2f,0x0c,0x9e,0x53,0xf1,
+                       0xf7,0x5d,0x78,0x53,0x65,0x9e,0x2a,0x20,
+                       0xee,0xb2,0xb2,0x2a,0xaf,0xde,0x64,0x19,
+                       0xa0,0x58,0xab,0x4f,0x6f,0x74,0x6b,0xf4,
+                       0x0f,0xc0,0xc3,0xb7,0x80,0xf2,0x44,0x45,
+                       0x2d,0xa3,0xeb,0xf1,0xc5,0xd8,0x2c,0xde,
+                       0xa2,0x41,0x89,0x97,0x20,0x0e,0xf8,0x2e,
+                       0x44,0xae,0x7e,0x3f
+                       };
+static const u8        T18[]= {
+                       0xa4,0x4a,0x82,0x66,0xee,0x1c,0x8e,0xb0,
+                       0xc8,0xb5,0xd4,0xcf,0x5a,0xe9,0xf1,0x9a
+                       };
+
+/* Test Case 19: K19, P19, IV19 and C19 alias the test case 1 vectors; only A19 is defined here */
+#define K19 K1
+#define P19 P1
+#define IV19 IV1
+#define C19 C1
+static const u8 A19[]= {               /* 128 bytes (16 rows of 8) */
+                       0xd9,0x31,0x32,0x25,0xf8,0x84,0x06,0xe5,
+                       0xa5,0x59,0x09,0xc5,0xaf,0xf5,0x26,0x9a,
+                       0x86,0xa7,0xa9,0x53,0x15,0x34,0xf7,0xda,
+                       0x2e,0x4c,0x30,0x3d,0x8a,0x31,0x8a,0x72,
+                       0x1c,0x3c,0x0c,0x95,0x95,0x68,0x09,0x53,
+                       0x2f,0xcf,0x0e,0x24,0x49,0xa6,0xb5,0x25,
+                       0xb1,0x6a,0xed,0xf5,0xaa,0x0d,0xe6,0x57,
+                       0xba,0x63,0x7b,0x39,0x1a,0xaf,0xd2,0x55,
+                       0x52,0x2d,0xc1,0xf0,0x99,0x56,0x7d,0x07,
+                       0xf4,0x7f,0x37,0xa3,0x2a,0x84,0x42,0x7d,
+                       0x64,0x3a,0x8c,0xdc,0xbf,0xe5,0xc0,0xc9,
+                       0x75,0x98,0xa2,0xbd,0x25,0x55,0xd1,0xaa,
+                       0x8c,0xb0,0x8e,0x48,0x59,0x0d,0xbb,0x3d,
+                       0xa7,0xb0,0x8b,0x10,0x56,0x82,0x88,0x38,
+                       0xc5,0xf6,0x1e,0x63,0x93,0xba,0x7a,0x0a,
+                       0xbc,0xc9,0xf6,0x62,0x89,0x80,0x15,0xad
+                       };
+static const u8        T19[]= {        /* 16 bytes; NOTE(review): presumably the expected authentication tag for test case 19 - confirm */
+                       0x5f,0xea,0x79,0x3a,0x2d,0x6f,0x97,0x4d,
+                       0x37,0xe6,0x8e,0x0c,0xb8,0xff,0x94,0x92
+                       };
+
+/* Test Case 20: K20 and A20 alias the test case 1 vectors; IV20 and P20 are defined here */
+#define K20 K1
+#define A20 A1
+static const u8 IV20[64]={0xff,0xff,0xff,0xff};        /* 64-byte IV: four 0xff bytes, remaining 60 implicitly zero; this results in 0xff in counter LSB */
+static const u8        P20[288];       /* 288 bytes, all zero: static storage with no initializer */
+static const u8        C20[]= {        /* 288 bytes (36 rows of 8), the same length as P20 */
+                       0x56,0xb3,0x37,0x3c,0xa9,0xef,0x6e,0x4a,
+                       0x2b,0x64,0xfe,0x1e,0x9a,0x17,0xb6,0x14,
+                       0x25,0xf1,0x0d,0x47,0xa7,0x5a,0x5f,0xce,
+                       0x13,0xef,0xc6,0xbc,0x78,0x4a,0xf2,0x4f,
+                       0x41,0x41,0xbd,0xd4,0x8c,0xf7,0xc7,0x70,
+                       0x88,0x7a,0xfd,0x57,0x3c,0xca,0x54,0x18,
+                       0xa9,0xae,0xff,0xcd,0x7c,0x5c,0xed,0xdf,
+                       0xc6,0xa7,0x83,0x97,0xb9,0xa8,0x5b,0x49,
+                       0x9d,0xa5,0x58,0x25,0x72,0x67,0xca,0xab,
+                       0x2a,0xd0,0xb2,0x3c,0xa4,0x76,0xa5,0x3c,
+                       0xb1,0x7f,0xb4,0x1c,0x4b,0x8b,0x47,0x5c,
+                       0xb4,0xf3,0xf7,0x16,0x50,0x94,0xc2,0x29,
+                       0xc9,0xe8,0xc4,0xdc,0x0a,0x2a,0x5f,0xf1,
+                       0x90,0x3e,0x50,0x15,0x11,0x22,0x13,0x76,
+                       0xa1,0xcd,0xb8,0x36,0x4c,0x50,0x61,0xa2,
+                       0x0c,0xae,0x74,0xbc,0x4a,0xcd,0x76,0xce,
+                       0xb0,0xab,0xc9,0xfd,0x32,0x17,0xef,0x9f,
+                       0x8c,0x90,0xbe,0x40,0x2d,0xdf,0x6d,0x86,
+                       0x97,0xf4,0xf8,0x80,0xdf,0xf1,0x5b,0xfb,
+                       0x7a,0x6b,0x28,0x24,0x1e,0xc8,0xfe,0x18,
+                       0x3c,0x2d,0x59,0xe3,0xf9,0xdf,0xff,0x65,
+                       0x3c,0x71,0x26,0xf0,0xac,0xb9,0xe6,0x42,
+                       0x11,0xf4,0x2b,0xae,0x12,0xaf,0x46,0x2b,
+                       0x10,0x70,0xbe,0xf1,0xab,0x5e,0x36,0x06,
+                       0x87,0x2c,0xa1,0x0d,0xee,0x15,0xb3,0x24,
+                       0x9b,0x1a,0x1b,0x95,0x8f,0x23,0x13,0x4c,
+                       0x4b,0xcc,0xb7,0xd0,0x32,0x00,0xbc,0xe4,
+                       0x20,0xa2,0xf8,0xeb,0x66,0xdc,0xf3,0x64,
+                       0x4d,0x14,0x23,0xc1,0xb5,0x69,0x90,0x03,
+                       0xc1,0x3e,0xce,0xf4,0xbf,0x38,0xa3,0xb6,
+                       0x0e,0xed,0xc3,0x40,0x33,0xba,0xc1,0x90,
+                       0x27,0x83,0xdc,0x6d,0x89,0xe2,0xe7,0x74,
+                       0x18,0x8a,0x43,0x9c,0x7e,0xbc,0xc0,0x67,
+                       0x2d,0xbd,0xa4,0xdd,0xcf,0xb2,0x79,0x46,
+                       0x13,0xb0,0xbe,0x41,0x31,0x5e,0xf7,0x78,
+                       0x70,0x8a,0x70,0xee,0x7d,0x75,0x16,0x5c
+                       };
+static const u8        T20[]= {        /* 16 bytes; NOTE(review): presumably the expected authentication tag for test case 20 - confirm */
+                       0x8b,0x30,0x7f,0x6b,0x33,0x28,0x6d,0x0a,
+                       0xb0,0x26,0xa9,0xed,0x3f,0xe1,0xe8,0x5f
+                       };
 
 #define TEST_CASE(n)   do {                                    \
        u8 out[sizeof(P##n)];                                   \
@@ -1604,6 +2178,8 @@ int main()
        TEST_CASE(16);
        TEST_CASE(17);
        TEST_CASE(18);
+       TEST_CASE(19);
+       TEST_CASE(20);
 
 #ifdef OPENSSL_CPUID_OBJ
        {
@@ -1634,11 +2210,16 @@ int main()
                        ctr_t/(double)sizeof(buf),
                        (gcm_t-ctr_t)/(double)sizeof(buf));
 #ifdef GHASH
-       GHASH(&ctx,buf.c,sizeof(buf));
+       {
+       void (*gcm_ghash_p)(u64 Xi[2],const u128 Htable[16],
+                               const u8 *inp,size_t len)       = ctx.ghash;
+
+       GHASH((&ctx),buf.c,sizeof(buf));
        start = OPENSSL_rdtsc();
-       for (i=0;i<100;++i) GHASH(&ctx,buf.c,sizeof(buf));
+       for (i=0;i<100;++i) GHASH((&ctx),buf.c,sizeof(buf));
        gcm_t = OPENSSL_rdtsc() - start;
        printf("%.2f\n",gcm_t/(double)sizeof(buf)/(double)i);
+       }
 #endif
        }
 #endif