evp/e_aes_cbc_hmac_sha*.c: improve cache locality.
authorAndy Polyakov <appro@openssl.org>
Thu, 13 Feb 2014 13:39:55 +0000 (14:39 +0100)
committerAndy Polyakov <appro@openssl.org>
Thu, 13 Feb 2014 13:39:55 +0000 (14:39 +0100)
crypto/evp/e_aes_cbc_hmac_sha1.c
crypto/evp/e_aes_cbc_hmac_sha256.c

index 09f9281..0b6f292 100644 (file)
@@ -205,13 +205,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                u32     d[32];
                u8      c[128]; } blocks[8];
        SHA1_MB_CTX     *ctx;
-       unsigned int    frag, last, packlen, i, x4=4*n4x;
+       unsigned int    frag, last, packlen, i, x4=4*n4x, minblocks, processed=0;
        size_t          ret = 0;
        u8              *IVs;
 #if defined(BSWAP8)
        u64             seqnum;
 #endif
 
+       RAND_bytes((IVs=blocks[0].c),16*x4);    /* ask for IVs in bulk */
+
        ctx = (SHA1_MB_CTX *)(storage+32-((size_t)storage%32)); /* align */
 
        frag = (unsigned int)inp_len>>(1+n4x);
@@ -221,8 +223,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                last -= x4-1;
        }
 
+       packlen = 5+16+((frag+20+16)&-16);
+
+       /* populate descriptors with pointers and IVs */
        hash_d[0].ptr = inp;
-       for (i=1;i<x4;i++)      hash_d[i].ptr = hash_d[i-1].ptr+frag;
+       ciph_d[0].inp = inp;
+       ciph_d[0].out = out+5+16;       /* 5+16 is place for header and explicit IV */
+       memcpy(ciph_d[0].out-16,IVs,16);
+       memcpy(ciph_d[0].iv,IVs,16);    IVs += 16;
+
+       for (i=1;i<x4;i++) {
+               ciph_d[i].inp = hash_d[i].ptr = hash_d[i-1].ptr+frag;
+               ciph_d[i].out = ciph_d[i-1].out+packlen;
+               memcpy(ciph_d[i].out-16,IVs,16);
+               memcpy(ciph_d[i].iv,IVs,16);    IVs+=16;
+       }
 
 #if defined(BSWAP8)
        memcpy(blocks[0].c,key->md.data,8);
@@ -268,6 +283,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
        /* hash 13-byte headers and first 64-13 bytes of inputs */
        sha1_multi_block(ctx,edges,n4x);
        /* hash bulk inputs */
+#define        MAXCHUNKSIZE    2048
+#if    MAXCHUNKSIZE%64
+#error "MAXCHUNKSIZE is not divisible by 64"
+#elif  MAXCHUNKSIZE
+       /* goal is to minimize pressure on L1 cache by moving
+        * in shorter steps, so that hashed data is still in
+        * the cache by the time we encrypt it */
+       minblocks = ((frag<=last ? frag : last)-(64-13))/64;
+       if (minblocks>MAXCHUNKSIZE/64) {
+               for (i=0;i<x4;i++) {
+                       edges[i].ptr     = hash_d[i].ptr;
+                       edges[i].blocks  = MAXCHUNKSIZE/64;
+                       ciph_d[i].blocks = MAXCHUNKSIZE/16;
+               }
+               do {
+                       sha1_multi_block(ctx,edges,n4x);
+                       aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
+
+                       for (i=0;i<x4;i++) {
+                               edges[i].ptr     = hash_d[i].ptr += MAXCHUNKSIZE;
+                               hash_d[i].blocks -= MAXCHUNKSIZE/64;
+                               edges[i].blocks  = MAXCHUNKSIZE/64;
+                               ciph_d[i].inp    += MAXCHUNKSIZE;
+                               ciph_d[i].out    += MAXCHUNKSIZE;
+                               ciph_d[i].blocks = MAXCHUNKSIZE/16;
+                               memcpy(ciph_d[i].iv,ciph_d[i].out-16,16);
+                       }
+                       processed += MAXCHUNKSIZE;
+                       minblocks -= MAXCHUNKSIZE/64;
+               } while (minblocks>MAXCHUNKSIZE/64);
+       }
+#endif
+#undef MAXCHUNKSIZE
        sha1_multi_block(ctx,hash_d,n4x);
 
        memset(blocks,0,sizeof(blocks));
@@ -276,7 +324,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                                        off = hash_d[i].blocks*64;
                const unsigned char    *ptr = hash_d[i].ptr+off;
 
-               off = len-(64-13)-off;  /* remainder actually */
+               off = (len-processed)-(64-13)-off;      /* remainder actually */
                memcpy(blocks[i].c,ptr,off);
                blocks[i].c[off]=0x80;
                len += 64+13;           /* 64 is HMAC header */
@@ -310,23 +358,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
        /* finalize MACs */
        sha1_multi_block(ctx,edges,n4x);
 
-       packlen = 5+16+((frag+20+16)&-16);
-
-       out += (packlen<<(1+n4x))-packlen;
-       inp += (frag<<(1+n4x))-frag;
-
-       RAND_bytes((IVs=blocks[0].c),16*x4);    /* ask for IVs in bulk */
-
-       for (i=x4-1;;i--) {
+       for (i=0;i<x4;i++) {
                unsigned int len = (i==(x4-1)?last:frag), pad, j;
                unsigned char *out0 = out;
 
-               out += 5+16;            /* place for header and explicit IV */
-               ciph_d[i].inp = out;
-               ciph_d[i].out = out;
+               memcpy(ciph_d[i].out,ciph_d[i].inp,len-processed);
+               ciph_d[i].inp = ciph_d[i].out;
 
-               memmove(out,inp,len);
-               out += len;
+               out += 5+16+len;
 
                /* write MAC */
                ((u32 *)out)[0] = BSWAP4(ctx->A[i]);
@@ -342,7 +381,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                for (j=0;j<=pad;j++) *(out++) = pad;
                len += pad+1;
 
-               ciph_d[i].blocks = len/16;
+               ciph_d[i].blocks = (len-processed)/16;
                len += 16;      /* account for explicit iv */
 
                /* arrange header */
@@ -352,17 +391,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                out0[3] = (u8)(len>>8);
                out0[4] = (u8)(len);
 
-               /* explicit iv */
-               memcpy(ciph_d[i].iv, IVs, 16);
-               memcpy(&out0[5],     IVs, 16);
-
                ret += len+5;
-
-               if (i==0) break;
-
-               out = out0-packlen;
-               inp -= frag;
-               IVs += 16;
+               inp += frag;
        }
 
        aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
index 95bdd42..c2c48f0 100644 (file)
@@ -201,13 +201,15 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                u32     d[32];
                u8      c[128]; } blocks[8];
        SHA256_MB_CTX   *ctx;
-       unsigned int    frag, last, packlen, i, x4=4*n4x;
+       unsigned int    frag, last, packlen, i, x4=4*n4x, minblocks, processed=0;
        size_t          ret = 0;
        u8              *IVs;
 #if defined(BSWAP8)
        u64             seqnum;
 #endif
 
+       RAND_bytes((IVs=blocks[0].c),16*x4);    /* ask for IVs in bulk */
+
        ctx = (SHA256_MB_CTX *)(storage+32-((size_t)storage%32));       /* align */
 
        frag = (unsigned int)inp_len>>(1+n4x);
@@ -217,8 +219,21 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                last -= x4-1;
        }
 
+       packlen = 5+16+((frag+32+16)&-16);
+
+       /* populate descriptors with pointers and IVs */
        hash_d[0].ptr = inp;
-       for (i=1;i<x4;i++)      hash_d[i].ptr = hash_d[i-1].ptr+frag;
+       ciph_d[0].inp = inp;
+       ciph_d[0].out = out+5+16;       /* 5+16 is place for header and explicit IV */
+       memcpy(ciph_d[0].out-16,IVs,16);
+       memcpy(ciph_d[0].iv,IVs,16);    IVs += 16;
+
+       for (i=1;i<x4;i++) {
+               ciph_d[i].inp = hash_d[i].ptr = hash_d[i-1].ptr+frag;
+               ciph_d[i].out = ciph_d[i-1].out+packlen;
+               memcpy(ciph_d[i].out-16,IVs,16);
+               memcpy(ciph_d[i].iv,IVs,16);    IVs+=16;
+       }
 
 #if defined(BSWAP8)
        memcpy(blocks[0].c,key->md.data,8);
@@ -267,6 +282,39 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
        /* hash 13-byte headers and first 64-13 bytes of inputs */
        sha256_multi_block(ctx,edges,n4x);
        /* hash bulk inputs */
+#define        MAXCHUNKSIZE    2048
+#if    MAXCHUNKSIZE%64
+#error "MAXCHUNKSIZE is not divisible by 64"
+#elif  MAXCHUNKSIZE
+       /* goal is to minimize pressure on L1 cache by moving
+        * in shorter steps, so that hashed data is still in
+        * the cache by the time we encrypt it */
+       minblocks = ((frag<=last ? frag : last)-(64-13))/64;
+       if (minblocks>MAXCHUNKSIZE/64) {
+               for (i=0;i<x4;i++) {
+                       edges[i].ptr     = hash_d[i].ptr;
+                       edges[i].blocks  = MAXCHUNKSIZE/64;
+                       ciph_d[i].blocks = MAXCHUNKSIZE/16;
+               }
+               do {
+                       sha256_multi_block(ctx,edges,n4x);
+                       aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);
+
+                       for (i=0;i<x4;i++) {
+                               edges[i].ptr     = hash_d[i].ptr += MAXCHUNKSIZE;
+                               hash_d[i].blocks -= MAXCHUNKSIZE/64;
+                               edges[i].blocks  = MAXCHUNKSIZE/64;
+                               ciph_d[i].inp    += MAXCHUNKSIZE;
+                               ciph_d[i].out    += MAXCHUNKSIZE;
+                               ciph_d[i].blocks = MAXCHUNKSIZE/16;
+                               memcpy(ciph_d[i].iv,ciph_d[i].out-16,16);
+                       }
+                       processed += MAXCHUNKSIZE;
+                       minblocks -= MAXCHUNKSIZE/64;
+               } while (minblocks>MAXCHUNKSIZE/64);
+       }
+#endif
+#undef MAXCHUNKSIZE
        sha256_multi_block(ctx,hash_d,n4x);
 
        memset(blocks,0,sizeof(blocks));
@@ -275,7 +323,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                                        off = hash_d[i].blocks*64;
                const unsigned char    *ptr = hash_d[i].ptr+off;
 
-               off = len-(64-13)-off;  /* remainder actually */
+               off = (len-processed)-(64-13)-off;      /* remainder actually */
                memcpy(blocks[i].c,ptr,off);
                blocks[i].c[off]=0x80;
                len += 64+13;           /* 64 is HMAC header */
@@ -312,23 +360,14 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
        /* finalize MACs */
        sha256_multi_block(ctx,edges,n4x);
 
-       packlen = 5+16+((frag+32+16)&-16);
-
-       out += (packlen<<(1+n4x))-packlen;
-       inp += (frag<<(1+n4x))-frag;
-
-       RAND_bytes((IVs=blocks[0].c),16*x4);    /* ask for IVs in bulk */
-
-       for (i=x4-1;;i--) {
+       for (i=0;i<x4;i++) {
                unsigned int len = (i==(x4-1)?last:frag), pad, j;
                unsigned char *out0 = out;
 
-               out += 5+16;            /* place for header and explicit IV */
-               ciph_d[i].inp = out;
-               ciph_d[i].out = out;
+               memcpy(ciph_d[i].out,ciph_d[i].inp,len-processed);
+               ciph_d[i].inp = ciph_d[i].out;
 
-               memmove(out,inp,len);
-               out += len;
+               out += 5+16+len;
 
                /* write MAC */
                ((u32 *)out)[0] = BSWAP4(ctx->A[i]);
@@ -347,7 +386,7 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                for (j=0;j<=pad;j++) *(out++) = pad;
                len += pad+1;
 
-               ciph_d[i].blocks = len/16;
+               ciph_d[i].blocks = (len-processed)/16;
                len += 16;      /* account for explicit iv */
 
                /* arrange header */
@@ -357,17 +396,8 @@ static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                out0[3] = (u8)(len>>8);
                out0[4] = (u8)(len);
 
-               /* explicit iv */
-               memcpy(ciph_d[i].iv, IVs, 16);
-               memcpy(&out0[5],     IVs, 16);
-
                ret += len+5;
-
-               if (i==0) break;
-
-               out = out0-packlen;
-               inp -= frag;
-               IVs += 16;
+               inp += frag;
        }
 
        aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);