evp/e_chacha20_poly1305.c: further improve small-fragment TLS performance.
author: Andy Polyakov <appro@openssl.org>
Tue, 3 Jul 2018 19:34:08 +0000 (21:34 +0200)
committer: Andy Polyakov <appro@openssl.org>
Fri, 6 Jul 2018 14:33:19 +0000 (16:33 +0200)
Improvement coefficients vary with TLS fragment length and platform, on
most Intel processors maximum improvement is ~50%, while on Ryzen - 80%.
The "secret" is new dedicated ChaCha20_128 code path and vectorized xor
helpers.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6638)

crypto/evp/e_chacha20_poly1305.c
crypto/poly1305/asm/poly1305-x86_64.pl

index 47d5e50..6a9bccf 100644 (file)
@@ -196,14 +196,23 @@ static int chacha20_poly1305_init_key(EVP_CIPHER_CTX *ctx,
 }
 
 #  if !defined(OPENSSL_SMALL_FOOTPRINT)
+
+#   if defined(POLY1305_ASM) && (defined(__x86_64) || defined(__x86_64__) || \
+                                 defined(_M_AMD64) || defined(_M_X64))
+#    define XOR128_HELPERS
+void *xor128_encrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
+void *xor128_decrypt_n_pad(void *out, const void *inp, void *otp, size_t len);
+static const unsigned char zero[4 * CHACHA_BLK_SIZE] = { 0 };
+#   else
 static const unsigned char zero[2 * CHACHA_BLK_SIZE] = { 0 };
+#   endif
 
 static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
                                         const unsigned char *in, size_t len)
 {
     EVP_CHACHA_AEAD_CTX *actx = aead_data(ctx);
-    size_t i, tail, tohash_len, plen = actx->tls_payload_length;
-    unsigned char *buf, *tohash, *ctr, storage[2 * CHACHA_BLK_SIZE + 32];
+    size_t tail, tohash_len, buf_len, plen = actx->tls_payload_length;
+    unsigned char *buf, *tohash, *ctr, storage[sizeof(zero) + 32];
 
     if (len != plen + POLY1305_BLOCK_SIZE)
         return -1;
@@ -212,9 +221,11 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
     ctr = buf + CHACHA_BLK_SIZE;
     tohash = buf + CHACHA_BLK_SIZE - POLY1305_BLOCK_SIZE;
 
-    if (plen <= CHACHA_BLK_SIZE) {
+#   ifdef XOR128_HELPERS
+    if (plen <= 3 * CHACHA_BLK_SIZE) {
         actx->key.counter[0] = 0;
-        ChaCha20_ctr32(buf, zero, 2 * CHACHA_BLK_SIZE, actx->key.key.d,
+        buf_len = (plen + 2 * CHACHA_BLK_SIZE - 1) & (0 - CHACHA_BLK_SIZE);
+        ChaCha20_ctr32(buf, zero, buf_len, actx->key.key.d,
                        actx->key.counter);
         Poly1305_Init(POLY1305_ctx(actx), buf);
         actx->key.partial_len = 0;
@@ -223,6 +234,31 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
         actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
         actx->len.text = plen;
 
+        if (plen) {
+            if (ctx->encrypt)
+                ctr = xor128_encrypt_n_pad(out, in, ctr, plen);
+            else
+                ctr = xor128_decrypt_n_pad(out, in, ctr, plen);
+
+            in += plen;
+            out += plen;
+            tohash_len = (size_t)(ctr - tohash);
+        }
+    }
+#   else
+    if (plen <= CHACHA_BLK_SIZE) {
+        size_t i;
+
+        actx->key.counter[0] = 0;
+        ChaCha20_ctr32(buf, zero, (buf_len = 2 * CHACHA_BLK_SIZE),
+                       actx->key.key.d, actx->key.counter);
+        Poly1305_Init(POLY1305_ctx(actx), buf);
+        actx->key.partial_len = 0;
+        memcpy(tohash, actx->tls_aad, POLY1305_BLOCK_SIZE);
+        tohash_len = POLY1305_BLOCK_SIZE;
+        actx->len.aad = EVP_AEAD_TLS1_AAD_LEN;
+        actx->len.text = plen;
+
         if (ctx->encrypt) {
             for (i = 0; i < plen; i++) {
                 out[i] = ctr[i] ^= in[i];
@@ -242,10 +278,12 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
         memset(ctr + i, 0, tail);
         ctr += i + tail;
         tohash_len += i + tail;
-    } else {
+    }
+#   endif
+    else {
         actx->key.counter[0] = 0;
-        ChaCha20_ctr32(buf, zero, CHACHA_BLK_SIZE, actx->key.key.d,
-                       actx->key.counter);
+        ChaCha20_ctr32(buf, zero, (buf_len = CHACHA_BLK_SIZE),
+                       actx->key.key.d, actx->key.counter);
         Poly1305_Init(POLY1305_ctx(actx), buf);
         actx->key.counter[0] = 1;
         actx->key.partial_len = 0;
@@ -300,7 +338,7 @@ static int chacha20_poly1305_tls_cipher(EVP_CIPHER_CTX *ctx, unsigned char *out,
     }
 
     Poly1305_Update(POLY1305_ctx(actx), tohash, tohash_len);
-    OPENSSL_cleanse(buf, 2 * CHACHA_BLK_SIZE);
+    OPENSSL_cleanse(buf, buf_len);
     Poly1305_Final(POLY1305_ctx(actx), ctx->encrypt ? actx->tag
                                                     : tohash);
 
index 0d1c0de..0b4c56e 100755 (executable)
@@ -3753,6 +3753,110 @@ poly1305_emit_base2_44:
 .size  poly1305_emit_base2_44,.-poly1305_emit_base2_44
 ___
 }      }       }
+
+{      # chacha20-poly1305 helpers
+my ($out,$inp,$otp,$len)=$win64 ? ("%rcx","%rdx","%r8", "%r9") :  # Win64 order
+                                  ("%rdi","%rsi","%rdx","%rcx");  # Unix order
+$code.=<<___;
+.globl xor128_encrypt_n_pad
+.type  xor128_encrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_encrypt_n_pad:
+       sub     $otp,$inp
+       sub     $otp,$out
+       mov     $len,%r10               # put len aside
+       shr     \$4,$len                # len / 16
+       jz      .Ltail_enc
+       nop
+.Loop_enc_xmm:
+       movdqu  ($inp,$otp),%xmm0
+       pxor    ($otp),%xmm0
+       movdqu  %xmm0,($out,$otp)
+       movdqa  %xmm0,($otp)
+       lea     16($otp),$otp
+       dec     $len
+       jnz     .Loop_enc_xmm
+
+       and     \$15,%r10               # len % 16
+       jz      .Ldone_enc
+
+.Ltail_enc:
+       mov     \$16,$len
+       sub     %r10,$len
+       xor     %eax,%eax
+.Loop_enc_byte:
+       mov     ($inp,$otp),%al
+       xor     ($otp),%al
+       mov     %al,($out,$otp)
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     %r10
+       jnz     .Loop_enc_byte
+
+       xor     %eax,%eax
+.Loop_enc_pad:
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     $len
+       jnz     .Loop_enc_pad
+
+.Ldone_enc:
+       mov     $otp,%rax
+       ret
+.size  xor128_encrypt_n_pad,.-xor128_encrypt_n_pad
+
+.globl xor128_decrypt_n_pad
+.type  xor128_decrypt_n_pad,\@abi-omnipotent
+.align 16
+xor128_decrypt_n_pad:
+       sub     $otp,$inp
+       sub     $otp,$out
+       mov     $len,%r10               # put len aside
+       shr     \$4,$len                # len / 16
+       jz      .Ltail_dec
+       nop
+.Loop_dec_xmm:
+       movdqu  ($inp,$otp),%xmm0
+       movdqa  ($otp),%xmm1
+       pxor    %xmm0,%xmm1
+       movdqu  %xmm1,($out,$otp)
+       movdqa  %xmm0,($otp)
+       lea     16($otp),$otp
+       dec     $len
+       jnz     .Loop_dec_xmm
+
+       pxor    %xmm1,%xmm1
+       and     \$15,%r10               # len % 16
+       jz      .Ldone_dec
+
+.Ltail_dec:
+       mov     \$16,$len
+       sub     %r10,$len
+       xor     %eax,%eax
+       xor     %r11,%r11
+.Loop_dec_byte:
+       mov     ($inp,$otp),%r11b
+       mov     ($otp),%al
+       xor     %r11b,%al
+       mov     %al,($out,$otp)
+       mov     %r11b,($otp)
+       lea     1($otp),$otp
+       dec     %r10
+       jnz     .Loop_dec_byte
+
+       xor     %eax,%eax
+.Loop_dec_pad:
+       mov     %al,($otp)
+       lea     1($otp),$otp
+       dec     $len
+       jnz     .Loop_dec_pad
+
+.Ldone_dec:
+       mov     $otp,%rax
+       ret
+.size  xor128_decrypt_n_pad,.-xor128_decrypt_n_pad
+___
+}
 $code.=<<___;
 .align 64
 .Lconst: