chacha/asm/chacha-x86_64.pl: add dedicated path for 128-byte inputs.
author Andy Polyakov <appro@openssl.org>
Mon, 2 Jul 2018 11:16:33 +0000 (13:16 +0200)
committer Andy Polyakov <appro@openssl.org>
Tue, 3 Jul 2018 17:02:02 +0000 (19:02 +0200)
The 128-byte vectors are extensively used in chacha20_poly1305_tls_cipher,
and a dedicated code path is ~30-50% faster on most platforms.

Reviewed-by: Rich Salz <rsalz@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/6626)

crypto/chacha/asm/chacha-x86_64.pl

index 51bb6a9..b54f3b1 100755 (executable)
@@ -1,5 +1,5 @@
 #! /usr/bin/env perl
-# Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2016-2018 The OpenSSL Project Authors. All Rights Reserved.
 #
 # Licensed under the OpenSSL license (the "License").  You may not use
 # this file except in compliance with the License.  You can obtain a copy
 #
 # Performance in cycles per byte out of large buffer.
 #
-#              IALU/gcc 4.8(i) 1xSSSE3/SSE2    4xSSSE3     NxAVX(v)
+#              IALU/gcc 4.8(i) 1x/2xSSSE3(ii)  4xSSSE3     NxAVX(v)
 #
-# P4           9.48/+99%       -/22.7(ii)      -
-# Core2                7.83/+55%       7.90/8.08       4.35
-# Westmere     7.19/+50%       5.60/6.70       3.00
-# Sandy Bridge 8.31/+42%       5.45/6.76       2.72
-# Ivy Bridge   6.71/+46%       5.40/6.49       2.41
-# Haswell      5.92/+43%       5.20/6.45       2.42        1.23
-# Skylake[-X]  5.87/+39%       4.70/-          2.31        1.19[0.80(vi)]
-# Silvermont   12.0/+33%       7.75/7.40       7.03(iii)
-# Knights L    11.7/-          -               9.60(iii)   0.80
-# Goldmont     10.6/+17%       5.10/-          3.28
-# Sledgehammer 7.28/+52%       -/14.2(ii)      -
-# Bulldozer    9.66/+28%       9.85/11.1       3.06(iv)
-# Ryzen                5.96/+50%       5.19/-          2.40        2.09
-# VIA Nano     10.5/+46%       6.72/8.60       6.05
+# P4           9.48/+99%       -               -
+# Core2                7.83/+55%       7.90/5.76       4.35
+# Westmere     7.19/+50%       5.60/4.50       3.00
+# Sandy Bridge 8.31/+42%       5.45/4.00       2.72
+# Ivy Bridge   6.71/+46%       5.40/?          2.41
+# Haswell      5.92/+43%       5.20/3.45       2.42        1.23
+# Skylake[-X]  5.87/+39%       4.70/3.22       2.31        1.19[0.80(vi)]
+# Silvermont   12.0/+33%       7.75/6.90       7.03(iii)
+# Knights L    11.7/-          ?               9.60(iii)   0.80
+# Goldmont     10.6/+17%       5.10/3.52       3.28
+# Sledgehammer 7.28/+52%       -               -
+# Bulldozer    9.66/+28%       9.85/5.35(iv)   3.06(iv)
+# Ryzen                5.96/+50%       5.19/3.00       2.40        2.09
+# VIA Nano     10.5/+46%       6.72/6.88       6.05
 #
 # (i)  compared to older gcc 3.x one can observe >2x improvement on
 #      most platforms;
-# (ii) as it can be seen, SSE2 performance is too low on legacy
-#      processors; NxSSE2 results are naturally better, but not
-#      impressively better than IALU ones, which is why you won't
-#      find SSE2 code below;
+# (ii) 2xSSSE3 is a code path optimized specifically for the 128-byte
+#      inputs used by chacha20_poly1305_tls_cipher; results are EVP-free;
 # (iii)        this is not optimal result for Atom because of MSROM
 #      limitations, SSE2 can do better, but gain is considered too
 #      low to justify the [maintenance] effort;
-# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20;
+# (iv) Bulldozer actually executes 4xXOP code path that delivers 2.20
+#      and 4.85 for 128-byte inputs;
 # (v)  8xAVX2, 8xAVX512VL or 16xAVX512F, whichever best applicable;
 # (vi) even though Skylake-X can execute AVX512F code and deliver 0.57
 #      cpb in single thread, the corresponding capability is suppressed;
@@ -489,6 +488,7 @@ $code.=<<___        if ($avx);
 ___
 $code.=<<___;
        cmp     \$128,$len              # we might throw away some data,
+       je      .LChaCha20_128
        ja      .LChaCha20_4x           # but overall it won't be slower
 
 .Ldo_sse3_after_all:
@@ -605,6 +605,172 @@ $code.=<<___;
 ___
 }
 
+########################################################################
+# SSSE3 code path that handles 128-byte inputs
+{
+my ($a,$b,$c,$d,$t,$t1,$rot16,$rot24)=map("%xmm$_",(8,9,2..7));
+my ($a1,$b1,$c1,$d1)=map("%xmm$_",(10,11,0,1));
+
+sub SSSE3ROUND_2x {                     # one ChaCha round on two register sets, interleaved
+       &paddd  ($a,$b);                # a += b (per 32-bit lane)
+       &pxor   ($d,$a);                # d ^= a
+        &paddd ($a1,$b1);              # same steps on the second set
+        &pxor  ($d1,$a1);
+       &pshufb ($d,$rot16);            # byte shuffle; presumably d <<<= 16 (mask not shown)
+        &pshufb($d1,$rot16);
+
+       &paddd  ($c,$d);                # c += d
+        &paddd ($c1,$d1);
+       &pxor   ($b,$c);                # b ^= c
+        &pxor  ($b1,$c1);
+       &movdqa ($t,$b);                # copy b: no packed rotate, so build it from shifts
+       &psrld  ($b,20);                # b >> 20
+        &movdqa($t1,$b1);
+       &pslld  ($t,12);                # t = b << 12
+        &psrld ($b1,20);
+       &por    ($b,$t);                # b = b <<< 12
+        &pslld ($t1,12);
+        &por   ($b1,$t1);              # b1 = b1 <<< 12
+
+       &paddd  ($a,$b);                # a += b
+       &pxor   ($d,$a);                # d ^= a
+        &paddd ($a1,$b1);
+        &pxor  ($d1,$a1);
+       &pshufb ($d,$rot24);            # byte shuffle; presumably d <<<= 8 (mask not shown)
+        &pshufb($d1,$rot24);
+
+       &paddd  ($c,$d);                # c += d
+        &paddd ($c1,$d1);
+       &pxor   ($b,$c);                # b ^= c
+        &pxor  ($b1,$c1);
+       &movdqa ($t,$b);                # copy b for the shift pair below
+       &psrld  ($b,25);                # b >> 25
+        &movdqa($t1,$b1);
+       &pslld  ($t,7);                 # t = b << 7
+        &psrld ($b1,25);
+       &por    ($b,$t);                # b = b <<< 7
+        &pslld ($t1,7);
+        &por   ($b1,$t1);              # b1 = b1 <<< 7
+}
+
+my $xframe = $win64 ? 0x68 : 8;
+
+$code.=<<___;
+.type  ChaCha20_128,\@function,5
+.align 32
+ChaCha20_128:
+.cfi_startproc
+.LChaCha20_128:
+       mov     %rsp,%r9                # frame pointer
+.cfi_def_cfa_register  %r9
+       sub     \$64+$xframe,%rsp
+___
+$code.=<<___   if ($win64);
+       movaps  %xmm6,-0x68(%r9)
+       movaps  %xmm7,-0x58(%r9)
+       movaps  %xmm8,-0x48(%r9)
+       movaps  %xmm9,-0x38(%r9)
+       movaps  %xmm10,-0x28(%r9)
+       movaps  %xmm11,-0x18(%r9)
+.L128_body:
+___
+$code.=<<___;
+       movdqa  .Lsigma(%rip),$a
+       movdqu  ($key),$b
+       movdqu  16($key),$c
+       movdqu  ($counter),$d
+       movdqa  .Lone(%rip),$d1
+       movdqa  .Lrot16(%rip),$rot16
+       movdqa  .Lrot24(%rip),$rot24
+
+       movdqa  $a,$a1
+       movdqa  $a,0x00(%rsp)
+       movdqa  $b,$b1
+       movdqa  $b,0x10(%rsp)
+       movdqa  $c,$c1
+       movdqa  $c,0x20(%rsp)
+       paddd   $d,$d1
+       movdqa  $d,0x30(%rsp)
+       mov     \$10,$counter           # reuse $counter
+       jmp     .Loop_128
+
+.align 32
+.Loop_128:
+___
+       &SSSE3ROUND_2x();               # first round of the pair, rows as loaded
+       &pshufd ($c,$c,0b01001110);     # c: lane i <- lane i+2 (mod 4)
+       &pshufd ($b,$b,0b00111001);     # b: lane i <- lane i+1 (mod 4)
+       &pshufd ($d,$d,0b10010011);     # d: lane i <- lane i+3 (mod 4)
+       &pshufd ($c1,$c1,0b01001110);   # same lane rotations for the second set
+       &pshufd ($b1,$b1,0b00111001);
+       &pshufd ($d1,$d1,0b10010011);
+
+       &SSSE3ROUND_2x();               # second round, on the rotated lanes
+       &pshufd ($c,$c,0b01001110);     # c: rotate by 2 again (self-inverse)
+       &pshufd ($b,$b,0b10010011);     # b: lane i <- lane i+3, undoes the +1 above
+       &pshufd ($d,$d,0b00111001);     # d: lane i <- lane i+1, undoes the +3 above
+       &pshufd ($c1,$c1,0b01001110);   # restore lane order of the second set
+       &pshufd ($b1,$b1,0b10010011);
+       &pshufd ($d1,$d1,0b00111001);
+
+       &dec    ($counter);             # $counter was set to 10: 10 passes x 2 rounds
+       &jnz    (".Loop_128");
+
+$code.=<<___;
+       paddd   0x00(%rsp),$a
+       paddd   0x10(%rsp),$b
+       paddd   0x20(%rsp),$c
+       paddd   0x30(%rsp),$d
+       paddd   .Lone(%rip),$d1
+       paddd   0x00(%rsp),$a1
+       paddd   0x10(%rsp),$b1
+       paddd   0x20(%rsp),$c1
+       paddd   0x30(%rsp),$d1
+
+       movdqu  0x00($inp),$t
+       movdqu  0x10($inp),$t1
+       pxor    $t,$a                   # xor with input
+       movdqu  0x20($inp),$t
+       pxor    $t1,$b
+       movdqu  0x30($inp),$t1
+       pxor    $t,$c
+       movdqu  0x40($inp),$t
+       pxor    $t1,$d
+       movdqu  0x50($inp),$t1
+       pxor    $t,$a1
+       movdqu  0x60($inp),$t
+       pxor    $t1,$b1
+       movdqu  0x70($inp),$t1
+       pxor    $t,$c1
+       pxor    $t1,$d1
+
+       movdqu  $a,0x00($out)           # write output
+       movdqu  $b,0x10($out)
+       movdqu  $c,0x20($out)
+       movdqu  $d,0x30($out)
+       movdqu  $a1,0x40($out)
+       movdqu  $b1,0x50($out)
+       movdqu  $c1,0x60($out)
+       movdqu  $d1,0x70($out)
+___
+$code.=<<___   if ($win64);
+       movaps  -0x68(%r9),%xmm6
+       movaps  -0x58(%r9),%xmm7
+       movaps  -0x48(%r9),%xmm8
+       movaps  -0x38(%r9),%xmm9
+       movaps  -0x28(%r9),%xmm10
+       movaps  -0x18(%r9),%xmm11
+___
+$code.=<<___;
+       lea     (%r9),%rsp
+.cfi_def_cfa_register  %rsp
+.L128_epilogue:
+       ret
+.cfi_endproc
+.size  ChaCha20_128,.-ChaCha20_128
+___
+}
+
 ########################################################################
 # SSSE3 code path that handles longer messages.
 {
@@ -3674,9 +3840,9 @@ se_handler:
        ret
 .size  se_handler,.-se_handler
 
-.type  ssse3_handler,\@abi-omnipotent
+.type  simd_handler,\@abi-omnipotent
 .align 16
-ssse3_handler:
+simd_handler:
        push    %rsi
        push    %rdi
        push    %rbx
@@ -3702,57 +3868,20 @@ ssse3_handler:
        mov     192($context),%rax      # pull context->R9
 
        mov     4(%r11),%r10d           # HandlerData[1]
+       mov     8(%r11),%ecx            # HandlerData[2]
        lea     (%rsi,%r10),%r10        # epilogue label
        cmp     %r10,%rbx               # context->Rip>=epilogue label
        jae     .Lcommon_seh_tail
 
-       lea     -0x28(%rax),%rsi
+       neg     %rcx
+       lea     -8(%rax,%rcx),%rsi
        lea     512($context),%rdi      # &context.Xmm6
-       mov     \$4,%ecx
+       neg     %ecx
+       shr     \$3,%ecx
        .long   0xa548f3fc              # cld; rep movsq
 
        jmp     .Lcommon_seh_tail
-.size  ssse3_handler,.-ssse3_handler
-
-.type  full_handler,\@abi-omnipotent
-.align 16
-full_handler:
-       push    %rsi
-       push    %rdi
-       push    %rbx
-       push    %rbp
-       push    %r12
-       push    %r13
-       push    %r14
-       push    %r15
-       pushfq
-       sub     \$64,%rsp
-
-       mov     120($context),%rax      # pull context->Rax
-       mov     248($context),%rbx      # pull context->Rip
-
-       mov     8($disp),%rsi           # disp->ImageBase
-       mov     56($disp),%r11          # disp->HandlerData
-
-       mov     0(%r11),%r10d           # HandlerData[0]
-       lea     (%rsi,%r10),%r10        # prologue label
-       cmp     %r10,%rbx               # context->Rip<prologue label
-       jb      .Lcommon_seh_tail
-
-       mov     192($context),%rax      # pull context->R9
-
-       mov     4(%r11),%r10d           # HandlerData[1]
-       lea     (%rsi,%r10),%r10        # epilogue label
-       cmp     %r10,%rbx               # context->Rip>=epilogue label
-       jae     .Lcommon_seh_tail
-
-       lea     -0xa8(%rax),%rsi
-       lea     512($context),%rdi      # &context.Xmm6
-       mov     \$20,%ecx
-       .long   0xa548f3fc              # cld; rep movsq
-
-       jmp     .Lcommon_seh_tail
-.size  full_handler,.-full_handler
+.size  simd_handler,.-simd_handler
 
 .section       .pdata
 .align 4
@@ -3764,6 +3893,10 @@ full_handler:
        .rva    .LSEH_end_ChaCha20_ssse3
        .rva    .LSEH_info_ChaCha20_ssse3
 
+       .rva    .LSEH_begin_ChaCha20_128
+       .rva    .LSEH_end_ChaCha20_128
+       .rva    .LSEH_info_ChaCha20_128
+
        .rva    .LSEH_begin_ChaCha20_4x
        .rva    .LSEH_end_ChaCha20_4x
        .rva    .LSEH_info_ChaCha20_4x
@@ -3804,46 +3937,60 @@ $code.=<<___;
 
 .LSEH_info_ChaCha20_ssse3:
        .byte   9,0,0,0
-       .rva    ssse3_handler
+       .rva    simd_handler
        .rva    .Lssse3_body,.Lssse3_epilogue
+       .long   0x20,0
+
+.LSEH_info_ChaCha20_128:
+       .byte   9,0,0,0
+       .rva    simd_handler
+       .rva    .L128_body,.L128_epilogue
+       .long   0x60,0
 
 .LSEH_info_ChaCha20_4x:
        .byte   9,0,0,0
-       .rva    full_handler
+       .rva    simd_handler
        .rva    .L4x_body,.L4x_epilogue
+       .long   0xa0,0
 ___
 $code.=<<___ if ($avx);
 .LSEH_info_ChaCha20_4xop:
        .byte   9,0,0,0
-       .rva    full_handler
+       .rva    simd_handler
        .rva    .L4xop_body,.L4xop_epilogue             # HandlerData[]
+       .long   0xa0,0
 ___
 $code.=<<___ if ($avx>1);
 .LSEH_info_ChaCha20_8x:
        .byte   9,0,0,0
-       .rva    full_handler
+       .rva    simd_handler
        .rva    .L8x_body,.L8x_epilogue                 # HandlerData[]
+       .long   0xa0,0
 ___
 $code.=<<___ if ($avx>2);
 .LSEH_info_ChaCha20_avx512:
        .byte   9,0,0,0
-       .rva    ssse3_handler
+       .rva    simd_handler
        .rva    .Lavx512_body,.Lavx512_epilogue         # HandlerData[]
+       .long   0x20,0
 
 .LSEH_info_ChaCha20_avx512vl:
        .byte   9,0,0,0
-       .rva    ssse3_handler
+       .rva    simd_handler
        .rva    .Lavx512vl_body,.Lavx512vl_epilogue     # HandlerData[]
+       .long   0x20,0
 
 .LSEH_info_ChaCha20_16x:
        .byte   9,0,0,0
-       .rva    full_handler
+       .rva    simd_handler
        .rva    .L16x_body,.L16x_epilogue               # HandlerData[]
+       .long   0xa0,0
 
 .LSEH_info_ChaCha20_8xvl:
        .byte   9,0,0,0
-       .rva    full_handler
+       .rva    simd_handler
        .rva    .L8xvl_body,.L8xvl_epilogue             # HandlerData[]
+       .long   0xa0,0
 ___
 }