poly1305/asm/poly1305-x86_64.pl: minor AVX512 optimization.
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index ff4efb33e6a2fc6a702b9470e14ee92e02cf40e8..a3970198b714b11d4b6b3d4a5aab00c9c56a7dda 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -210,16 +210,23 @@ $code.=<<___;
 .type  poly1305_blocks,\@function,4
 .align 32
 poly1305_blocks:
+.cfi_startproc
 .Lblocks:
        shr     \$4,$len
        jz      .Lno_data               # too short
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_body:
 
        mov     $len,%r15               # reassign $len
@@ -255,15 +262,23 @@ $code.=<<___;
        mov     $h2,16($ctx)
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data:
 .Lblocks_epilogue:
        ret
+.cfi_endproc
 .size  poly1305_blocks,.-poly1305_blocks
 
 .type  poly1305_emit,\@function,3
@@ -484,6 +499,7 @@ __poly1305_init_avx:
 .type  poly1305_blocks_avx,\@function,4
 .align 32
 poly1305_blocks_avx:
+.cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
        jae     .Lblocks_avx
@@ -503,11 +519,17 @@ poly1305_blocks_avx:
        jz      .Leven_avx
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_avx_body:
 
        mov     $len,%r15               # reassign $len
@@ -610,24 +632,39 @@ poly1305_blocks_avx:
 .align 16
 .Ldone_avx:
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data_avx:
 .Lblocks_avx_epilogue:
        ret
+.cfi_endproc
 
 .align 32
 .Lbase2_64_avx:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lbase2_64_avx_body:
 
        mov     $len,%r15               # reassign $len
@@ -687,18 +724,27 @@ poly1305_blocks_avx:
        mov     %r15,$len
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rax
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lbase2_64_avx_epilogue:
        jmp     .Ldo_avx
+.cfi_endproc
 
 .align 32
 .Leven_avx:
+.cfi_startproc
        vmovd           4*0($ctx),$H0           # load hash value
        vmovd           4*1($ctx),$H1
        vmovd           4*2($ctx),$H2
@@ -709,6 +755,7 @@ poly1305_blocks_avx:
 ___
 $code.=<<___   if (!$win64);
        lea             -0x58(%rsp),%r11
+.cfi_def_cfa           %r11,0x60
        sub             \$0x178,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -1301,10 +1348,12 @@ $code.=<<___    if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             0x58(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        vzeroupper
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx,.-poly1305_blocks_avx
 
 .type  poly1305_emit_avx,\@function,3
@@ -1372,6 +1421,7 @@ $code.=<<___;
 .type  poly1305_blocks_avx2,\@function,4
 .align 32
 poly1305_blocks_avx2:
+.cfi_startproc
        mov     20($ctx),%r8d           # is_base2_26
        cmp     \$128,$len
        jae     .Lblocks_avx2
@@ -1391,11 +1441,17 @@ poly1305_blocks_avx2:
        jz      .Leven_avx2
 
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lblocks_avx2_body:
 
        mov     $len,%r15               # reassign $len
@@ -1504,24 +1560,39 @@ poly1305_blocks_avx2:
 .align 16
 .Ldone_avx2:
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lno_data_avx2:
 .Lblocks_avx2_epilogue:
        ret
+.cfi_endproc
 
 .align 32
 .Lbase2_64_avx2:
+.cfi_startproc
        push    %rbx
+.cfi_push      %rbx
        push    %rbp
+.cfi_push      %rbp
        push    %r12
+.cfi_push      %r12
        push    %r13
+.cfi_push      %r13
        push    %r14
+.cfi_push      %r14
        push    %r15
+.cfi_push      %r15
 .Lbase2_64_avx2_body:
 
        mov     $len,%r15               # reassign $len
@@ -1588,18 +1659,27 @@ poly1305_blocks_avx2:
        mov     \$`(1<<31|1<<30|1<<16)`,%r11d
 
        mov     0(%rsp),%r15
+.cfi_restore   %r15
        mov     8(%rsp),%r14
+.cfi_restore   %r14
        mov     16(%rsp),%r13
+.cfi_restore   %r13
        mov     24(%rsp),%r12
+.cfi_restore   %r12
        mov     32(%rsp),%rbp
+.cfi_restore   %rbp
        mov     40(%rsp),%rbx
+.cfi_restore   %rbx
        lea     48(%rsp),%rax
        lea     48(%rsp),%rsp
+.cfi_adjust_cfa_offset -48
 .Lbase2_64_avx2_epilogue:
        jmp     .Ldo_avx2
+.cfi_endproc
 
 .align 32
 .Leven_avx2:
+.cfi_startproc
        mov             OPENSSL_ia32cap_P+8(%rip),%r10d
        mov             \$`(1<<31|1<<30|1<<16)`,%r11d
        vmovd           4*0($ctx),%x#$H0        # load hash value base 2^26
@@ -1620,6 +1700,7 @@ $code.=<<___              if ($avx>2);
 ___
 $code.=<<___   if (!$win64);
        lea             -8(%rsp),%r11
+.cfi_def_cfa           %r11,16
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -2008,10 +2089,12 @@ $code.=<<___    if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             8(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        vzeroupper
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx2,.-poly1305_blocks_avx2
 ___
 #######################################################################
@@ -2031,11 +2114,13 @@ $code.=<<___;
 .type  poly1305_blocks_avx512,\@function,4
 .align 32
 poly1305_blocks_avx512:
+.cfi_startproc
 .Lblocks_avx512:
        vzeroupper
 ___
 $code.=<<___   if (!$win64);
        lea             -8(%rsp),%r11
+.cfi_def_cfa           %r11,16
        sub             \$0x128,%rsp
 ___
 $code.=<<___   if ($win64);
@@ -2044,13 +2129,13 @@ $code.=<<___    if ($win64);
        vmovdqa         %xmm6,0x50(%r11)
        vmovdqa         %xmm7,0x60(%r11)
        vmovdqa         %xmm8,0x70(%r11)
-       vmovdqa         %xmm9,0x80(%r11)
-       vmovdqa         %xmm10,0x90(%r11)
-       vmovdqa         %xmm11,0xa0(%r11)
-       vmovdqa         %xmm12,0xb0(%r11)
-       vmovdqa         %xmm13,0xc0(%r11)
-       vmovdqa         %xmm14,0xd0(%r11)
-       vmovdqa         %xmm15,0xe0(%r11)
+       vmovdqa32       %xmm9,0x80(%r11)
+       vmovdqa32       %xmm10,0x90(%r11)
+       vmovdqa32       %xmm11,0xa0(%r11)
+       vmovdqa32       %xmm12,0xb0(%r11)
+       vmovdqa32       %xmm13,0xc0(%r11)
+       vmovdqa32       %xmm14,0xd0(%r11)
+       vmovdqa32       %xmm15,0xe0(%r11)
 .Ldo_avx512_body:
 ___
 $code.=<<___;
@@ -2213,36 +2298,21 @@ $code.=<<___;
        # we could just flow along, hence the goal for $R0-$S4 is
        # 1858286838784888 ...
 
-       mov             \$0b0110011001100110,%eax
-       mov             \$0b1100110011001100,%r8d
-       mov             \$0b0101010101010101,%r9d
+       vmovdqa32       128(%rcx),$M0           # .Lpermd_avx512:
+       mov             \$0x7777,%eax
        kmovw           %eax,%k1
-       kmovw           %r8d,%k2
-       kmovw           %r9d,%k3
-
-       vpbroadcastq    %x#$D0,$M0      # 0808080808080808
-       vpbroadcastq    %x#$D1,$M1
-       vpbroadcastq    %x#$D2,$M2
-       vpbroadcastq    %x#$D3,$M3
-       vpbroadcastq    %x#$D4,$M4
-
-       vpexpandd       $D0,${D0}{%k1}  # 05060708 -> -05--06--07--08-
-       vpexpandd       $D1,${D1}{%k1}
-       vpexpandd       $D2,${D2}{%k1}
-       vpexpandd       $D3,${D3}{%k1}
-       vpexpandd       $D4,${D4}{%k1}
-
-       vpexpandd       $R0,${D0}{%k2}  # -05--06--07--08- -> 145-246-347-448-
-       vpexpandd       $R1,${D1}{%k2}
-       vpexpandd       $R2,${D2}{%k2}
-       vpexpandd       $R3,${D3}{%k2}
-       vpexpandd       $R4,${D4}{%k2}
-
-       vpblendmd       $M0,$D0,${R0}{%k3}      # 1858286838784888
-       vpblendmd       $M1,$D1,${R1}{%k3}
-       vpblendmd       $M2,$D2,${R2}{%k3}
-       vpblendmd       $M3,$D3,${R3}{%k3}
-       vpblendmd       $M4,$D4,${R4}{%k3}
+
+       vpermd          $R0,$M0,$R0             # 14243444 -> 1---2---3---4---
+       vpermd          $R1,$M0,$R1
+       vpermd          $R2,$M0,$R2
+       vpermd          $R3,$M0,$R3
+       vpermd          $R4,$M0,$R4
+
+       vpermd          $D0,$M0,${R0}{%k1}      # 05060708 -> 1858286838784888
+       vpermd          $D1,$M0,${R1}{%k1}
+       vpermd          $D2,$M0,${R2}{%k1}
+       vpermd          $D3,$M0,${R3}{%k1}
+       vpermd          $D4,$M0,${R4}{%k1}
 
        vpslld          \$2,$R1,$S1             # *5
        vpslld          \$2,$R2,$S2
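
The hunk above trades the three k-mask expand/blend passes for two vpermd with a single index vector (.Lpermd_avx512, added at the end of the file) and one 0x7777 merge mask: the r^1..r^4 powers already held in $R0-$R4 are spread to every fourth dword lane, and the r^5..r^8 limbs held in $D0-$D4 fill the remaining lanes. A minimal standalone C sketch of that lane shuffle using AVX-512F intrinsics (not the perlasm itself; the digits 1..8 stand in for limbs of r^1..r^8, with the highest dword lane written leftmost as in the comments):

    #include <immintrin.h>
    #include <stdio.h>

    int main(void)
    {
        /* .Lpermd_avx512: dword indices 0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7 */
        const __m512i idx = _mm512_setr_epi32(0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7);

        /* R: "14243444", i.e. lanes 7..0 hold r^1,r^4,r^2,r^4,r^3,r^4,r^4,r^4 */
        const __m512i R = _mm512_setr_epi32(4,4,4,3, 4,2,4,1, 0,0,0,0, 0,0,0,0);
        /* D: "05060708", r^5..r^8 in the low dword of each 64-bit lane */
        const __m512i D = _mm512_setr_epi32(8,0,7,0, 6,0,5,0, 0,0,0,0, 0,0,0,0);

        /* vpermd $R0,$M0,$R0: spread r^1..r^4 to lanes 15,11,7,3 */
        __m512i r = _mm512_permutexvar_epi32(idx, R);
        /* vpermd $D0,$M0,${R0}{%k1}: merge r^5..r^8 into the lanes selected
         * by mask 0x7777, keeping r^1..r^4 in the unselected lanes */
        r = _mm512_mask_permutexvar_epi32(r, (__mmask16)0x7777, idx, D);

        int out[16];
        _mm512_storeu_si512(out, r);
        for (int i = 15; i >= 0; i--)   /* print the highest lane first */
            printf("%d", out[i]);
        printf("\n");                   /* expect 1858286838784888 */
        return 0;
    }

Built with -mavx512f this should print 1858286838784888, the target layout named in the comment above.
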
@@ -2264,15 +2334,14 @@ $code.=<<___;
        vpsrlq          \$40,$T4,$T4            # 4
        vpandq          $MASK,$T2,$T2           # 2
        vpandq          $MASK,$T0,$T0           # 0
-       vpandq          $MASK,$T1,$T1           # 1
-       vpandq          $MASK,$T3,$T3           # 3
+       #vpandq         $MASK,$T1,$T1           # 1
+       #vpandq         $MASK,$T3,$T3           # 3
        #vporq          $PADBIT,$T4,$T4         # padbit, yes, always
 
        vpaddq          $H2,$T2,$H2             # accumulate input
-       mov             \$0x0f,%eax
        sub             \$192,$len
        jbe             .Ltail_avx512
-       jmp             .Loop_avx512
+       #jmp            .Loop_avx512
 
 .align 32
 .Loop_avx512:
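
The limbs masked here come from the radix-2^26 splitting of each 16-byte input block; the change simply defers the vpandq of limbs 1 and 3 (and the vporq of the pad bit) into the multiplication sequence below, where they can overlap with the vpmuludq chain. For reference, a scalar C sketch of that splitting for a single block given as two little-endian 64-bit words m0/m1 (hypothetical helper, not an OpenSSL function):

    #include <stdint.h>

    static void poly1305_split_2_26(uint64_t m0, uint64_t m1, int padbit,
                                    uint64_t t[5])
    {
        const uint64_t MASK = 0x3ffffff;              /* 2^26 - 1 */

        t[0] = m0 & MASK;                             /* bits   0..25  */
        t[1] = (m0 >> 26) & MASK;                     /* bits  26..51  */
        t[2] = ((m0 >> 52) | (m1 << 12)) & MASK;      /* bits  52..77  */
        t[3] = (m1 >> 14) & MASK;                     /* bits  78..103 */
        t[4] = (m1 >> 40) | ((uint64_t)padbit << 24); /* bits 104..127 + 2^128 */
    }

The vector code performs the same split lane-wise across several blocks at once, keeping limb i of each block in a 64-bit lane of $T0..$T4.
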
@@ -2307,7 +2376,9 @@ $code.=<<___;
        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
         vpaddq         $H0,$T0,$H0
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
+        vpandq         $MASK,$T1,$T1           # 1
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
+        vpandq         $MASK,$T3,$T3           # 3
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
@@ -2415,8 +2486,8 @@ $code.=<<___;
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
         vpandq         $MASK,$T0,$T0           # 0
-        vpandq         $MASK,$T1,$T1           # 1
-        vpandq         $MASK,$T3,$T3           # 3
+        #vpandq        $MASK,$T1,$T1           # 1
+        #vpandq        $MASK,$T3,$T3           # 3
         #vporq         $PADBIT,$T4,$T4         # padbit, yes, always
 
        sub             \$128,$len
@@ -2448,7 +2519,9 @@ $code.=<<___;
        vpmuludq        $H2,$R1,$D3             # d3 = h2*r1
        vpmuludq        $H2,$R2,$D4             # d4 = h2*r2
        vpmuludq        $H2,$S3,$D0             # d0 = h2*s3
+        vpandq         $MASK,$T1,$T1           # 1
        vpmuludq        $H2,$S4,$D1             # d1 = h2*s4
+        vpandq         $MASK,$T3,$T3           # 3
        vpmuludq        $H2,$R0,$D2             # d2 = h2*r0
         vporq          $PADBIT,$T4,$T4         # padbit, yes, always
         vpaddq         $H1,$T1,$H1             # accumulate input
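
The d0..d4 comments in these hunks are the schoolbook product of two radix-2^26 values reduced mod 2^130-5: any partial product that would land at or above 2^130 wraps around with a factor of 5, which is why the code keeps premultiplied $S1..$S4 = 5*$R1..5*$R4 alongside the plain powers (the '# *5' lines above). A scalar C sketch of one such limb multiplication, written as a hypothetical helper rather than OpenSSL code:

    #include <stdint.h>

    /* h = h * r mod 2^130 - 5, both in five 26-bit limbs */
    static void poly1305_mul_2_26(uint32_t h[5], const uint32_t r[5])
    {
        const uint64_t s1 = r[1] * 5, s2 = r[2] * 5,
                       s3 = r[3] * 5, s4 = r[4] * 5;
        uint64_t d0, d1, d2, d3, d4;

        /* terms that cross 2^130 use s_i = 5*r_i instead of r_i */
        d0 = (uint64_t)h[0]*r[0] + (uint64_t)h[1]*s4   + (uint64_t)h[2]*s3 +
             (uint64_t)h[3]*s2   + (uint64_t)h[4]*s1;
        d1 = (uint64_t)h[0]*r[1] + (uint64_t)h[1]*r[0] + (uint64_t)h[2]*s4 +
             (uint64_t)h[3]*s3   + (uint64_t)h[4]*s2;
        d2 = (uint64_t)h[0]*r[2] + (uint64_t)h[1]*r[1] + (uint64_t)h[2]*r[0] +
             (uint64_t)h[3]*s4   + (uint64_t)h[4]*s3;
        d3 = (uint64_t)h[0]*r[3] + (uint64_t)h[1]*r[2] + (uint64_t)h[2]*r[1] +
             (uint64_t)h[3]*r[0] + (uint64_t)h[4]*s4;
        d4 = (uint64_t)h[0]*r[4] + (uint64_t)h[1]*r[3] + (uint64_t)h[2]*r[2] +
             (uint64_t)h[3]*r[1] + (uint64_t)h[4]*r[0];

        /* lazy carry propagation; the top carry folds back in times 5 */
        d1 += d0 >> 26; d0 &= 0x3ffffff;
        d2 += d1 >> 26; d1 &= 0x3ffffff;
        d3 += d2 >> 26; d2 &= 0x3ffffff;
        d4 += d3 >> 26; d3 &= 0x3ffffff;
        d0 += (d4 >> 26) * 5; d4 &= 0x3ffffff;
        d1 += d0 >> 26; d0 &= 0x3ffffff;

        h[0] = (uint32_t)d0; h[1] = (uint32_t)d1; h[2] = (uint32_t)d2;
        h[3] = (uint32_t)d3; h[4] = (uint32_t)d4;
    }

Deferring the remaining vpandq of $T1/$T3 into this multiply chain is safe because those limbs are not needed until they are accumulated into $H1/$H3 further down the sequence.
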
@@ -2622,9 +2695,11 @@ $code.=<<___     if ($win64);
 ___
 $code.=<<___   if (!$win64);
        lea             8(%r11),%rsp
+.cfi_def_cfa           %rsp,8
 ___
 $code.=<<___;
        ret
+.cfi_endproc
 .size  poly1305_blocks_avx512,.-poly1305_blocks_avx512
 ___
 if ($avx>3) {
@@ -2832,6 +2907,8 @@ $code.=<<___;
 .long  0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
 .Lpermd_avx2:
 .long  2,2,2,3,2,0,2,1
+.Lpermd_avx512:
+.long  0,0,0,1, 0,2,0,3, 0,4,0,5, 0,6,0,7
 
 .L2_44_inp_permd:
 .long  0,1,1,2,2,3,7,7