Update copyright year

[openssl.git] / crypto / modes / asm / ghash-x86_64.pl
diff --git a/crypto/modes/asm/ghash-x86_64.pl b/crypto/modes/asm/ghash-x86_64.pl

index b4a8ddbd2e80955915708ae369c892c0ccf21929..30158aa076da9b1abb58284e950fb974b6588b86 100644 (file)
--- a/crypto/modes/asm/ghash-x86_64.pl
+++ b/crypto/modes/asm/ghash-x86_64.pl
@@ -1,5 +1,5 @@
  #! /usr/bin/env perl
-# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+# Copyright 2010-2019 The OpenSSL Project Authors. All Rights Reserved.
  #
  # Licensed under the OpenSSL license (the "License").  You may not use
  # this file except in compliance with the License.  You can obtain a copy
@@ -44,9 +44,8 @@
  # See ghash-x86.pl for background information and details about coding
  # techniques.
  #
-# Special thanks to David Woodhouse <dwmw2@infradead.org> for
-# providing access to a Westmere-based system on behalf of Intel
-# Open Source Technology Centre.
+# Special thanks to David Woodhouse for providing access to a
+# Westmere-based system on behalf of Intel Open Source Technology Centre.
  
  # December 2012
  #
@@ -74,6 +73,8 @@
  # Skylake      0.44(+110%)(if system doesn't support AVX)
  # Bulldozer    1.49(+27%)
  # Silvermont   2.88(+13%)
+# Knights L    2.12(-)    (if system doesn't support AVX)
+# Goldmont     1.08(+24%)
  
  # March 2013
  #
@@ -85,6 +86,8 @@
  # it performs in 0.41 cycles per byte on Haswell processor, in
  # 0.29 on Broadwell, and in 0.36 on Skylake.
  #
+# Knights Landing achieves 1.09 cpb.
+#
  # [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
  
  $flavour = shift;
@@ -235,9 +238,21 @@ $code=<<___;
  .type  gcm_gmult_4bit,\@function,2
  .align 16
  gcm_gmult_4bit:
+.cfi_startproc
         push    %rbx
-       push    %rbp            # %rbp and %r12 are pushed exclusively in
+.cfi_push      %rbx
+       push    %rbp            # %rbp and others are pushed exclusively in
+.cfi_push      %rbp
         push    %r12            # order to reuse Win64 exception handler...
+.cfi_push      %r12
+       push    %r13
+.cfi_push      %r13
+       push    %r14
+.cfi_push      %r14
+       push    %r15
+.cfi_push      %r15
+       sub     \$280,%rsp
+.cfi_adjust_cfa_offset 280
  .Lgmult_prologue:
  
         movzb   15($Xi),$Zlo
@@ -248,10 +263,15 @@ $code.=<<___;
         mov     $Zlo,8($Xi)
         mov     $Zhi,($Xi)
  
-       mov     16(%rsp),%rbx
-       lea     24(%rsp),%rsp
+       lea     280+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     (%rsi),%rsp
+.cfi_def_cfa_register  %rsp
  .Lgmult_epilogue:
         ret
+.cfi_endproc
  .size  gcm_gmult_4bit,.-gcm_gmult_4bit
  ___
  \f
@@ -265,13 +285,21 @@ $code.=<<___;
  .type  gcm_ghash_4bit,\@function,4
  .align 16
  gcm_ghash_4bit:
+.cfi_startproc
         push    %rbx
+.cfi_push      %rbx
         push    %rbp
+.cfi_push      %rbp
         push    %r12
+.cfi_push      %r12
         push    %r13
+.cfi_push      %r13
         push    %r14
+.cfi_push      %r14
         push    %r15
+.cfi_push      %r15
         sub     \$280,%rsp
+.cfi_adjust_cfa_offset 280
  .Lghash_prologue:
         mov     $inp,%r14               # reassign couple of args
         mov     $len,%r15
@@ -399,16 +427,25 @@ $code.=<<___;
         mov     $Zlo,8($Xi)
         mov     $Zhi,($Xi)
  
-       lea     280(%rsp),%rsi
-       mov     0(%rsi),%r15
-       mov     8(%rsi),%r14
-       mov     16(%rsi),%r13
-       mov     24(%rsi),%r12
-       mov     32(%rsi),%rbp
-       mov     40(%rsi),%rbx
-       lea     48(%rsi),%rsp
+       lea     280+48(%rsp),%rsi
+.cfi_def_cfa   %rsi,8
+       mov     -48(%rsi),%r15
+.cfi_restore   %r15
+       mov     -40(%rsi),%r14
+.cfi_restore   %r14
+       mov     -32(%rsi),%r13
+.cfi_restore   %r13
+       mov     -24(%rsi),%r12
+.cfi_restore   %r12
+       mov     -16(%rsi),%rbp
+.cfi_restore   %rbp
+       mov     -8(%rsi),%rbx
+.cfi_restore   %rbx
+       lea     0(%rsi),%rsp
+.cfi_def_cfa_register  %rsp
  .Lghash_epilogue:
         ret
+.cfi_endproc
  .size  gcm_ghash_4bit,.-gcm_ghash_4bit
  ___
  \f
@@ -468,7 +505,7 @@ $code.=<<___;
         psllq           \$57,$Xi                #
         movdqa          $Xi,$T1                 #
         pslldq          \$8,$Xi
-       psrldq          \$8,$T1                 #       
+       psrldq          \$8,$T1                 #
         pxor            $T2,$Xi
         pxor            $T1,$Xhi                #
  
@@ -492,6 +529,7 @@ $code.=<<___;
  .type  gcm_init_clmul,\@abi-omnipotent
  .align 16
  gcm_init_clmul:
+.cfi_startproc
  .L_init_clmul:
  ___
  $code.=<<___ if ($win64);
@@ -561,6 +599,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  gcm_init_clmul,.-gcm_init_clmul
  ___
  }
@@ -572,6 +611,7 @@ $code.=<<___;
  .type  gcm_gmult_clmul,\@abi-omnipotent
  .align 16
  gcm_gmult_clmul:
+.cfi_startproc
  .L_gmult_clmul:
         movdqu          ($Xip),$Xi
         movdqa          .Lbswap_mask(%rip),$T3
@@ -582,7 +622,7 @@ ___
         &clmul64x64_T2  ($Xhi,$Xi,$Hkey,$T2);
  $code.=<<___ if (0 || (&reduction_alg9($Xhi,$Xi)&&0));
         # experimental alternative. special thing about is that there
-       # no dependency between the two multiplications... 
+       # no dependency between the two multiplications...
         mov             \$`0xE1<<1`,%eax
         mov             \$0xA040608020C0E000,%r10       # ((7..0)·0xE0)&0xff
         mov             \$0x07,%r11d
@@ -608,6 +648,7 @@ $code.=<<___;
         pshufb          $T3,$Xi
         movdqu          $Xi,($Xip)
         ret
+.cfi_endproc
  .size  gcm_gmult_clmul,.-gcm_gmult_clmul
  ___
  }
@@ -621,6 +662,7 @@ $code.=<<___;
  .type  gcm_ghash_clmul,\@abi-omnipotent
  .align 32
  gcm_ghash_clmul:
+.cfi_startproc
  .L_ghash_clmul:
  ___
  $code.=<<___ if ($win64);
@@ -757,7 +799,7 @@ $code.=<<___;
         movdqa          $T2,$T1                 #
         pslldq          \$8,$T2
          pclmulqdq      \$0x00,$Hkey2,$Xln
-       psrldq          \$8,$T1                 #       
+       psrldq          \$8,$T1                 #
         pxor            $T2,$Xi
         pxor            $T1,$Xhi                #
         movdqu          0($inp),$T1
@@ -893,7 +935,7 @@ $code.=<<___;
           psllq         \$57,$Xi                #
           movdqa        $Xi,$T1                 #
           pslldq        \$8,$Xi
-         psrldq        \$8,$T1                 #       
+         psrldq        \$8,$T1                 #
           pxor          $T2,$Xi
         pshufd          \$0b01001110,$Xhn,$Xmn
           pxor          $T1,$Xhi                #
@@ -968,6 +1010,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  gcm_ghash_clmul,.-gcm_ghash_clmul
  ___
  }
@@ -977,6 +1020,7 @@ $code.=<<___;
  .type  gcm_init_avx,\@abi-omnipotent
  .align 32
  gcm_init_avx:
+.cfi_startproc
  ___
  if ($avx) {
  my ($Htbl,$Xip)=@_4args;
@@ -1105,6 +1149,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  gcm_init_avx,.-gcm_init_avx
  ___
  } else {
@@ -1119,7 +1164,9 @@ $code.=<<___;
  .type  gcm_gmult_avx,\@abi-omnipotent
  .align 32
  gcm_gmult_avx:
+.cfi_startproc
         jmp     .L_gmult_clmul
+.cfi_endproc
  .size  gcm_gmult_avx,.-gcm_gmult_avx
  ___
  \f
@@ -1128,6 +1175,7 @@ $code.=<<___;
  .type  gcm_ghash_avx,\@abi-omnipotent
  .align 32
  gcm_ghash_avx:
+.cfi_startproc
  ___
  if ($avx) {
  my ($Xip,$Htbl,$inp,$len)=@_4args;
@@ -1540,6 +1588,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  gcm_ghash_avx,.-gcm_ghash_avx
  ___
  } else {
@@ -1647,14 +1696,20 @@ se_handler:
         cmp     %r10,%rbx               # context->Rip>=epilogue label
         jae     .Lin_prologue
  
-       lea     24(%rax),%rax           # adjust "rsp"
+       lea     48+280(%rax),%rax       # adjust "rsp"
  
         mov     -8(%rax),%rbx
         mov     -16(%rax),%rbp
         mov     -24(%rax),%r12
+       mov     -32(%rax),%r13
+       mov     -40(%rax),%r14
+       mov     -48(%rax),%r15
         mov     %rbx,144($context)      # restore context->Rbx
         mov     %rbp,160($context)      # restore context->Rbp
         mov     %r12,216($context)      # restore context->R12
+       mov     %r13,224($context)      # restore context->R13
+       mov     %r14,232($context)      # restore context->R14
+       mov     %r15,240($context)      # restore context->R15
  
  .Lin_prologue:
         mov     8(%rax),%rdi