Fix some CFI issues in x86_64 assembly

[openssl.git] / crypto / aes / asm / vpaes-x86_64.pl
diff --git a/crypto/aes/asm/vpaes-x86_64.pl b/crypto/aes/asm/vpaes-x86_64.pl

index bca97fdcdf75d0512f082b752ea51b30c9c6eae1..ec2a8da819f6a66bb7c21ebf3a9a7a40fae83cc6 100644 (file)
--- a/crypto/aes/asm/vpaes-x86_64.pl
+++ b/crypto/aes/asm/vpaes-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  
  ######################################################################
  ## Constant-time SSSE3 AES core implementation.
@@ -29,7 +36,9 @@
  #
  # Core 2(**)   29.6/41.1/14.3          21.9/25.2(***)
  # Nehalem      29.6/40.3/14.6          10.0/11.8
-# Atom         57.3/74.2/32.1          60.9/82.3(***)
+# Atom         57.3/74.2/32.1          60.9/77.2(***)
+# Silvermont   52.7/64.0/19.5          48.8/60.8(***)
+# Goldmont     38.9/49.0/17.8          10.6/12.6
  #
  # (*)  "Hyper-threading" in the context refers rather to cache shared
  #      among multiple cores, than to specifically Intel HTT. As vast
@@ -56,7 +65,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";
  
-open STDOUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
  
  $PREFIX="vpaes";
  
@@ -81,6 +91,7 @@ $code.=<<___;
  .type  _vpaes_encrypt_core,\@abi-omnipotent
  .align 16
  _vpaes_encrypt_core:
+.cfi_startproc
         mov     %rdx,   %r9
         mov     \$16,   %r11
         mov     240(%rdx),%eax
@@ -161,8 +172,9 @@ _vpaes_encrypt_core:
         pxor    %xmm4,  %xmm0   # 0 = A
         pshufb  %xmm1,  %xmm0
         ret
+.cfi_endproc
  .size  _vpaes_encrypt_core,.-_vpaes_encrypt_core
-       
+
  ##
  ##  Decryption core
  ##
@@ -171,6 +183,7 @@ _vpaes_encrypt_core:
  .type  _vpaes_decrypt_core,\@abi-omnipotent
  .align 16
  _vpaes_decrypt_core:
+.cfi_startproc
         mov     %rdx,   %r9             # load key
         mov     240(%rdx),%eax
         movdqa  %xmm9,  %xmm1
@@ -203,35 +216,35 @@ _vpaes_decrypt_core:
         movdqa  -0x10(%r10),%xmm1       # 0 : sb9t
         pshufb  %xmm2,  %xmm4           # 4 = sb9u
         pshufb  %xmm3,  %xmm1           # 0 = sb9t
-       pxor    %xmm0,  %xmm4
-       add     \$16, %r9               # next round key
-       pxor    %xmm4,  %xmm1           # 0 = ch
-
+       pxor    %xmm4,  %xmm0
         movdqa  0x00(%r10),%xmm4        # 4 : sbdu
-       pshufb  %xmm5,  %xmm1           # MC ch
-       pshufb  %xmm2,  %xmm4           # 4 = sbdu
-       movdqa  0x10(%r10),%xmm0        # 0 : sbdt
-       pxor    %xmm1,  %xmm4           # 4 = ch
-       pshufb  %xmm3,  %xmm0           # 0 = sbdt
-       sub     \$1,%rax                # nr--
-       pxor    %xmm4,  %xmm0           # 0 = ch
+       pxor    %xmm1,  %xmm0           # 0 = ch
+       movdqa  0x10(%r10),%xmm1        # 0 : sbdt
  
-       movdqa  0x20(%r10),%xmm4        # 4 : sbbu
+       pshufb  %xmm2,  %xmm4           # 4 = sbdu
         pshufb  %xmm5,  %xmm0           # MC ch
+       pshufb  %xmm3,  %xmm1           # 0 = sbdt
+       pxor    %xmm4,  %xmm0           # 4 = ch
+       movdqa  0x20(%r10),%xmm4        # 4 : sbbu
+       pxor    %xmm1,  %xmm0           # 0 = ch
         movdqa  0x30(%r10),%xmm1        # 0 : sbbt
+
         pshufb  %xmm2,  %xmm4           # 4 = sbbu
+       pshufb  %xmm5,  %xmm0           # MC ch
         pshufb  %xmm3,  %xmm1           # 0 = sbbt
-       pxor    %xmm0,  %xmm4           # 4 = ch
-       pxor    %xmm4,  %xmm1           # 0 = ch
-
+       pxor    %xmm4,  %xmm0           # 4 = ch
         movdqa  0x40(%r10),%xmm4        # 4 : sbeu
-       pshufb  %xmm5,  %xmm1           # MC ch
-       movdqa  0x50(%r10),%xmm0        # 0 : sbet
+       pxor    %xmm1,  %xmm0           # 0 = ch
+       movdqa  0x50(%r10),%xmm1        # 0 : sbet
+
         pshufb  %xmm2,  %xmm4           # 4 = sbeu
-       pshufb  %xmm3,  %xmm0           # 0 = sbet
+       pshufb  %xmm5,  %xmm0           # MC ch
+       pshufb  %xmm3,  %xmm1           # 0 = sbet
+       pxor    %xmm4,  %xmm0           # 4 = ch
+       add     \$16, %r9               # next round key
         palignr \$12,   %xmm5,  %xmm5
-       pxor    %xmm1,  %xmm4           # 4 = ch
-       pxor    %xmm4,  %xmm0           # 0 = ch
+       pxor    %xmm1,  %xmm0           # 0 = ch
+       sub     \$1,%rax                # nr--
  
  .Ldec_entry:
         # top of round
@@ -267,6 +280,7 @@ _vpaes_decrypt_core:
         pxor    %xmm4,  %xmm0   # 0 = A
         pshufb  %xmm2,  %xmm0
         ret
+.cfi_endproc
  .size  _vpaes_decrypt_core,.-_vpaes_decrypt_core
  
  ########################################################
@@ -277,6 +291,7 @@ _vpaes_decrypt_core:
  .type  _vpaes_schedule_core,\@abi-omnipotent
  .align 16
  _vpaes_schedule_core:
+.cfi_startproc
         # rdi = key
         # rsi = size in bits
         # rdx = buffer
@@ -323,7 +338,7 @@ _vpaes_schedule_core:
  ##
  .Lschedule_128:
         mov     \$10, %esi
-       
+
  .Loop_schedule_128:
         call    _vpaes_schedule_round
         dec     %rsi
@@ -357,7 +372,7 @@ _vpaes_schedule_core:
  
  .Loop_schedule_192:
         call    _vpaes_schedule_round
-       palignr \$8,%xmm6,%xmm0 
+       palignr \$8,%xmm6,%xmm0
         call    _vpaes_schedule_mangle  # save key n
         call    _vpaes_schedule_192_smear
         call    _vpaes_schedule_mangle  # save key n+1
@@ -383,7 +398,7 @@ _vpaes_schedule_core:
         movdqu  16(%rdi),%xmm0          # load key part 2 (unaligned)
         call    _vpaes_schedule_transform       # input transform
         mov     \$7, %esi
-       
+
  .Loop_schedule_256:
         call    _vpaes_schedule_mangle  # output low result
         movdqa  %xmm0,  %xmm6           # save cur_lo in xmm6
@@ -392,7 +407,7 @@ _vpaes_schedule_core:
         call    _vpaes_schedule_round
         dec     %rsi
         jz      .Lschedule_mangle_last
-       call    _vpaes_schedule_mangle  
+       call    _vpaes_schedule_mangle
  
         # low round. swap xmm7 and xmm6
         pshufd  \$0xFF, %xmm0,  %xmm0
@@ -400,10 +415,10 @@ _vpaes_schedule_core:
         movdqa  %xmm6,  %xmm7
         call    _vpaes_schedule_low_round
         movdqa  %xmm5,  %xmm7
-       
+
         jmp     .Loop_schedule_256
  
-       
+
  ##
  ##  .aes_schedule_mangle_last
  ##
@@ -443,6 +458,7 @@ _vpaes_schedule_core:
         pxor    %xmm6,  %xmm6
         pxor    %xmm7,  %xmm7
         ret
+.cfi_endproc
  .size  _vpaes_schedule_core,.-_vpaes_schedule_core
  
  ##
@@ -462,6 +478,7 @@ _vpaes_schedule_core:
  .type  _vpaes_schedule_192_smear,\@abi-omnipotent
  .align 16
  _vpaes_schedule_192_smear:
+.cfi_startproc
         pshufd  \$0x80, %xmm6,  %xmm1   # d c 0 0 -> c 0 0 0
         pshufd  \$0xFE, %xmm7,  %xmm0   # b a _ _ -> b b b a
         pxor    %xmm1,  %xmm6           # -> c+d c 0 0
@@ -470,6 +487,7 @@ _vpaes_schedule_192_smear:
         movdqa  %xmm6,  %xmm0
         movhlps %xmm1,  %xmm6           # clobber low side with zeros
         ret
+.cfi_endproc
  .size  _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear
  
  ##
@@ -493,6 +511,7 @@ _vpaes_schedule_192_smear:
  .type  _vpaes_schedule_round,\@abi-omnipotent
  .align 16
  _vpaes_schedule_round:
+.cfi_startproc
         # extract rcon from xmm8
         pxor    %xmm1,  %xmm1
         palignr \$15,   %xmm8,  %xmm1
@@ -502,9 +521,9 @@ _vpaes_schedule_round:
         # rotate
         pshufd  \$0xFF, %xmm0,  %xmm0
         palignr \$1,    %xmm0,  %xmm0
-       
+
         # fall through...
-       
+
         # low round: same as high round, but no rotation and no rcon.
  _vpaes_schedule_low_round:
         # smear xmm7
@@ -543,9 +562,10 @@ _vpaes_schedule_low_round:
         pxor    %xmm4,  %xmm0           # 0 = sbox output
  
         # add in smeared stuff
-       pxor    %xmm7,  %xmm0   
+       pxor    %xmm7,  %xmm0
         movdqa  %xmm0,  %xmm7
         ret
+.cfi_endproc
  .size  _vpaes_schedule_round,.-_vpaes_schedule_round
  
  ##
@@ -560,6 +580,7 @@ _vpaes_schedule_low_round:
  .type  _vpaes_schedule_transform,\@abi-omnipotent
  .align 16
  _vpaes_schedule_transform:
+.cfi_startproc
         movdqa  %xmm9,  %xmm1
         pandn   %xmm0,  %xmm1
         psrld   \$4,    %xmm1
@@ -570,6 +591,7 @@ _vpaes_schedule_transform:
         pshufb  %xmm1,  %xmm0
         pxor    %xmm2,  %xmm0
         ret
+.cfi_endproc
  .size  _vpaes_schedule_transform,.-_vpaes_schedule_transform
  
  ##
@@ -598,6 +620,7 @@ _vpaes_schedule_transform:
  .type  _vpaes_schedule_mangle,\@abi-omnipotent
  .align 16
  _vpaes_schedule_mangle:
+.cfi_startproc
         movdqa  %xmm0,  %xmm4   # save xmm0 for later
         movdqa  .Lk_mc_forward(%rip),%xmm5
         test    %rcx,   %rcx
@@ -662,6 +685,7 @@ _vpaes_schedule_mangle:
         and     \$0x30, %r8
         movdqu  %xmm3,  (%rdx)
         ret
+.cfi_endproc
  .size  _vpaes_schedule_mangle,.-_vpaes_schedule_mangle
  
  #
@@ -671,6 +695,7 @@ _vpaes_schedule_mangle:
  .type  ${PREFIX}_set_encrypt_key,\@function,3
  .align 16
  ${PREFIX}_set_encrypt_key:
+.cfi_startproc
  ___
  $code.=<<___ if ($win64);
         lea     -0xb8(%rsp),%rsp
@@ -713,12 +738,14 @@ ___
  $code.=<<___;
         xor     %eax,%eax
         ret
+.cfi_endproc
  .size  ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
  
  .globl ${PREFIX}_set_decrypt_key
  .type  ${PREFIX}_set_decrypt_key,\@function,3
  .align 16
  ${PREFIX}_set_decrypt_key:
+.cfi_startproc
  ___
  $code.=<<___ if ($win64);
         lea     -0xb8(%rsp),%rsp
@@ -766,12 +793,14 @@ ___
  $code.=<<___;
         xor     %eax,%eax
         ret
+.cfi_endproc
  .size  ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
  
  .globl ${PREFIX}_encrypt
  .type  ${PREFIX}_encrypt,\@function,3
  .align 16
  ${PREFIX}_encrypt:
+.cfi_startproc
  ___
  $code.=<<___ if ($win64);
         lea     -0xb8(%rsp),%rsp
@@ -809,12 +838,14 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  ${PREFIX}_encrypt,.-${PREFIX}_encrypt
  
  .globl ${PREFIX}_decrypt
  .type  ${PREFIX}_decrypt,\@function,3
  .align 16
  ${PREFIX}_decrypt:
+.cfi_startproc
  ___
  $code.=<<___ if ($win64);
         lea     -0xb8(%rsp),%rsp
@@ -852,6 +883,7 @@ $code.=<<___ if ($win64);
  ___
  $code.=<<___;
         ret
+.cfi_endproc
  .size  ${PREFIX}_decrypt,.-${PREFIX}_decrypt
  ___
  {
@@ -864,6 +896,7 @@ $code.=<<___;
  .type  ${PREFIX}_cbc_encrypt,\@function,6
  .align 16
  ${PREFIX}_cbc_encrypt:
+.cfi_startproc
         xchg    $key,$len
  ___
  ($len,$key)=($key,$len);
@@ -934,6 +967,7 @@ ___
  $code.=<<___;
  .Lcbc_abort:
         ret
+.cfi_endproc
  .size  ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
  ___
  }
@@ -947,6 +981,7 @@ $code.=<<___;
  .type  _vpaes_preheat,\@abi-omnipotent
  .align 16
  _vpaes_preheat:
+.cfi_startproc
         lea     .Lk_s0F(%rip), %r10
         movdqa  -0x20(%r10), %xmm10     # .Lk_inv
         movdqa  -0x10(%r10), %xmm11     # .Lk_inv+16
@@ -956,6 +991,7 @@ _vpaes_preheat:
         movdqa  0x50(%r10), %xmm15      # .Lk_sb2
         movdqa  0x60(%r10), %xmm14      # .Lk_sb2+16
         ret
+.cfi_endproc
  .size  _vpaes_preheat,.-_vpaes_preheat
  ########################################################
  ##                                                    ##
@@ -1058,7 +1094,7 @@ _vpaes_consts:
  .Lk_dsbo:      # decryption sbox final output
         .quad   0x1387EA537EF94000, 0xC7AA6DB9D4943E2D
         .quad   0x12D7560F93441D00, 0xCA4B8159D8C58E9C
-.asciz "Vector Permutaion AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
+.asciz "Vector Permutation AES for x86_64/SSSE3, Mike Hamburg (Stanford University)"
  .align 64
  .size  _vpaes_consts,.-_vpaes_consts
  ___