{md5,rc4}/asm/*-x86_64.pl: add CFI annotations.

[openssl.git] / crypto / rc4 / asm / rc4-x86_64.pl
diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl

index 02bfac34f29fa9cec1902f8ac709ff4056e6e6a2..9ccb70eeb47b98e0f61ce560adbd2739951fd22a 100755 (executable)
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -1,4 +1,11 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
  #
  # ====================================================================
  # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
@@ -41,7 +48,7 @@
  
  # April 2005
  #
-# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing 
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
  # those with add/sub results in 50% performance improvement of folded
  # loop...
  
@@ -50,13 +57,13 @@
  # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
  # performance by >30% [unlike P4 32-bit case that is]. But this is
  # provided that loads are reordered even more aggressively! Both code
-# pathes, AMD64 and EM64T, reorder loads in essentially same manner
+# paths, AMD64 and EM64T, reorder loads in essentially same manner
  # as my IA-64 implementation. On Opteron this resulted in modest 5%
  # improvement [I had to test it], while final Intel P4 performance
  # achieves respectful 432MBps on 2.8GHz processor now. For reference.
  # If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
  # RC4_INT code-path. While if executed on Opteron, it's only 25%
-# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
  # is not implemented, then this final RC4_CHAR code-path should be
  # preferred, as it provides better *all-round* performance].
  
@@ -86,14 +93,20 @@
  # is better) and improvement coefficients relative to previous
  # version of this module are:
  #
-# Opteron      5.3/+0%
+# Opteron      5.3/+0%(*)
  # P4           6.5
-# Core2                6.2/+15%(*)
+# Core2                6.2/+15%(**)
  # Westmere     4.2/+60%
  # Sandy Bridge 4.2/+120%
  # Atom         9.3/+80%
+# VIA Nano     6.4/+4%
+# Ivy Bridge   4.1/+30%
+# Bulldozer    4.5/+30%(*)
  #
-# (*)  Note that Core2 result is ~15% lower than corresponding result
+# (*)  But corresponding loop has less instructions, which should have
+#      positive effect on upcoming Bulldozer, which has one less ALU.
+#      For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
  #      for 32-bit code, meaning that it's possible to improve it,
  #      but more than likely at the cost of the others (see rc4-586.pl
  #      to get the idea)...
@@ -109,7 +122,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  die "can't locate x86_64-xlate.pl";
  
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
  
  $dat="%rdi";       # arg1
  $len="%rsi";       # arg2
@@ -119,6 +133,7 @@ $out="%rcx";            # arg4
  {
  $code=<<___;
  .text
+.extern        OPENSSL_ia32cap_P
  
  .globl RC4
  .type  RC4,\@function,4
@@ -127,9 +142,13 @@ RC4:       or      $len,$len
         jne     .Lentry
         ret
  .Lentry:
+.cfi_startproc
         push    %rbx
+.cfi_push      %rbx
         push    %r12
+.cfi_push      %r12
         push    %r13
+.cfi_push      %r13
  .Lprologue:
         mov     $len,%r11
         mov     $inp,%r12
@@ -412,11 +431,16 @@ $code.=<<___;
         movl    $YY#d,-4($dat)
  
         mov     (%rsp),%r13
+.cfi_restore   %r13
         mov     8(%rsp),%r12
+.cfi_restore   %r12
         mov     16(%rsp),%rbx
+.cfi_restore   %rbx
         add     \$24,%rsp
+.cfi_adjust_cfa_offset -24
  .Lepilogue:
         ret
+.cfi_endproc
  .size  RC4,.-RC4
  ___
  }
@@ -425,7 +449,6 @@ $idx="%r8";
  $ido="%r9";
  
  $code.=<<___;
-.extern        OPENSSL_ia32cap_P
  .globl RC4_set_key
  .type  RC4_set_key,\@function,3
  .align 16