-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
#
# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# April 2005
#
-# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
# those with add/sub results in 50% performance improvement of folded
# loop...
# As was shown by Zou Nanhai, loop unrolling can improve Intel EM64T
# performance by >30% [unlike P4 32-bit case that is]. But this is
# provided that loads are reordered even more aggressively! Both code
-# pathes, AMD64 and EM64T, reorder loads in essentially same manner
+# paths, AMD64 and EM64T, reorder loads in essentially the same manner
# as my IA-64 implementation. On Opteron this resulted in modest 5%
# improvement [I had to test it], while final Intel P4 performance
# achieves respectable 432MBps on 2.8GHz processor now. For reference.
# If executed on Xeon, current RC4_CHAR code-path is 2.7x faster than
# RC4_INT code-path. While if executed on Opteron, it's only 25%
-# slower than the RC4_INT one [meaning that if CPU µ-arch detection
+# slower than the RC4_INT one [meaning that if CPU µ-arch detection
# is not implemented, then this final RC4_CHAR code-path should be
# preferred, as it provides better *all-round* performance].
# is better) and improvement coefficients relative to previous
# version of this module are:
#
-# Opteron 5.3/+0%
+# Opteron 5.3/+0%(*)
# P4 6.5
-# Core2 6.2/+15%(*)
+# Core2 6.2/+15%(**)
# Westmere 4.2/+60%
# Sandy Bridge 4.2/+120%
# Atom 9.3/+80%
+# VIA Nano 6.4/+4%
+# Ivy Bridge 4.1/+30%
+# Bulldozer 4.5/+30%(*)
#
-# (*) Note that Core2 result is ~15% lower than corresponding result
+# (*) But corresponding loop has fewer instructions, which should have
+# positive effect on upcoming Bulldozer, which has one less ALU.
+# For reference, Intel code runs at 6.8 cpb rate on Opteron.
+# (**) Note that Core2 result is ~15% lower than corresponding result
# for 32-bit code, meaning that it's possible to improve it,
# but more than likely at the cost of the others (see rc4-586.pl
# to get the idea)...
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
-open STDOUT,"| $^X $xlate $flavour $output";
+# Pipe everything printed to STDOUT through the x86_64-xlate.pl translator.
+# The interpreter, script and output paths are quoted so that paths with
+# spaces survive the shell; stop if the pipe cannot be opened instead of
+# carrying on with an unopened handle.
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
+*STDOUT=*OUT;
# Register names interpolated into the assembly templates below; the
# trailing comments record which argument position each one stands for.
$dat="%rdi"; # arg1
$len="%rsi"; # arg2
# Assembly template for the RC4 entry point (only part of it is visible in
# this excerpt). The visible lines declare the symbol, push %rbx/%r12/%r13
# with matching .cfi_push annotations, and later reload the three registers
# from the stack and release the 24 bytes with matching .cfi_restore and
# .cfi_adjust_cfa_offset annotations before returning.
{
$code=<<___;
.text
+.extern OPENSSL_ia32cap_P
.globl RC4
.type RC4,\@function,4
jne .Lentry
ret
.Lentry:
+.cfi_startproc
push %rbx
+.cfi_push %rbx
push %r12
+.cfi_push %r12
push %r13
+.cfi_push %r13
.Lprologue:
mov $len,%r11
mov $inp,%r12
movl $YY#d,-4($dat)
mov (%rsp),%r13
+.cfi_restore %r13
mov 8(%rsp),%r12
+.cfi_restore %r12
mov 16(%rsp),%rbx
+.cfi_restore %rbx
add \$24,%rsp
+.cfi_adjust_cfa_offset -24
.Lepilogue:
ret
+.cfi_endproc
.size RC4,.-RC4
___
}
$ido="%r9";
$code.=<<___;
-.extern OPENSSL_ia32cap_P
.globl RC4_set_key
.type RC4_set_key,\@function,3
.align 16