X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Frc4%2Fasm%2Frc4-x86_64.pl;h=5a903f5739e123ad519e93c8cc132d7a1ca9990c;hp=46751064ea0f2665ae53efe7b43532c9fdfc98c1;hb=HEAD;hpb=053fa39af62f5b3543ebec8592e4592965b18e26

diff --git a/crypto/rc4/asm/rc4-x86_64.pl b/crypto/rc4/asm/rc4-x86_64.pl
index 46751064ea..83a1d13635 100755
--- a/crypto/rc4/asm/rc4-x86_64.pl
+++ b/crypto/rc4/asm/rc4-x86_64.pl
@@ -1,7 +1,14 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2005-2023 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 #
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
@@ -16,7 +23,7 @@
 # Presumably it has everything to do with AMD cache architecture and
 # RAW or whatever penalties. Once again! The module *requires* config
 # line *without* RC4_CHAR! As for coding "secret," I bet on partial
-# register arithmetics. For example instead of 'inc %r8; and $255,%r8'
+# register arithmetic. For example instead of 'inc %r8; and $255,%r8'
 # I simply 'inc %r8b'. Even though optimization manual discourages
 # to operate on partial registers, it turned out to be the best bet.
 # At least for AMD... How IA32E would perform remains to be seen...
@@ -41,7 +48,7 @@
 
 # April 2005
 #
-# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
+# P4 EM64T core appears to be "allergic" to 64-bit inc/dec. Replacing
 # those with add/sub results in 50% performance improvement of folded
 # loop...
 
@@ -50,7 +57,7 @@
 # As was shown by Zou Nanhai loop unrolling can improve Intel EM64T
 # performance by >30% [unlike P4 32-bit case that is]. But this is
 # provided that loads are reordered even more aggressively! Both code
-# pathes, AMD64 and EM64T, reorder loads in essentially same manner
+# paths, AMD64 and EM64T, reorder loads in essentially same manner
 # as my IA-64 implementation. On Opteron this resulted in modest 5%
 # improvement [I had to test it], while final Intel P4 performance
 # achieves respectful 432MBps on 2.8GHz processor now. For reference.
@@ -81,7 +88,7 @@
 # The only code path that was not modified is P4-specific one. Non-P4
 # Intel code path optimization is heavily based on submission by Maxim
 # Perminov, Maxim Locktyukhin and Jim Guilford of Intel. I've used
-# some of the ideas even in attempt to optmize the original RC4_INT
+# some of the ideas even in attempt to optimize the original RC4_INT
 # code path... Current performance in cycles per processed byte (less
 # is better) and improvement coefficients relative to previous
 # version of this module are:
@@ -104,9 +111,10 @@
 # but more than likely at the cost of the others (see rc4-586.pl
 # to get the idea)...
 
-$flavour = shift;
-$output = shift;
-if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 
 $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
 
@@ -115,7 +123,8 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
 die "can't locate x86_64-xlate.pl";
 
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
+    or die "can't call $xlate: $!";
 *STDOUT=*OUT;
 
 $dat="%rdi";	# arg1
@@ -131,13 +140,19 @@ $code=<<___;
 .globl	RC4
 .type	RC4,\@function,4
 .align	16
-RC4:	or	$len,$len
+RC4:
+.cfi_startproc
+	endbranch
+	or	$len,$len
 	jne	.Lentry
 	ret
 .Lentry:
 	push	%rbx
+.cfi_push	%rbx
 	push	%r12
+.cfi_push	%r12
 	push	%r13
+.cfi_push	%r13
 .Lprologue:
 	mov	$len,%r11
 	mov	$inp,%r12
@@ -420,11 +435,16 @@ $code.=<<___;
 	movl	$YY#d,-4($dat)
 
 	mov	(%rsp),%r13
+.cfi_restore	%r13
 	mov	8(%rsp),%r12
+.cfi_restore	%r12
 	mov	16(%rsp),%rbx
+.cfi_restore	%rbx
 	add	\$24,%rsp
+.cfi_adjust_cfa_offset	-24
 .Lepilogue:
 	ret
+.cfi_endproc
 .size	RC4,.-RC4
 ___
 }
@@ -437,6 +457,8 @@ $code.=<<___;
 .type	RC4_set_key,\@function,3
 .align	16
 RC4_set_key:
+.cfi_startproc
+	endbranch
 	lea	8($dat),$dat
 	lea	($inp,$len),$inp
 	neg	$len
@@ -503,12 +525,15 @@ RC4_set_key:
 	mov	%eax,-8($dat)
 	mov	%eax,-4($dat)
 	ret
+.cfi_endproc
 .size	RC4_set_key,.-RC4_set_key
 
 .globl	RC4_options
 .type	RC4_options,\@abi-omnipotent
 .align	16
 RC4_options:
+.cfi_startproc
+	endbranch
 	lea	.Lopts(%rip),%rax
 	mov	OPENSSL_ia32cap_P(%rip),%edx
 	bt	\$20,%edx
@@ -521,6 +546,7 @@ RC4_options:
 	add	\$12,%rax
 .Ldone:
 	ret
+.cfi_endproc
 .align	64
 .Lopts:
 .asciz	"rc4(8x,int)"
@@ -677,4 +703,4 @@ $code =~ s/\`([^\`]*)\`/eval $1/gem;
 
 print $code;
 
-close STDOUT;
+close STDOUT or die "error closing STDOUT: $!";
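
Note (not part of the patch): a minimal, self-contained Perl sketch of the perlasm argument convention that the @@ -104,9 +111,10 @@ hunk above adopts. The script name and the sample invocations in the comments are hypothetical, for illustration only.

#!/usr/bin/env perl
# Sketch of the convention introduced above:
#   $output  is the last argument if it looks like a file (it has an extension)
#   $flavour is the first argument if it doesn't look like a file
use strict;
use warnings;

my $output  = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop(@ARGV)   : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.|          ? shift(@ARGV) : undef;

# Hypothetical invocations:
#   perl sketch.pl elf rc4-x86_64.S   ->  flavour "elf", output "rc4-x86_64.S"
#   perl sketch.pl mingw64            ->  flavour "mingw64", output to STDOUT
printf "flavour=%s output=%s\n", $flavour // "(none)", $output // "(stdout)";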