X-Git-Url: https://git.openssl.org/?p=openssl.git;a=blobdiff_plain;f=crypto%2Frc4%2Fasm%2Frc4-586.pl;h=1fae93ae1a7026051a6343701ab9e8f276758312;hp=84f1a798cb8eca08c4314589966a01e1847334b0;hb=HEAD;hpb=a908b711ac0ee74b2005a3e84e5099acbaecee60 diff --git a/crypto/rc4/asm/rc4-586.pl b/crypto/rc4/asm/rc4-586.pl index 84f1a798cb..1fae93ae1a 100644 --- a/crypto/rc4/asm/rc4-586.pl +++ b/crypto/rc4/asm/rc4-586.pl @@ -1,7 +1,14 @@ -#!/usr/bin/env perl +#! /usr/bin/env perl +# Copyright 1998-2020 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + # ==================================================================== -# [Re]written by Andy Polyakov for the OpenSSL +# [Re]written by Andy Polyakov for the OpenSSL # project. The module is, however, dual licensed under OpenSSL and # CRYPTOGAMS licenses depending on where you obtain it. For further # details see http://www.openssl.org/~appro/cryptogams/. @@ -25,8 +32,6 @@ # performance on the same Opteron machine. # (**) This number requires compressed key schedule set up by # RC4_set_key [see commentary below for further details]. -# -# # May 2011 # @@ -43,6 +48,9 @@ # Westmere 5.1/+94%(**) # Sandy Bridge 5.0/+8% # Atom 12.6/+6% +# VIA Nano 6.4/+9% +# Ivy Bridge 4.9/±0% +# Bulldozer 4.9/+15% # # (*) PIII can actually deliver 6.6 cycles per byte with MMX code, # but this specific code performs poorly on Core2. And vice @@ -60,7 +68,9 @@ $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; push(@INC,"${dir}","${dir}../../perlasm"); require "x86asm.pl"; -&asm_init($ARGV[0],"rc4-586.pl"); +$output = pop and open STDOUT,">$output"; + +&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386"); $xx="eax"; $yy="ebx"; @@ -123,7 +133,7 @@ if ($alt=0) { push (@XX,shift(@XX)) if ($i>=0); } } else { - # Using pinsrw here improves performane on Intel CPUs by 2-3%, but + # Using pinsrw here improves performance on Intel CPUs by 2-3%, but # brings down AMD by 7%... $RC4_loop_mmx = sub { my $i=shift; @@ -144,7 +154,7 @@ if ($alt=0) { &movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4)); # (*) This is the key to Core2 and Westmere performance. - # Whithout movz out-of-order execution logic confuses + # Without movz out-of-order execution logic confuses # itself and fails to reorder loads and stores. Problem # appears to be fixed in Sandy Bridge... } @@ -184,8 +194,11 @@ if ($alt=0) { &and ($ty,-4); # how many 4-byte chunks? &jz (&label("loop1")); - &test ($ty,-8); &mov (&wparam(3),$out); # $out as accumulator in these loops + if ($x86only) { + &jmp (&label("go4loop4")); + } else { + &test ($ty,-8); &jz (&label("go4loop4")); &picmeup($out,"OPENSSL_ia32cap_P"); @@ -228,6 +241,7 @@ if ($alt=0) { &cmp ($inp,&wparam(1)); # compare to input+len &je (&label("done")); &jmp (&label("loop1")); + } &set_label("go4loop4",16); &lea ($ty,&DWP(-4,$inp,$ty)); @@ -408,3 +422,4 @@ $idx="edx"; &asm_finish(); +close STDOUT or die "error closing STDOUT: $!";