-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 1998-2020 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
# ====================================================================
-# [Re]written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# [Re]written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# performance on the same Opteron machine.
# (**) This number requires compressed key schedule set up by
# RC4_set_key [see commentary below for further details].
-#
-# <appro@fy.chalmers.se>
# May 2011
#
# Westmere 5.1/+94%(**)
# Sandy Bridge 5.0/+8%
# Atom 12.6/+6%
+# VIA Nano 6.4/+9%
+# Ivy Bridge 4.9/±0%
+# Bulldozer 4.9/+15%
#
# (*) PIII can actually deliver 6.6 cycles per byte with MMX code,
# but this specific code performs poorly on Core2. And vice
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
-&asm_init($ARGV[0],"rc4-586.pl");
+$output = pop and open STDOUT,">$output";
+
+&asm_init($ARGV[0],$x86only = $ARGV[$#ARGV] eq "386");
$xx="eax";
$yy="ebx";
push (@XX,shift(@XX)) if ($i>=0);
}
} else {
- # Using pinsrw here improves performane on Intel CPUs by 2-3%, but
+ # Using pinsrw here improves performance on Intel CPUs by 2-3%, but
# brings down AMD by 7%...
$RC4_loop_mmx = sub {
my $i=shift;
&movd ($i>0?"mm1":"mm2",&DWP(0,$dat,$ty,4));
# (*) This is the key to Core2 and Westmere performance.
- # Whithout movz out-of-order execution logic confuses
+ # Without movz out-of-order execution logic confuses
# itself and fails to reorder loads and stores. Problem
# appears to be fixed in Sandy Bridge...
}
&and ($ty,-4); # how many 4-byte chunks?
&jz (&label("loop1"));
- &test ($ty,-8);
&mov (&wparam(3),$out); # $out as accumulator in these loops
+ if ($x86only) {
+ &jmp (&label("go4loop4"));
+ } else {
+ &test ($ty,-8);
&jz (&label("go4loop4"));
&picmeup($out,"OPENSSL_ia32cap_P");
&cmp ($inp,&wparam(1)); # compare to input+len
&je (&label("done"));
&jmp (&label("loop1"));
+ }
&set_label("go4loop4",16);
&lea ($ty,&DWP(-4,$inp,$ty));
&asm_finish();
+close STDOUT or die "error closing STDOUT: $!";