# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
# processed with 128-bit key. And given their throughput asymptotic
# performance for parallelizable modes is 1.25 cycles per byte. Being
# but how close does one get? Below are results collected for
# different modes and block sized. Pairs of numbers are for en-/
# decryption.
# but how close does one get? Below are results collected for
# different modes and block sized. Pairs of numbers are for en-/
# decryption.
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
# overhead affects small-block performance, as well as OFB and CFB
# results. Differences are not large, most common coefficients are
# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
-# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB, CTR)...
+# observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
my ($p,$key,$rounds,$inout)=@_; $inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
my ($p,$key,$rounds,$inout)=@_; $inout=$inout0 if (!defined($inout));
++$sn;
$code.=<<___;
mov 240($key),$rounds # pull $rounds
___
&aesni_generate1("enc",$key,$rounds);
mov 240($key),$rounds # pull $rounds
___
&aesni_generate1("enc",$key,$rounds);
mov 240($key),$rounds # pull $rounds
___
&aesni_generate1("dec",$key,$rounds);
mov 240($key),$rounds # pull $rounds
___
&aesni_generate1("dec",$key,$rounds);
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label