# about its alignment...
#
# Performance summary. aes-586.pl column lists large-block CBC
-# encrypt/decrypt/with-hypert-hreading-off(*) results in cycles per
-# byte processed with 128-bit key, and vpaes-x86.pl column -
-# encrypt/decrypt.
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
#
# aes-586.pl vpaes-x86.pl
#
#
# (***) Less impressive improvement on Core 2 and Atom is due to slow
# pshufb, yet it's respectable +32%/65% improvement on Core 2
-# and +58%/40% on Atom.
+# and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+# code path).
#
# <appro@openssl.org>
&asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
-$PREFIX="AES";
+$PREFIX="vpaes";
my ($round, $base, $magic, $key, $const, $inp, $out)=
("eax", "ebx", "ecx", "edx","ebp", "esi","edi");
$k_dsbo=0x2c0; # decryption sbox final output
&data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
&data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
-&asciz ("Vector Permutation AES for x86, Mike Hamburg (Stanford University)");
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
&align (64);
&function_begin_B("_vpaes_preheat");
&shl ($magic,4);
&pand ("xmm0","xmm6");
&pshufb ("xmm2","xmm0");
- &movdqa ("xmm0",&DWP($k_dipt-$k_dsbd+16,$base));
+ &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
&xor ($magic,0x30);
&pshufb ("xmm0","xmm1");
&and ($magic,0x30);