vpaes-x86.pl: revert previous commit and solve the problem through x86masm.pl.
[openssl.git] / crypto / aes / asm / vpaes-x86.pl
index 1de722b558f57826e0a672b8abebabcc9928f54e..84a6f6d336ca421b9e22de2515b5dbef6e6f45db 100644 (file)
@@ -21,9 +21,9 @@
 # about its alignment...
 #
 # Performance summary. aes-586.pl column lists large-block CBC
-# encrypt/decrypt/with-hypert-hreading-off(*) results in cycles per
-# byte processed with 128-bit key, and vpaes-x86.pl column -
-# encrypt/decrypt.
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
 #
 #              aes-586.pl              vpaes-x86.pl
 #
 #
 # (***)        Less impressive improvement on Core 2 and Atom is due to slow
 #      pshufb, yet it's respectable +32%/65%  improvement on Core 2
-#      and +58%/40% on Atom.
+#      and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+#      code path).
 #
-#                                              <appro@openss.org>
+#                                              <appro@openssl.org>
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 push(@INC,"${dir}","${dir}../../perlasm");
@@ -51,7 +52,7 @@ require "x86asm.pl";
 
 &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
 
-$PREFIX="AES";
+$PREFIX="vpaes";
 
 my  ($round, $base, $magic, $key, $const, $inp, $out)=
     ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
@@ -152,7 +153,7 @@ $k_dsbe=0x2a0;              # decryption sbox output *E*u, *E*t
 $k_dsbo=0x2c0;         # decryption sbox final output
        &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
        &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
-&asciz ("Vector Permutation AES for x86, Mike Hamburg (Stanford University)");
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
 &align (64);
 
 &function_begin_B("_vpaes_preheat");
@@ -275,7 +276,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
        &shl    ($magic,4);
        &pand   ("xmm0","xmm6");
        &pshufb ("xmm2","xmm0");
-       &movdqa ("xmm0",&DWP($k_dipt-$k_dsbd+16,$base));
+       &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
        &xor    ($magic,0x30);
        &pshufb ("xmm0","xmm1");
        &and    ($magic,0x30);