[bs|vp]aes-x86[_64].pl: typos and clarifications.

[openssl.git] / crypto / aes / asm / vpaes-x86.pl
diff --git a/crypto/aes/asm/vpaes-x86.pl b/crypto/aes/asm/vpaes-x86.pl

index efe68dff41bc8bb052fd5e209983415b9963e30c..84a6f6d336ca421b9e22de2515b5dbef6e6f45db 100644 (file)
--- a/crypto/aes/asm/vpaes-x86.pl
+++ b/crypto/aes/asm/vpaes-x86.pl
@@ -21,9 +21,9 @@
  # about its alignment...
  #
  # Performance summary. aes-586.pl column lists large-block CBC
-# encrypt/decrypt/with-hypert-hreading-off(*) results in cycles per
-# byte processed with 128-bit key, and vpaes-x86.pl column -
-# encrypt/decrypt.
+# encrypt/decrypt/with-hyper-threading-off(*) results in cycles per
+# byte processed with 128-bit key, and vpaes-x86.pl column - [also
+# large-block CBC] encrypt/decrypt.
  #
  #              aes-586.pl              vpaes-x86.pl
  #
@@ -41,7 +41,8 @@
  #
  # (***)        Less impressive improvement on Core 2 and Atom is due to slow
  #      pshufb, yet it's respectable +32%/65%  improvement on Core 2
-#      and +58%/40% on Atom.
+#      and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+#      code path).
  #
  #                                              <appro@openssl.org>
  
@@ -51,7 +52,7 @@ require "x86asm.pl";
  
  &asm_init($ARGV[0],"vpaes-x86.pl",$x86only = $ARGV[$#ARGV] eq "386");
  
-$PREFIX="AES";
+$PREFIX="vpaes";
  
  my  ($round, $base, $magic, $key, $const, $inp, $out)=
      ("eax",  "ebx", "ecx",  "edx","ebp",  "esi","edi");
@@ -152,7 +153,7 @@ $k_dsbe=0x2a0;              # decryption sbox output *E*u, *E*t
  $k_dsbo=0x2c0;         # decryption sbox final output
         &data_word(0x7EF94000,0x1387EA53,0xD4943E2D,0xC7AA6DB9);
         &data_word(0x93441D00,0x12D7560F,0xD8C58E9C,0xCA4B8159);
-&asciz ("Vector Permutation AES for x86, Mike Hamburg (Stanford University)");
+&asciz ("Vector Permutation AES for x86/SSSE3, Mike Hamburg (Stanford University)");
  &align (64);
  
  &function_begin_B("_vpaes_preheat");
@@ -275,7 +276,7 @@ $k_dsbo=0x2c0;              # decryption sbox final output
         &shl    ($magic,4);
         &pand   ("xmm0","xmm6");
         &pshufb ("xmm2","xmm0");
-       &movdqa ("xmm0",&DWP($k_dipt-$k_dsbd+16,$base));
+       &movdqa ("xmm0",&QWP($k_dipt-$k_dsbd+16,$base));
         &xor    ($magic,0x30);
         &pshufb ("xmm0","xmm1");
         &and    ($magic,0x30);