Make the RSA structure opaque

[openssl.git] / crypto / aes / asm / bsaes-x86_64.pl
diff --git a/crypto/aes/asm/bsaes-x86_64.pl b/crypto/aes/asm/bsaes-x86_64.pl

index 8cde67681e48a943df570201c44c987ea31be979..3f7d33c45bce7154a54789eb6c1a6902b17ad3fa 100644 (file)
--- a/crypto/aes/asm/bsaes-x86_64.pl
+++ b/crypto/aes/asm/bsaes-x86_64.pl
@@ -38,8 +38,9 @@
  #              Emilia's        this(*)         difference
  #
  # Core 2       9.30            8.69            +7%
-# Nehalem(**)  7.63            6.98            +9%
-# Atom         17.1            17.4            -2%(***)
+# Nehalem(**)  7.63            6.88            +11%
+# Atom         17.1            16.4            +4%
+# Silvermont   -               12.9
  #
  # (*)  Comparison is not completely fair, because "this" is ECB,
  #      i.e. no extra processing such as counter values calculation
@@ -50,14 +51,6 @@
  # (**) Results were collected on Westmere, which is considered to
  #      be equivalent to Nehalem for this code.
  #
-# (***)        Slowdown on Atom is rather strange per se, because original
-#      implementation has a number of 9+-bytes instructions, which
-#      are bad for Atom front-end, and which I eliminated completely.
-#      In attempt to address deterioration sbox() was tested in FP
-#      SIMD "domain" (movaps instead of movdqa, xorps instead of
-#      pxor, etc.). While it resulted in nominal 4% improvement on
-#      Atom, it hurted Westmere by more than 2x factor.
-#
  # As for key schedule conversion subroutine. Interface to OpenSSL
  # relies on per-invocation on-the-fly conversion. This naturally
  # has impact on performance, especially for short inputs. Conversion
@@ -67,7 +60,7 @@
  #              conversion      conversion/8x block
  # Core 2       240             0.22
  # Nehalem      180             0.20
-# Atom         430             0.19
+# Atom         430             0.20
  #
  # The ratio values mean that 128-byte blocks will be processed
  # 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
@@ -83,9 +76,10 @@
  # Add decryption procedure. Performance in CPU cycles spent to decrypt
  # one byte out of 4096-byte buffer with 128-bit key is:
  #
-# Core 2       9.83
-# Nehalem      7.74
-# Atom         18.9 (estimated, not measured yet)
+# Core 2       9.98
+# Nehalem      7.80
+# Atom         17.9
+# Silvermont   14.0
  #
  # November 2011.
  #
@@ -434,21 +428,21 @@ my $mask=pop;
  $code.=<<___;
         pxor    0x00($key),@x[0]
         pxor    0x10($key),@x[1]
-       pshufb  $mask,@x[0]
         pxor    0x20($key),@x[2]
-       pshufb  $mask,@x[1]
         pxor    0x30($key),@x[3]
-       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[0]
+       pshufb  $mask,@x[1]
         pxor    0x40($key),@x[4]
-       pshufb  $mask,@x[3]
         pxor    0x50($key),@x[5]
-       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[2]
+       pshufb  $mask,@x[3]
         pxor    0x60($key),@x[6]
-       pshufb  $mask,@x[5]
         pxor    0x70($key),@x[7]
+       pshufb  $mask,@x[4]
+       pshufb  $mask,@x[5]
         pshufb  $mask,@x[6]
-       lea     0x80($key),$key
         pshufb  $mask,@x[7]
+       lea     0x80($key),$key
  ___
  }
  
@@ -820,18 +814,18 @@ _bsaes_encrypt8:
         movdqa  0x50($const), @XMM[8]   # .LM0SR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
          pshufb @XMM[8], @XMM[7]
  _bsaes_encrypt8_bitslice:
@@ -884,18 +878,18 @@ _bsaes_decrypt8:
         movdqa  -0x30($const), @XMM[8]  # .LM0ISR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
          pshufb @XMM[8], @XMM[7]
  ___
@@ -1937,21 +1931,21 @@ $code.=<<___;
         movdqa  -0x10(%r11), @XMM[8]    # .LSWPUPM0SR
         pxor    @XMM[9], @XMM[0]        # xor with round0 key
         pxor    @XMM[9], @XMM[1]
-        pshufb @XMM[8], @XMM[0]
         pxor    @XMM[9], @XMM[2]
-        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[3]
-        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[0]
+        pshufb @XMM[8], @XMM[1]
         pxor    @XMM[9], @XMM[4]
-        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[5]
-        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[2]
+        pshufb @XMM[8], @XMM[3]
         pxor    @XMM[9], @XMM[6]
-        pshufb @XMM[8], @XMM[5]
         pxor    @XMM[9], @XMM[7]
+        pshufb @XMM[8], @XMM[4]
+        pshufb @XMM[8], @XMM[5]
          pshufb @XMM[8], @XMM[6]
-       lea     .LBS0(%rip), %r11       # constants table
          pshufb @XMM[8], @XMM[7]
+       lea     .LBS0(%rip), %r11       # constants table
         mov     %ebx,%r10d              # pass rounds
  
         call    _bsaes_encrypt8_bitslice
@@ -2089,6 +2083,8 @@ ___
  #      const unsigned char iv[16]);
  #
  my ($twmask,$twres,$twtmp)=@XMM[13..15];
+$arg6=~s/d$//;
+
  $code.=<<___;
  .globl bsaes_xts_encrypt
  .type  bsaes_xts_encrypt,\@abi-omnipotent