aesni-sha1-x86_64.pl: update performance data.

author Andy Polyakov <appro@openssl.org>

Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)

committer Andy Polyakov <appro@openssl.org>

Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)
author Andy Polyakov <appro@openssl.org>
Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)
committer Andy Polyakov <appro@openssl.org>
Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)
diff --git a/crypto/aes/asm/aesni-sha1-x86_64.pl b/crypto/aes/asm/aesni-sha1-x86_64.pl

index 031e19be653dff6b0433c00668eeaba3b594de3f..2d1a222ac246221fcfc6b65b08c242cec34b5e0b 100644 (file)
--- a/crypto/aes/asm/aesni-sha1-x86_64.pl
+++ b/crypto/aes/asm/aesni-sha1-x86_64.pl
@@ -22,20 +22,23 @@
  #
  #              AES-128-CBC     +SHA1           stitch      gain
  # Westmere     3.77[+5.5]      9.26            6.58        +41%
  #
  #              AES-128-CBC     +SHA1           stitch      gain
  # Westmere     3.77[+5.5]      9.26            6.58        +41%
-# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21)    6.09(7.05)  +65%(+59%)
+# Sandy Bridge 5.05[+5.0(6.2)] 10.06(11.21)    5.98(7.05)  +68%(+59%)
  # Ivy Bridge   5.05[+4.6]      9.65            5.54        +74%
  # Ivy Bridge   5.05[+4.6]      9.65            5.54        +74%
+# Haswell      4.43[+3.6(4.4)] 8.00(8.80)      4.55(5.21)  +75%(+69%)
  # Bulldozer    5.77[+6.0]      11.72           6.37        +84%
  #
  #              AES-192-CBC
  # Westmere     4.51            10.00           6.87        +46%
  # Sandy Bridge 6.05            11.06(12.21)    6.11(7.20)  +81%(+70%)
  # Ivy Bridge   6.05            10.65           6.07        +75%
  # Bulldozer    5.77[+6.0]      11.72           6.37        +84%
  #
  #              AES-192-CBC
  # Westmere     4.51            10.00           6.87        +46%
  # Sandy Bridge 6.05            11.06(12.21)    6.11(7.20)  +81%(+70%)
  # Ivy Bridge   6.05            10.65           6.07        +75%
+# Haswell      5.29            8.86(9.65)      5.32(5.32)  +67%(+81%)
  # Bulldozer    6.89            12.84           6.96        +84%
  #
  #              AES-256-CBC
  # Westmere     5.25            10.74           7.19        +49%
  # Sandy Bridge 7.05            12.06(13.21)    7.12(7.68)  +69%(+72%)
  # Ivy Bridge   7.05            11.65           7.12        +64%
  # Bulldozer    6.89            12.84           6.96        +84%
  #
  #              AES-256-CBC
  # Westmere     5.25            10.74           7.19        +49%
  # Sandy Bridge 7.05            12.06(13.21)    7.12(7.68)  +69%(+72%)
  # Ivy Bridge   7.05            11.65           7.12        +64%
+# Haswell      6.19            9.76(10.6)      6.21(6.41)  +57%(+65%)
  # Bulldozer    8.00            13.95           8.25        +69%
  #
  # (*)  There are two code paths: SSSE3 and AVX. See sha1-568.pl for
  # Bulldozer    8.00            13.95           8.25        +69%
  #
  # (*)  There are two code paths: SSSE3 and AVX. See sha1-568.pl for
@@ -51,9 +54,10 @@
  # standalone AESNI-CBC decrypt:
  #
  #              AES-128-CBC     AES-192-CBC     AES-256-CBC
  # standalone AESNI-CBC decrypt:
  #
  #              AES-128-CBC     AES-192-CBC     AES-256-CBC
-# Westmere     1.31            1.55            1.80
+# Westmere     1.25            1.50            1.75
  # Sandy Bridge 0.74            0.91            1.09
  # Ivy Bridge   0.74            0.90            1.11
  # Sandy Bridge 0.74            0.91            1.09
  # Ivy Bridge   0.74            0.90            1.11
+# Haswell      0.63            0.76            0.88
  # Bulldozer    0.70            0.85            0.99
  
  $flavour = shift;
  # Bulldozer    0.70            0.85            0.99
  
  $flavour = shift;
@@ -94,7 +98,7 @@ $code.=<<___;
  
  .globl aesni_cbc_sha1_enc
  .type  aesni_cbc_sha1_enc,\@abi-omnipotent
  
  .globl aesni_cbc_sha1_enc
  .type  aesni_cbc_sha1_enc,\@abi-omnipotent
-.align 16
+.align 32
  aesni_cbc_sha1_enc:
         # caller should check for SSSE3 and AES-NI bits
         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
  aesni_cbc_sha1_enc:
         # caller should check for SSSE3 and AES-NI bits
         mov     OPENSSL_ia32cap_P+0(%rip),%r10d
@@ -144,7 +148,7 @@ my $_ror=sub { &ror(@_) };
  
  $code.=<<___;
  .type  aesni_cbc_sha1_enc_ssse3,\@function,6
  
  $code.=<<___;
  .type  aesni_cbc_sha1_enc_ssse3,\@function,6
-.align 16
+.align 32
  aesni_cbc_sha1_enc_ssse3:
         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
         #shr    \$6,$len                        # debugging artefact
  aesni_cbc_sha1_enc_ssse3:
         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
         #shr    \$6,$len                        # debugging artefact
@@ -570,7 +574,7 @@ sub body_40_59 () { # ((b^c)&(c^d))^c
      return @r;
  }
  $code.=<<___;
      return @r;
  }
  $code.=<<___;
-.align 16
+.align 32
  .Loop_ssse3:
  ___
         &Xupdate_ssse3_16_31(\&body_00_19);
  .Loop_ssse3:
  ___
         &Xupdate_ssse3_16_31(\&body_00_19);
@@ -618,7 +622,6 @@ $code.=<<___;
         and     @T[1],@T[0]
         jmp     .Loop_ssse3
  
         and     @T[1],@T[0]
         jmp     .Loop_ssse3
  
-.align 16
  .Ldone_ssse3:
  ___
                                 $jj=$j=$saved_j; @V=@saved_V;
  .Ldone_ssse3:
  ___
                                 $jj=$j=$saved_j; @V=@saved_V;
@@ -689,7 +692,7 @@ my $_ror=sub { &shrd(@_[0],@_) };
  
  $code.=<<___;
  .type  aesni_cbc_sha1_enc_avx,\@function,6
  
  $code.=<<___;
  .type  aesni_cbc_sha1_enc_avx,\@function,6
-.align 16
+.align 32
  aesni_cbc_sha1_enc_avx:
         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
         #shr    \$6,$len                        # debugging artefact
  aesni_cbc_sha1_enc_avx:
         mov     `($win64?56:8)`(%rsp),$inp      # load 7th argument
         #shr    \$6,$len                        # debugging artefact
@@ -773,14 +776,14 @@ my $aesenc=sub {
    my ($n,$k)=($r/10,$r%10);
      if ($k==0) {
        $code.=<<___;
    my ($n,$k)=($r/10,$r%10);
      if ($k==0) {
        $code.=<<___;
-       vmovups         `16*$n`($in0),$in               # load input
-       vxorps          $rndkey[1],$in,$in
+       vmovdqu         `16*$n`($in0),$in               # load input
+       vpxor           $rndkey[1],$in,$in
  ___
        $code.=<<___ if ($n);
         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
  ___
        $code.=<<___;
  ___
        $code.=<<___ if ($n);
         vmovups         $iv,`16*($n-1)`($out,$in0)      # write output
  ___
        $code.=<<___;
-       vxorps          $in,$iv,$iv
+       vpxor           $in,$iv,$iv
         vaesenc         $rndkey[0],$iv,$iv
         vmovups         `32+16*$k-112`($key),$rndkey[1]
  ___
         vaesenc         $rndkey[0],$iv,$iv
         vmovups         `32+16*$k-112`($key),$rndkey[1]
  ___
@@ -1014,7 +1017,7 @@ sub Xtail_avx()
  }
  
  $code.=<<___;
  }
  
  $code.=<<___;
-.align 16
+.align 32
  .Loop_avx:
  ___
         &Xupdate_avx_16_31(\&body_00_19);
  .Loop_avx:
  ___
         &Xupdate_avx_16_31(\&body_00_19);
@@ -1062,7 +1065,6 @@ $code.=<<___;
         and     @T[1],@T[0]
         jmp     .Loop_avx
  
         and     @T[1],@T[0]
         jmp     .Loop_avx
  
-.align 16
  .Ldone_avx:
  ___
                                 $jj=$j=$saved_j; @V=@saved_V;
  .Ldone_avx:
  ___
                                 $jj=$j=$saved_j; @V=@saved_V;
author	Andy Polyakov <appro@openssl.org>
	Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Mon, 10 Jun 2013 20:35:22 +0000 (22:35 +0200)