[aesni|sha*]-mb-x86_64.pl: add data prefetching.

author Andy Polyakov <appro@openssl.org>

Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)

committer Andy Polyakov <appro@openssl.org>

Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)
author Andy Polyakov <appro@openssl.org>
Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)
committer Andy Polyakov <appro@openssl.org>
Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)
diff --git a/crypto/aes/asm/aesni-mb-x86_64.pl b/crypto/aes/asm/aesni-mb-x86_64.pl

index 17ed3b2ca9f2b2370f1819e0819da4057dc6389d..e6ac9f9ecfa74feb5adef783b9273f75af5a1671 100644 (file)
--- a/crypto/aes/asm/aesni-mb-x86_64.pl
+++ b/crypto/aes/asm/aesni-mb-x86_64.pl
@@ -15,8 +15,8 @@
  #                      asymptotic      measured
  #                      ---------------------------
  # Westmere             5.00/4=1.25     5.13/4=1.28
-# Atom                 15.0/4=3.75     15.7/4=3.93
-# Sandy Bridge         5.06/4=1.27     5.15/4=1.29
+# Atom                 15.0/4=3.75     ?15.7/4=3.93
+# Sandy Bridge         5.06/4=1.27     5.18/4=1.29
  # Ivy Bridge           5.06/4=1.27     5.14/4=1.29
  # Haswell              4.44/4=1.11     4.44/4=1.11
  # Bulldozer            5.75/4=1.44     5.76/4=1.44
@@ -27,8 +27,8 @@
  #
  #                      asymptotic      measured
  #                      ---------------------------
-# Sandy Bridge         5.06/8=0.64     7.05/8=0.88(*)
-# Ivy Bridge           5.06/8=0.64     7.02/8=0.88(*)
+# Sandy Bridge         5.06/8=0.64     7.10/8=0.89(*)
+# Ivy Bridge           5.06/8=0.64     7.14/8=0.89(*)
  # Haswell              5.00/8=0.63     5.00/8=0.63
  # Bulldozer            5.75/8=0.72     5.77/8=0.72
  #
@@ -188,7 +188,11 @@ $code.=<<___;
         sub     $offset,$sink
  
         aesenc          $rndkey1,@out[0]
+       prefetcht0      31(@inptr[0],$offset)   # prefetch input
+       prefetcht0      31(@inptr[1],$offset)
         aesenc          $rndkey1,@out[1]
+       prefetcht0      31(@inptr[2],$offset)
+       prefetcht0      31(@inptr[3],$offset)
         aesenc          $rndkey1,@out[2]
         aesenc          $rndkey1,@out[3]
         movups          0x30-0x78($key),$rndkey1
@@ -199,8 +203,8 @@ $code.=<<___;
          cmp            `32+4*$i`(%rsp),$one
         aesenc          $rndkey,@out[0]
         aesenc          $rndkey,@out[1]
-        cmovge         $sink,@inptr[$i]        # cancel input
         aesenc          $rndkey,@out[2]
+        cmovge         $sink,@inptr[$i]        # cancel input
          cmovg          $sink,@outptr[$i]       # sink output
         aesenc          $rndkey,@out[3]
         movups          `0x40+16*$i-0x78`($key),$rndkey
@@ -209,7 +213,11 @@ ___
  $code.=<<___;
          movdqa         $counters,$mask
         aesenc          $rndkey0,@out[0]
+       prefetcht0      15(@outptr[0],$offset)  # prefetch output
+       prefetcht0      15(@outptr[1],$offset)
         aesenc          $rndkey0,@out[1]
+       prefetcht0      15(@outptr[2],$offset)
+       prefetcht0      15(@outptr[3],$offset)
         aesenc          $rndkey0,@out[2]
         aesenc          $rndkey0,@out[3]
         movups          0x80-0x78($key),$rndkey0
@@ -260,13 +268,15 @@ $code.=<<___;
         aesenc          $rndkey0,@out[2]
         aesenc          $rndkey0,@out[3]
         movups          0xe0-0x78($key),$rndkey0
+       jmp     .Lenc4x_tail
  
+.align 32
  .Lenc4x_tail:
         aesenc          $rndkey1,@out[0]
         aesenc          $rndkey1,@out[1]
         aesenc          $rndkey1,@out[2]
-        movdqu         (@inptr[0],$offset),@inp[0]
         aesenc          $rndkey1,@out[3]
+        movdqu         (@inptr[0],$offset),@inp[0]
         movdqu          0x10-0x78($key),$rndkey1
  
         aesenclast      $rndkey0,@out[0]
@@ -426,7 +436,11 @@ $code.=<<___;
         sub     $offset,$sink
  
         aesdec          $rndkey1,@out[0]
+       prefetcht0      31(@inptr[0],$offset)   # prefetch input
+       prefetcht0      31(@inptr[1],$offset)
         aesdec          $rndkey1,@out[1]
+       prefetcht0      31(@inptr[2],$offset)
+       prefetcht0      31(@inptr[3],$offset)
         aesdec          $rndkey1,@out[2]
         aesdec          $rndkey1,@out[3]
         movups          0x30-0x78($key),$rndkey1
@@ -447,7 +461,11 @@ ___
  $code.=<<___;
          movdqa         $counters,$mask
         aesdec          $rndkey0,@out[0]
+       prefetcht0      15(@outptr[0],$offset)  # prefetch output
+       prefetcht0      15(@outptr[1],$offset)
         aesdec          $rndkey0,@out[1]
+       prefetcht0      15(@outptr[2],$offset)
+       prefetcht0      15(@outptr[3],$offset)
         aesdec          $rndkey0,@out[2]
         aesdec          $rndkey0,@out[3]
         movups          0x80-0x78($key),$rndkey0
@@ -498,7 +516,9 @@ $code.=<<___;
         aesdec          $rndkey0,@out[2]
         aesdec          $rndkey0,@out[3]
         movups          0xe0-0x78($key),$rndkey0
+       jmp     .Ldec4x_tail
  
+.align 32
  .Ldec4x_tail:
         aesdec          $rndkey1,@out[0]
         aesdec          $rndkey1,@out[1]
@@ -512,12 +532,12 @@ $code.=<<___;
         movdqu          0x20-0x78($key),$rndkey0
  
         aesdeclast      @inp[0],@out[0]
-        movdqu         -16(@inptr[0],$offset),@inp[0]  # load next IV
         aesdeclast      @inp[1],@out[1]
+        movdqu         -16(@inptr[0],$offset),@inp[0]  # load next IV
          movdqu         -16(@inptr[1],$offset),@inp[1]
         aesdeclast      @inp[2],@out[2]
-        movdqu         -16(@inptr[2],$offset),@inp[2]
         aesdeclast      @inp[3],@out[3]
+        movdqu         -16(@inptr[2],$offset),@inp[2]
          movdqu         -16(@inptr[3],$offset),@inp[3]
  
         movups          @out[0],-16(@outptr[0],$offset)
@@ -682,7 +702,13 @@ $code.=<<___ if ($i);
  ___
  $code.=<<___;
         vaesenc         $rndkey,@out[1],@out[1]
+       prefetcht0      31(@ptr[$i])                    # prefetch input
         vaesenc         $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+       prefetcht0      15(@ptr[$i-2])                  # prefetch output
+___
+$code.=<<___;
         vaesenc         $rndkey,@out[3],@out[3]
          lea            (@ptr[$i],$offset),$offset
          cmovge         %rsp,@ptr[$i]                   # cancel input
@@ -703,6 +729,8 @@ ___
  }
  $code.=<<___;
          vmovdqu        32(%rsp),$counters
+       prefetcht0      15(@ptr[$i-2])                  # prefetch output
+       prefetcht0      15(@ptr[$i-1])
         cmp     \$11,$rounds
         jb      .Lenc8x_tail
  
@@ -958,7 +986,13 @@ $code.=<<___ if ($i);
  ___
  $code.=<<___;
         vaesdec         $rndkey,@out[1],@out[1]
+       prefetcht0      31(@ptr[$i])                    # prefetch input
         vaesdec         $rndkey,@out[2],@out[2]
+___
+$code.=<<___ if ($i>1);
+       prefetcht0      15(@ptr[$i-2])                  # prefetch output
+___
+$code.=<<___;
         vaesdec         $rndkey,@out[3],@out[3]
          lea            (@ptr[$i],$offset),$offset
          cmovge         %rsp,@ptr[$i]                   # cancel input
@@ -979,6 +1013,8 @@ ___
  }
  $code.=<<___;
          vmovdqu        32(%rsp),$counters
+       prefetcht0      15(@ptr[$i-2])                  # prefetch output
+       prefetcht0      15(@ptr[$i-1])
         cmp     \$11,$rounds
         jb      .Ldec8x_tail
  
diff --git a/crypto/sha/asm/sha1-mb-x86_64.pl b/crypto/sha/asm/sha1-mb-x86_64.pl

index 93996e15f96e93746c32e23170f8409cbbe0b96c..33e6620c689f4dadd5e3f11809db89c6ffccce9a 100644 (file)
--- a/crypto/sha/asm/sha1-mb-x86_64.pl
+++ b/crypto/sha/asm/sha1-mb-x86_64.pl
@@ -14,20 +14,21 @@
  #
  #              this    +aesni(i)       sha1    aesni-sha1      gain(iv)
  # -------------------------------------------------------------------
-# Westmere(ii) 10.4/n  +1.28=3.88(n=4) 5.44    6.58            +70%
-# Atom(ii)     18.9/n  +3.93=8.66(n=4) 10.0    14.0            +62%
+# Westmere(ii) 10.7/n  +1.28=3.96(n=4) 5.30    6.66            +68%
+# Atom(ii)     18.9?/n +3.93=8.66(n=4) 10.0    14.0            +62%
  # Sandy Bridge (8.16   +5.15=13.3)/n   4.99    5.98            +80%
-# Ivy Bridge   (8.03   +5.14=13.2)/n   4.60    5.54            +68%
+# Ivy Bridge   (8.08   +5.14=13.2)/n   4.60    5.54            +68%
  # Haswell(iii) (8.96   +5.00=14.0)/n   3.57    4.55            +160%
-# Bulldozer    (9.75   +5.76=15.5)/n   5.95    6.37            +64%
+# Bulldozer    (9.76   +5.76=15.5)/n   5.95    6.37            +64%
  #
  # (i)  multi-block CBC encrypt with 128-bit key;
  # (ii) (HASH+AES)/n does not apply to Westmere for n>3 and Atom,
  #      because of lower AES-NI instruction throughput;
  # (iii)        "this" is for n=8, when we gather twice as much data, result
-#      for n=4 is 7.98+4.44=12.4;
-# (iv) improvement coefficients in real-life application are somewhat
-#      lower and range from 30% to 100% (on Haswell);
+#      for n=4 is 8.00+4.44=12.4;
+# (iv) presented improvement coefficients are asymptotic limits and
+#      in real-life application are somewhat lower, e.g. for 2KB
+#      fragments they range from 30% to 100% (on Haswell);
  
  $flavour = shift;
  $output  = shift;
@@ -80,6 +81,14 @@ $Tbl="%rbp";
  @Xi=map("%xmm$_",(10..14));
  $K="%xmm15";
  
+if (1) {
+    # Atom-specific optimization aiming to eliminate pshufb with high
+    # registers [and thus get rid of 48 cycles accumulated penalty] 
+    @Xi=map("%xmm$_",(0..4));
+    ($tx,$t0,$t1,$t2,$t3)=map("%xmm$_",(5..9));
+    @V=($A,$B,$C,$D,$E)=map("%xmm$_",(10..14));
+}
+
  $REG_SZ=16;
  
  sub Xi_off {
@@ -139,8 +148,8 @@ $code.=<<___ if ($i<14);                    # just load input
  
         psrld   \$2,$b
         paddd   $t2,$e                          # e+=rol(a,5)
-        movd           `4*$j-16*4`(@ptr[2]),$t2
          pshufb $tx,@Xi[1]
+        movd           `4*$j-16*4`(@ptr[2]),$t2
         por     $t1,$b                          # b=rol(b,30)
  ___
  $code.=<<___ if ($i==14);                      # just load input
@@ -152,6 +161,7 @@ $code.=<<___ if ($i==14);                   # just load input
         movdqa  $b,$t1
         movdqa  $b,$t0
         pslld   \$5,$t2
+        prefetcht0     63(@ptr[0])
         pandn   $d,$t1
         pand    $c,$t0
          punpckldq      $t3,@Xi[1]
@@ -162,14 +172,17 @@ $code.=<<___ if ($i==14);                 # just load input
         psrld   \$27,$t3
         pxor    $t1,$t0                         # Ch(b,c,d)
         movdqa  $b,$t1
+        prefetcht0     63(@ptr[1])
  
         por     $t3,$t2                         # rol(a,5)
         pslld   \$30,$t1
         paddd   $t0,$e                          # e+=Ch(b,c,d)
+        prefetcht0     63(@ptr[2])
  
         psrld   \$2,$b
         paddd   $t2,$e                          # e+=rol(a,5)
          pshufb $tx,@Xi[1]
+        prefetcht0     63(@ptr[3])
         por     $t1,$b                          # b=rol(b,30)
  ___
  $code.=<<___ if ($i>=13 && $i<15);
@@ -382,12 +395,12 @@ $code.=<<___;
         movdqu  0x60($ctx),$D
         movdqu  0x80($ctx),$E
         movdqa  0x60($Tbl),$tx                  # pbswap_mask
+       movdqa  -0x20($Tbl),$K                  # K_00_19
         jmp     .Loop
  
  .align 32
  .Loop:
  ___
-$code.="       movdqa  -0x20($Tbl),$K\n";      # K_00_19
  for($i=0;$i<20;$i++)   { &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
  $code.="       movdqa  0x00($Tbl),$K\n";       # K_20_39
  for(;$i<40;$i++)       { &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
@@ -434,6 +447,7 @@ $code.=<<___;
  
         movdqa  @Xi[0],(%rbx)                   # save counters
         movdqa  0x60($Tbl),$tx                  # pbswap_mask
+       movdqa  -0x20($Tbl),$K                  # K_00_19
         dec     $num
         jnz     .Loop
  
@@ -551,6 +565,7 @@ $code.=<<___ if ($i<14);
  ___
  $code.=<<___ if ($i==14);
         vpaddd  $K,$e,$e                        # e+=K_00_19
+        prefetcht0     63(@ptr[0])
         vpslld  \$5,$a,$t2
         vpandn  $d,$b,$t1
         vpand   $c,$b,$t0
@@ -559,14 +574,17 @@ $code.=<<___ if ($i==14);
         vpaddd  @Xi[0],$e,$e                    # e+=X[i]
          $vpack         $t3,@Xi[1],@Xi[1]
         vpsrld  \$27,$a,$t3
+        prefetcht0     63(@ptr[1])
         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
  
         vpslld  \$30,$b,$t1
         vpor    $t3,$t2,$t2                     # rol(a,5)
+        prefetcht0     63(@ptr[2])
         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
  
         vpsrld  \$2,$b,$b
         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
+        prefetcht0     63(@ptr[3])
          vpshufb        $tx,@Xi[1],@Xi[1]
         vpor    $t1,$b,$b                       # b=rol(b,30)
  ___
@@ -580,6 +598,7 @@ $code.=<<___ if ($i>=15);                   # apply Xupdate
         vpaddd  $K,$e,$e                        # e+=K_00_19
         vpslld  \$5,$a,$t2
         vpandn  $d,$b,$t1
+        `"prefetcht0   63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
         vpand   $c,$b,$t0
  
         vmovdqa @Xi[0],`&Xi_off($i)`
@@ -588,14 +607,17 @@ $code.=<<___ if ($i>=15);                 # apply Xupdate
         vpsrld  \$27,$a,$t3
         vpxor   $t1,$t0,$t0                     # Ch(b,c,d)
          vpxor  @Xi[3],@Xi[1],@Xi[1]
+        `"prefetcht0   63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
  
         vpslld  \$30,$b,$t1
         vpor    $t3,$t2,$t2                     # rol(a,5)
         vpaddd  $t0,$e,$e                       # e+=Ch(b,c,d)
+        `"prefetcht0   63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
          vpsrld \$31,@Xi[1],$tx
          vpaddd @Xi[1],@Xi[1],@Xi[1]
  
         vpsrld  \$2,$b,$b
+        `"prefetcht0   63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
         vpaddd  $t2,$e,$e                       # e+=rol(a,5)
          vpor   $tx,@Xi[1],@Xi[1]               # rol   \$1,@Xi[1]
         vpor    $t1,$b,$b                       # b=rol(b,30)
diff --git a/crypto/sha/asm/sha256-mb-x86_64.pl b/crypto/sha/asm/sha256-mb-x86_64.pl

index 2e4b102f52829a41ff6811ad30bb4eadd3c224ed..e86f0bc7fa9da335ae66289d02b04c38ede8ecf1 100644 (file)
--- a/crypto/sha/asm/sha256-mb-x86_64.pl
+++ b/crypto/sha/asm/sha256-mb-x86_64.pl
@@ -15,7 +15,7 @@
  #              this    +aesni(i)       sha256  aesni-sha256    gain(iv)
  # -------------------------------------------------------------------
  # Westmere(ii) 23.3/n  +1.28=7.11(n=4) 12.3    +3.75=16.1      +126%
-# Atom(ii)     39.1/n  +3.93=13.7(n=4) 20.8    +5.69=26.5      +93%
+# Atom(ii)     ?39.1/n +3.93=13.7(n=4) 20.8    +5.69=26.5      +93%
  # Sandy Bridge (20.5   +5.15=25.7)/n   11.6    13.0            +103%
  # Ivy Bridge   (20.4   +5.14=25.5)/n   10.3    11.6            +82%
  # Haswell(iii) (21.0   +5.00=26.0)/n   7.80    8.79            +170%
@@ -27,8 +27,9 @@
  #      AES-NI-SHA256 stitch for these processors;
  # (iii)        "this" is for n=8, when we gather twice as much data, result
  #      for n=4 is 20.3+4.44=24.7;
-# (iv) improvement coefficients in real-life application are somewhat
-#      lower and range from 75% to 130% (on Haswell);
+# (iv) presented improvement coefficients are asymptotic limits and
+#      in real-life application are somewhat lower, e.g. for 2KB 
+#      fragments they range from 75% to 130% (on Haswell);
  
  $flavour = shift;
  $output  = shift;
@@ -135,6 +136,7 @@ $code.=<<___;
  
         psrld   \$25-11,$t2
          movdqa $e,$t1
+        `"prefetch     63(@ptr[0])"            if ($i==15)`
         pxor    $t3,$sigma
          movdqa $e,$axb                         # borrow $axb
         pslld   \$26-21,$t3
@@ -142,6 +144,7 @@ $code.=<<___;
          pand   $f,$axb
         pxor    $t2,$sigma
  
+        `"prefetch     63(@ptr[1])"            if ($i==15)`
         movdqa  $a,$t2
         pxor    $t3,$sigma                      # Sigma1(e)
         movdqa  $a,$t3
@@ -153,6 +156,7 @@ $code.=<<___;
         pslld   \$10,$t3
          pxor   $a,$axb                         # a^b, b^c in next round
  
+        `"prefetch     63(@ptr[2])"            if ($i==15)`
         psrld   \$13,$sigma
         pxor    $t3,$t2
          paddd  $t1,$Xi                         # Xi+=Ch(e,f,g)
@@ -160,6 +164,7 @@ $code.=<<___;
          pand   $axb,$bxc
         pxor    $sigma,$t2
  
+        `"prefetch     63(@ptr[3])"            if ($i==15)`
         psrld   \$22-13,$sigma
         pxor    $t3,$t2
          movdqa $b,$h
@@ -465,30 +470,38 @@ $code.=<<___;
  
         vpsrld  \$25,$e,$t2
         vpxor   $t3,$sigma,$sigma
+        `"prefetch     63(@ptr[0])"            if ($i==15)`
         vpslld  \$7,$e,$t3
          vpandn $g,$e,$t1
          vpand  $f,$e,$axb                      # borrow $axb
+        `"prefetch     63(@ptr[1])"            if ($i==15)`
         vpxor   $t2,$sigma,$sigma
  
         vpsrld  \$2,$a,$h                       # borrow $h
         vpxor   $t3,$sigma,$sigma               # Sigma1(e)
+        `"prefetch     63(@ptr[2])"            if ($i==15)`
         vpslld  \$30,$a,$t2
          vpxor  $axb,$t1,$t1                    # Ch(e,f,g)
          vpxor  $a,$b,$axb                      # a^b, b^c in next round
+        `"prefetch     63(@ptr[3])"            if ($i==15)`
         vpxor   $t2,$h,$h
         vpaddd  $sigma,$Xi,$Xi                  # Xi+=Sigma1(e)
  
         vpsrld  \$13,$a,$t2
+        `"prefetch     63(@ptr[4])"            if ($i==15 && $REG_SZ==32)`
         vpslld  \$19,$a,$t3
          vpaddd $t1,$Xi,$Xi                     # Xi+=Ch(e,f,g)
          vpand  $axb,$bxc,$bxc
+        `"prefetch     63(@ptr[5])"            if ($i==15 && $REG_SZ==32)`
         vpxor   $t2,$h,$sigma
  
         vpsrld  \$22,$a,$t2
         vpxor   $t3,$sigma,$sigma
+        `"prefetch     63(@ptr[6])"            if ($i==15 && $REG_SZ==32)`
         vpslld  \$10,$a,$t3
          vpxor  $bxc,$b,$h                      # h=Maj(a,b,c)=Ch(a^b,c,b)
          vpaddd $Xi,$d,$d                       # d+=Xi
+        `"prefetch     63(@ptr[7])"            if ($i==15 && $REG_SZ==32)`
         vpxor   $t2,$sigma,$sigma
         vpxor   $t3,$sigma,$sigma               # Sigma0(a)
author	Andy Polyakov <appro@openssl.org>
	Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)
committer	Andy Polyakov <appro@openssl.org>
	Wed, 5 Feb 2014 13:03:35 +0000 (14:03 +0100)
crypto/aes/asm/aesni-mb-x86_64.pl		patch \| blob \| history
crypto/sha/asm/sha1-mb-x86_64.pl		patch \| blob \| history
crypto/sha/asm/sha256-mb-x86_64.pl		patch \| blob \| history