crypto/poly1305/asm: chase overflow bit on x86 and ARM platforms.

author Andy Polyakov <appro@openssl.org>

Fri, 15 Apr 2016 14:30:29 +0000 (16:30 +0200)

committer Andy Polyakov <appro@openssl.org>

Mon, 25 Apr 2016 20:56:09 +0000 (22:56 +0200)
author Andy Polyakov <appro@openssl.org>
Fri, 15 Apr 2016 14:30:29 +0000 (16:30 +0200)
committer Andy Polyakov <appro@openssl.org>
Mon, 25 Apr 2016 20:56:09 +0000 (22:56 +0200)
diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl

index aa3f2280c6481e168ce36b4c57943b5a3b36cacf..95e213329a22862d10cdae87b8d2b581582404d7 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -10,7 +10,7 @@
  #                      IALU(*)/gcc-4.4         NEON
  #
  # ARM11xx(ARMv6)       7.78/+100%              -
  #                      IALU(*)/gcc-4.4         NEON
  #
  # ARM11xx(ARMv6)       7.78/+100%              -
-# Cortex-A5            6.35/+130%              2.96
+# Cortex-A5            6.35/+130%              3.00
  # Cortex-A8            6.25/+115%              2.36
  # Cortex-A9            5.10/+95%               2.55
  # Cortex-A15           3.85/+85%               1.25(**)
  # Cortex-A8            6.25/+115%              2.36
  # Cortex-A9            5.10/+95%               2.55
  # Cortex-A15           3.85/+85%               1.25(**)
@@ -523,6 +523,51 @@ poly1305_init_neon:
         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
         @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
         @ and P. Schwabe
         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
         @ lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
         @ and P. Schwabe
+       @
+       @ H0>>+H1>>+H2>>+H3>>+H4
+       @ H3>>+H4>>*5+H0>>+H1
+       @
+       @ Trivia.
+       @
+       @ Result of multiplication of n-bit number by m-bit number is
+       @ n+m bits wide. However! Even though 2^n is a n+1-bit number,
+       @ m-bit number multiplied by 2^n is still n+m bits wide.
+       @
+       @ Sum of two n-bit numbers is n+1 bits wide, sum of three - n+2,
+       @ and so is sum of four. Sum of 2^m n-m-bit numbers and n-bit
+       @ one is n+1 bits wide.
+       @
+       @ >>+ denotes Hnext += Hn>>26, Hn &= 0x3ffffff. This means that
+       @ H0, H2, H3 are guaranteed to be 26 bits wide, while H1 and H4
+       @ can be 27. However! In cases when their width exceeds 26 bits
+       @ they are limited by 2^26+2^6. This in turn means that *sum*
+       @ of the products with these values can still be viewed as sum
+       @ of 52-bit numbers as long as the amount of addends is not a
+       @ power of 2. For example,
+       @
+       @ H4 = H4*R0 + H3*R1 + H2*R2 + H1*R3 + H0 * R4,
+       @
+       @ which can't be larger than 5 * (2^26 + 2^6) * (2^26 + 2^6), or
+       @ 5 * (2^52 + 2*2^32 + 2^12), which in turn is smaller than
+       @ 8 * (2^52) or 2^55. However, the value is then multiplied by
+       @ by 5, so we should be looking at 5 * 5 * (2^52 + 2^33 + 2^12),
+       @ which is less than 32 * (2^52) or 2^57. And when processing
+       @ data we are looking at triple as many addends...
+       @
+       @ In key setup procedure pre-reduced H0 is limited by 5*4+1 and
+       @ 5*H4 - by 5*5 52-bit addends, or 57 bits. But when hashing the
+       @ input H0 is limited by (5*4+1)*3 addends, or 58 bits, while
+       @ 5*H4 by 5*5*3, or 59[!] bits. How is this relevant? vmlal.u32
+       @ instruction accepts 2x32-bit input and writes 2x64-bit result.
+       @ This means that result of reduction have to be compressed upon
+       @ loop wrap-around. This can be done in the process of reduction
+       @ to minimize amount of instructions [as well as amount of
+       @ 128-bit instructions, which benefits low-end processors], but
+       @ one has to watch for H2 (which is narrower than H0) and 5*H4
+       @ not being wider than 58 bits, so that result of right shift
+       @ by 26 bits fits in 32 bits. This is also useful on x86,
+       @ because it allows to use paddd in place for paddq, which
+       @ benefits Atom, where paddq is ridiculously slow.
  
         vshr.u64        $T0,$D3,#26
         vmovn.i64       $D3#lo,$D3
  
         vshr.u64        $T0,$D3,#26
         vmovn.i64       $D3#lo,$D3
@@ -887,7 +932,8 @@ poly1305_blocks_neon:
  # endif
  
         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
  # endif
  
         @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ lazy reduction interleaved with base 2^32 -> base 2^26
+       @ lazy reduction interleaved with base 2^32 -> base 2^26 of
+       @ inp[0:3] previously loaded to $H0-$H3 and smashed to $H0-$H4.
  
         vshr.u64        $T0,$D3,#26
         vmovn.i64       $D3#lo,$D3
  
         vshr.u64        $T0,$D3,#26
         vmovn.i64       $D3#lo,$D3
@@ -915,19 +961,20 @@ poly1305_blocks_neon:
           vbic.i32      $H3,#0xfc000000
          vshrn.u64      $T1#lo,$D2,#26
          vmovn.i64      $D2#lo,$D2
           vbic.i32      $H3,#0xfc000000
          vshrn.u64      $T1#lo,$D2,#26
          vmovn.i64      $D2#lo,$D2
-       vadd.i32        $D0#lo,$D0#lo,$T0#lo    @ h4 -> h0
+       vaddl.u32       $D0,$D0#lo,$T0#lo       @ h4 -> h0 [widen for a sec]
           vsri.u32      $H2,$H1,#20
          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
           vshl.u32      $H1,$H1,#6
          vbic.i32       $D2#lo,#0xfc000000
           vbic.i32      $H2,#0xfc000000
  
           vsri.u32      $H2,$H1,#20
          vadd.i32       $D3#lo,$D3#lo,$T1#lo    @ h2 -> h3
           vshl.u32      $H1,$H1,#6
          vbic.i32       $D2#lo,#0xfc000000
           vbic.i32      $H2,#0xfc000000
  
-       vshr.u32        $T0#lo,$D0#lo,#26
-       vbic.i32        $D0#lo,#0xfc000000
+       vshrn.u64       $T0#lo,$D0,#26          @ re-narrow
+       vmovn.i64       $D0#lo,$D0
           vsri.u32      $H1,$H0,#26
           vbic.i32      $H0,#0xfc000000
          vshr.u32       $T1#lo,$D3#lo,#26
          vbic.i32       $D3#lo,#0xfc000000
           vsri.u32      $H1,$H0,#26
           vbic.i32      $H0,#0xfc000000
          vshr.u32       $T1#lo,$D3#lo,#26
          vbic.i32       $D3#lo,#0xfc000000
+       vbic.i32        $D0#lo,#0xfc000000
         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
           vbic.i32      $H1,#0xfc000000
         vadd.i32        $D1#lo,$D1#lo,$T0#lo    @ h0 -> h1
          vadd.i32       $D4#lo,$D4#lo,$T1#lo    @ h3 -> h4
           vbic.i32      $H1,#0xfc000000
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl

index 2e1dae3df238d157a23215ec7d88885cf105c11b..928d3cf4fa9391ecec81af74bcc73d024cb183cd 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -19,7 +19,7 @@
  # Cortex-A53   2.69/+58%       1.47
  # Cortex-A57   2.70/+7%        1.14
  # Denver       1.64/+50%       1.18(*)
  # Cortex-A53   2.69/+58%       1.47
  # Cortex-A57   2.70/+7%        1.14
  # Denver       1.64/+50%       1.18(*)
-# X-Gene       2.13/+68%       2.19
+# X-Gene       2.13/+68%       2.27
  #
  # (*)  estimate based on resources availability is less than 1.0,
  #      i.e. measured result is worse than expected, presumably binary
  #
  # (*)  estimate based on resources availability is less than 1.0,
  #      i.e. measured result is worse than expected, presumably binary
@@ -507,9 +507,11 @@ poly1305_blocks_neon:
         fmov    $IN01_1,x6
         add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
         add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
         fmov    $IN01_1,x6
         add     x10,x10,x11,lsl#32      // bfi  x10,x11,#32,#32
         add     x12,x12,x13,lsl#32      // bfi  x12,x13,#32,#32
+       movi    $MASK.2d,#-1
         fmov    $IN01_2,x8
         fmov    $IN01_3,x10
         fmov    $IN01_4,x12
         fmov    $IN01_2,x8
         fmov    $IN01_3,x10
         fmov    $IN01_4,x12
+       ushr    $MASK.2d,$MASK.2d,#38
  
         b.ls    .Lskip_loop
  
  
         b.ls    .Lskip_loop
  
@@ -660,41 +662,43 @@ poly1305_blocks_neon:
          fmov   $IN01_2,x8
         umlal   $ACC2,$IN01_4,${S3}[0]
          fmov   $IN01_3,x10
          fmov   $IN01_2,x8
         umlal   $ACC2,$IN01_4,${S3}[0]
          fmov   $IN01_3,x10
+        fmov   $IN01_4,x12
  
         /////////////////////////////////////////////////////////////////
         // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
  
         /////////////////////////////////////////////////////////////////
         // lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
-        // and P. Schwabe
+       // and P. Schwabe
+       //
+       // [see discussion in poly1305-armv4 module]
  
         ushr    $T0.2d,$ACC3,#26
  
         ushr    $T0.2d,$ACC3,#26
-        fmov   $IN01_4,x12
         xtn     $H3,$ACC3
          ushr   $T1.2d,$ACC0,#26
         xtn     $H3,$ACC3
          ushr   $T1.2d,$ACC0,#26
-        xtn    $H0,$ACC0
+        and    $ACC0,$ACC0,$MASK.2d
         add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
         bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
          add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
         add     $ACC4,$ACC4,$T0.2d      // h3 -> h4
         bic     $H3,#0xfc,lsl#24        // &=0x03ffffff
          add    $ACC1,$ACC1,$T1.2d      // h0 -> h1
-        bic    $H0,#0xfc,lsl#24
  
  
-       shrn    $T0.2s,$ACC4,#26
+       ushr    $T0.2d,$ACC4,#26
         xtn     $H4,$ACC4
          ushr   $T1.2d,$ACC1,#26
          xtn    $H1,$ACC1
         xtn     $H4,$ACC4
          ushr   $T1.2d,$ACC1,#26
          xtn    $H1,$ACC1
-        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
         bic     $H4,#0xfc,lsl#24
         bic     $H4,#0xfc,lsl#24
-        bic    $H1,#0xfc,lsl#24
+        add    $ACC2,$ACC2,$T1.2d      // h1 -> h2
  
  
-       add     $H0,$H0,$T0.2s
-       shl     $T0.2s,$T0.2s,#2
+       add     $ACC0,$ACC0,$T0.2d
+       shl     $T0.2d,$T0.2d,#2
          shrn   $T1.2s,$ACC2,#26
          xtn    $H2,$ACC2
          shrn   $T1.2s,$ACC2,#26
          xtn    $H2,$ACC2
-       add     $H0,$H0,$T0.2s          // h4 -> h0
+       add     $ACC0,$ACC0,$T0.2d      // h4 -> h0
+        bic    $H1,#0xfc,lsl#24
          add    $H3,$H3,$T1.2s          // h2 -> h3
          bic    $H2,#0xfc,lsl#24
  
          add    $H3,$H3,$T1.2s          // h2 -> h3
          bic    $H2,#0xfc,lsl#24
  
-       ushr    $T0.2s,$H0,#26
-       bic     $H0,#0xfc,lsl#24
+       shrn    $T0.2s,$ACC0,#26
+       xtn     $H0,$ACC0
          ushr   $T1.2s,$H3,#26
          bic    $H3,#0xfc,lsl#24
          ushr   $T1.2s,$H3,#26
          bic    $H3,#0xfc,lsl#24
+        bic    $H0,#0xfc,lsl#24
         add     $H1,$H1,$T0.2s          // h0 -> h1
          add    $H4,$H4,$T1.2s          // h3 -> h4
  
         add     $H1,$H1,$T0.2s          // h0 -> h1
          add    $H4,$H4,$T1.2s          // h3 -> h4
  
@@ -702,9 +706,7 @@ poly1305_blocks_neon:
  
  .Lskip_loop:
         dup     $IN23_2,${IN23_2}[0]
  
  .Lskip_loop:
         dup     $IN23_2,${IN23_2}[0]
-       movi    $MASK.2d,#-1
         add     $IN01_2,$IN01_2,$H2
         add     $IN01_2,$IN01_2,$H2
-       ushr    $MASK.2d,$MASK.2d,#38
  
         ////////////////////////////////////////////////////////////////
         // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
  
         ////////////////////////////////////////////////////////////////
         // multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl

index 97d0a81bea4377889cbf01dfe61ae8762a76960d..cf521c26868da5ec386a3f17b4bbaaa157d8df2b 100755 (executable)
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -541,11 +541,12 @@ my $base = shift; $base = "esp" if (!defined($base));
  
  sub lazy_reduction {
  my $extra = shift;
  
  sub lazy_reduction {
  my $extra = shift;
-my $paddx = defined($extra) ? paddq : paddd;
  
         ################################################################
         # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
         # and P. Schwabe
  
         ################################################################
         # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
         # and P. Schwabe
+       #
+       # [(*) see discussion in poly1305-armv4 module]
  
          &movdqa        ($T0,$D3);
          &pand          ($D3,$MASK);
  
          &movdqa        ($T0,$D3);
          &pand          ($D3,$MASK);
@@ -567,7 +568,7 @@ my $paddx = defined($extra) ? paddq : paddd;
                                                         # on Atom
          &psllq         ($T0,2);
         &paddq          ($T1,$D2);                      # h1 -> h2
                                                         # on Atom
          &psllq         ($T0,2);
         &paddq          ($T1,$D2);                      # h1 -> h2
-        &$paddx        ($T0,$D0);                      # h4 -> h0
+        &paddq         ($T0,$D0);                      # h4 -> h0 (*)
         &pand           ($D1,$MASK);
         &movdqa         ($D2,$T1);
         &psrlq          ($T1,26);
         &pand           ($D1,$MASK);
         &movdqa         ($D2,$T1);
         &psrlq          ($T1,26);
diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c

index 6bec8b30f885fd04c6afe3694590812efc76457c..2a766b3295529d2b35c13b6b87cff8af09165afa 100644 (file)
--- a/crypto/poly1305/poly1305.c
+++ b/crypto/poly1305/poly1305.c
@@ -590,7 +590,8 @@ static const struct poly1305_test poly1305_tests[] = {
       "5154ad0d2cb26e01274fc51148491f1b"
      },
      /*
       "5154ad0d2cb26e01274fc51148491f1b"
      },
      /*
-     * self-generated
+     * self-generated vectors exercise "significant" lengths, such that
+     * are handled by different code paths
       */
      {
       "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
       */
      {
       "ab0812724a7f1e342742cbed374d94d136c6b8795d45b3819830f2c04491faf0"
@@ -672,6 +673,21 @@ static const struct poly1305_test poly1305_tests[] = {
       "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
       "b846d44e9bbd53cedffbfbb6b7fa4933"
      },
       "12976a08c4426d0ce8a82407c4f48207""80f8c20aa71202d1e29179cbcb555a57",
       "b846d44e9bbd53cedffbfbb6b7fa4933"
      },
+    /*
+     * 4th power of the key spills to 131th bit in SIMD key setup
+     */
+    {
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
+     "ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff",
+     "ad628107e8351d0f2c231a05dc4a4106""00000000000000000000000000000000",
+     "07145a4c02fe5fa32036de68fabe9066"
+    },
      {
      /*
       * poly1305_ieee754.c failed this in final stage
      {
      /*
       * poly1305_ieee754.c failed this in final stage
author	Andy Polyakov <appro@openssl.org>
	Fri, 15 Apr 2016 14:30:29 +0000 (16:30 +0200)
committer	Andy Polyakov <appro@openssl.org>
	Mon, 25 Apr 2016 20:56:09 +0000 (22:56 +0200)
crypto/poly1305/asm/poly1305-armv4.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-armv8.pl		patch \| blob \| history
crypto/poly1305/asm/poly1305-x86.pl		patch \| blob \| history
crypto/poly1305/poly1305.c		patch \| blob \| history