poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.
author Andy Polyakov <appro@openssl.org>
Sun, 28 Feb 2016 20:48:43 +0000 (21:48 +0100)
committer Andy Polyakov <appro@openssl.org>
Wed, 2 Mar 2016 12:11:38 +0000 (13:11 +0100)
Formally, only the 32-bit AVX2 code path needs this change (performing the
horizontal addition before the lazy reduction), but I chose to harmonize
all vector code paths.

RT#4346
Reviewed-by: Richard Levitte <levitte@openssl.org>
crypto/poly1305/asm/poly1305-armv4.pl
crypto/poly1305/asm/poly1305-armv8.pl
crypto/poly1305/asm/poly1305-x86.pl
crypto/poly1305/asm/poly1305-x86_64.pl
crypto/poly1305/poly1305.c

index 86a6070..06301aa 100755 (executable)
@@ -1057,6 +1057,15 @@ poly1305_blocks_neon:
 
 .Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 
 .Lshort_tail:
        @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+       @ horizontal addition
+
+       vadd.i64        $D3#lo,$D3#lo,$D3#hi
+       vadd.i64        $D0#lo,$D0#lo,$D0#hi
+       vadd.i64        $D4#lo,$D4#lo,$D4#hi
+       vadd.i64        $D1#lo,$D1#lo,$D1#hi
+       vadd.i64        $D2#lo,$D2#lo,$D2#hi
+
+       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
        @ lazy reduction, but without narrowing
 
        vshr.u64        $T0,$D3,#26
        @ lazy reduction, but without narrowing
 
        vshr.u64        $T0,$D3,#26
@@ -1086,15 +1095,6 @@ poly1305_blocks_neon:
        vadd.i64        $D1,$D1,$T0             @ h0 -> h1
         vadd.i64       $D4,$D4,$T1             @ h3 -> h4
 
        vadd.i64        $D1,$D1,$T0             @ h0 -> h1
         vadd.i64       $D4,$D4,$T1             @ h3 -> h4
 
-       @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-       @ horizontal addition
-
-       vadd.i64        $D2#lo,$D2#lo,$D2#hi
-       vadd.i64        $D0#lo,$D0#lo,$D0#hi
-       vadd.i64        $D3#lo,$D3#lo,$D3#hi
-       vadd.i64        $D1#lo,$D1#lo,$D1#hi
-       vadd.i64        $D4#lo,$D4#lo,$D4#hi
-
        cmp             $len,#0
        bne             .Leven
 
        cmp             $len,#0
        bne             .Leven
 
index 79185d2..f1359fd 100755 (executable)
@@ -791,6 +791,19 @@ poly1305_blocks_neon:
 
 .Lshort_tail:
        ////////////////////////////////////////////////////////////////
 
 .Lshort_tail:
        ////////////////////////////////////////////////////////////////
+       // horizontal add
+
+       addp    $ACC3,$ACC3,$ACC3
+        ldp    d8,d9,[sp,#16]          // meet ABI requirements
+       addp    $ACC0,$ACC0,$ACC0
+        ldp    d10,d11,[sp,#32]
+       addp    $ACC4,$ACC4,$ACC4
+        ldp    d12,d13,[sp,#48]
+       addp    $ACC1,$ACC1,$ACC1
+        ldp    d14,d15,[sp,#64]
+       addp    $ACC2,$ACC2,$ACC2
+
+       ////////////////////////////////////////////////////////////////
        // lazy reduction, but without narrowing
 
        ushr    $T0.2d,$ACC3,#26
        // lazy reduction, but without narrowing
 
        ushr    $T0.2d,$ACC3,#26
@@ -822,19 +835,6 @@ poly1305_blocks_neon:
         add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
 
        ////////////////////////////////////////////////////////////////
         add    $ACC4,$ACC4,$T1.2d      // h3 -> h4
 
        ////////////////////////////////////////////////////////////////
-       // horizontal add
-
-       addp    $ACC2,$ACC2,$ACC2
-        ldp    d8,d9,[sp,#16]          // meet ABI requirements
-       addp    $ACC0,$ACC0,$ACC0
-        ldp    d10,d11,[sp,#32]
-       addp    $ACC1,$ACC1,$ACC1
-        ldp    d12,d13,[sp,#48]
-       addp    $ACC3,$ACC3,$ACC3
-        ldp    d14,d15,[sp,#64]
-       addp    $ACC4,$ACC4,$ACC4
-
-       ////////////////////////////////////////////////////////////////
        // write the result, can be partially reduced
 
        st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
        // write the result, can be partially reduced
 
        st4     {$ACC0,$ACC1,$ACC2,$ACC3}[0],[$ctx],#16
index 7c1aee5..fb9fa2b 100755 (executable)
@@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base));
                             },"edx");
 
 sub lazy_reduction {
                             },"edx");
 
 sub lazy_reduction {
+my $extra = shift;
+
        ################################################################
        # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        # and P. Schwabe
        ################################################################
        # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
        # and P. Schwabe
@@ -543,6 +545,7 @@ sub lazy_reduction {
         &movdqa        ($T0,$D3);
         &pand          ($D3,$MASK);
         &psrlq         ($T0,26);
         &movdqa        ($T0,$D3);
         &pand          ($D3,$MASK);
         &psrlq         ($T0,26);
+        &$extra        ()                              if (defined($extra));
         &paddq         ($T0,$D4);                      # h3 -> h4
        &movdqa         ($T1,$D0);
        &pand           ($D0,$MASK);
         &paddq         ($T0,$D4);                      # h3 -> h4
        &movdqa         ($T1,$D0);
        &pand           ($D0,$MASK);
@@ -1091,21 +1094,21 @@ my $addr = shift;
 
 &set_label("short_tail");
 
 
 &set_label("short_tail");
 
-       &lazy_reduction ();
-
        ################################################################
        # horizontal addition
 
        ################################################################
        # horizontal addition
 
+       &pshufd         ($T1,$D4,0b01001110);
+       &pshufd         ($T0,$D3,0b01001110);
+       &paddq          ($D4,$T1);
+       &paddq          ($D3,$T0);
        &pshufd         ($T1,$D0,0b01001110);
        &pshufd         ($T0,$D1,0b01001110);
        &pshufd         ($T1,$D0,0b01001110);
        &pshufd         ($T0,$D1,0b01001110);
-       &paddd          ($D0,$T1);
+       &paddq          ($D0,$T1);
+       &paddq          ($D1,$T0);
        &pshufd         ($T1,$D2,0b01001110);
        &pshufd         ($T1,$D2,0b01001110);
-       &paddd          ($D1,$T0);
-       &pshufd         ($T0,$D3,0b01001110);
-       &paddd          ($D2,$T1);
-       &pshufd         ($T1,$D4,0b01001110);
-       &paddd          ($D3,$T0);
-       &paddd          ($D4,$T1);
+       #&paddq         ($D2,$T1);
+
+       &lazy_reduction (sub { &paddq ($D2,$T1) });
 
 &set_label("done");
        &movd           (&DWP(-16*3+4*0,"edi"),$D0);    # store hash value
 
 &set_label("done");
        &movd           (&DWP(-16*3+4*0,"edi"),$D0);    # store hash value
@@ -1113,8 +1116,8 @@ my $addr = shift;
        &movd           (&DWP(-16*3+4*2,"edi"),$D2);
        &movd           (&DWP(-16*3+4*3,"edi"),$D3);
        &movd           (&DWP(-16*3+4*4,"edi"),$D4);
        &movd           (&DWP(-16*3+4*2,"edi"),$D2);
        &movd           (&DWP(-16*3+4*3,"edi"),$D3);
        &movd           (&DWP(-16*3+4*4,"edi"),$D4);
-&set_label("nodata");
        &mov    ("esp","ebp");
        &mov    ("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_sse2");
 
 &align (32);
 &function_end("_poly1305_blocks_sse2");
 
 &align (32);
@@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
        &test   ("eax","eax");                          # is_base2_26?
        &jz     (&label("enter_blocks"));
 
        &test   ("eax","eax");                          # is_base2_26?
        &jz     (&label("enter_blocks"));
 
-&set_label("enter_avx2",16);
+&set_label("enter_avx2");
        &vzeroupper     ();
 
        &call   (&label("pic_point"));
        &vzeroupper     ();
 
        &call   (&label("pic_point"));
@@ -1731,31 +1734,31 @@ sub vlazy_reduction {
 
        &vpmuladd       (sub {  my $i=shift; &QWP(4+32*$i-128,"edx");   });
 
 
        &vpmuladd       (sub {  my $i=shift; &QWP(4+32*$i-128,"edx");   });
 
-       &vlazy_reduction();
-
        ################################################################
        # horizontal addition
 
        ################################################################
        # horizontal addition
 
+       &vpsrldq        ($T0,$D4,8);
+       &vpsrldq        ($T1,$D3,8);
+       &vpaddq         ($D4,$D4,$T0);
        &vpsrldq        ($T0,$D0,8);
        &vpsrldq        ($T0,$D0,8);
+       &vpaddq         ($D3,$D3,$T1);
        &vpsrldq        ($T1,$D1,8);
        &vpaddq         ($D0,$D0,$T0);
        &vpsrldq        ($T0,$D2,8);
        &vpaddq         ($D1,$D1,$T1);
        &vpsrldq        ($T1,$D1,8);
        &vpaddq         ($D0,$D0,$T0);
        &vpsrldq        ($T0,$D2,8);
        &vpaddq         ($D1,$D1,$T1);
-       &vpsrldq        ($T1,$D3,8);
+       &vpermq         ($T1,$D4,2);                    # keep folding
        &vpaddq         ($D2,$D2,$T0);
        &vpaddq         ($D2,$D2,$T0);
-       &vpsrldq        ($T0,$D4,8);
-       &vpaddq         ($D3,$D3,$T1);
-       &vpermq         ($T1,$D0,2);                    # keep folding
-       &vpaddq         ($D4,$D4,$T0);
+       &vpermq         ($T0,$D3,2);
+       &vpaddq         ($D4,$D4,$T1);
+       &vpermq         ($T1,$D0,2);
+       &vpaddq         ($D3,$D3,$T0);
        &vpermq         ($T0,$D1,2);
        &vpaddq         ($D0,$D0,$T1);
        &vpermq         ($T1,$D2,2);
        &vpaddq         ($D1,$D1,$T0);
        &vpermq         ($T0,$D1,2);
        &vpaddq         ($D0,$D0,$T1);
        &vpermq         ($T1,$D2,2);
        &vpaddq         ($D1,$D1,$T0);
-       &vpermq         ($T0,$D3,2);
        &vpaddq         ($D2,$D2,$T1);
        &vpaddq         ($D2,$D2,$T1);
-       &vpermq         ($T1,$D4,2);
-       &vpaddq         ($D3,$D3,$T0);
-       &vpaddq         ($D4,$D4,$T1);
+
+       &vlazy_reduction();
 
        &cmp            ("ecx",0);
        &je             (&label("done"));
 
        &cmp            ("ecx",0);
        &je             (&label("done"));
@@ -1772,14 +1775,14 @@ sub vlazy_reduction {
        &jmp            (&label("even"));
 
 &set_label("done",16);
        &jmp            (&label("even"));
 
 &set_label("done",16);
-       &vmovd          (&DWP(-16*3+4*0,"edi"),"xmm0"); # store hash value
-       &vmovd          (&DWP(-16*3+4*1,"edi"),"xmm1");
-       &vmovd          (&DWP(-16*3+4*2,"edi"),"xmm2");
-       &vmovd          (&DWP(-16*3+4*3,"edi"),"xmm3");
-       &vmovd          (&DWP(-16*3+4*4,"edi"),"xmm4");
+       &vmovd          (&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+       &vmovd          (&DWP(-16*3+4*1,"edi"),&X($D1));
+       &vmovd          (&DWP(-16*3+4*2,"edi"),&X($D2));
+       &vmovd          (&DWP(-16*3+4*3,"edi"),&X($D3));
+       &vmovd          (&DWP(-16*3+4*4,"edi"),&X($D4));
        &vzeroupper     ();
        &vzeroupper     ();
-&set_label("nodata");
        &mov    ("esp","ebp");
        &mov    ("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_avx2");
 }
 &set_label("const_sse2",64);
 &function_end("_poly1305_blocks_avx2");
 }
 &set_label("const_sse2",64);
index b827d24..2265664 100755 (executable)
@@ -1198,6 +1198,20 @@ $code.=<<___;
 
 .Lshort_tail_avx:
        ################################################################
 
 .Lshort_tail_avx:
        ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$D4,$T4
+       vpsrldq         \$8,$D3,$T3
+       vpsrldq         \$8,$D1,$T1
+       vpsrldq         \$8,$D0,$T0
+       vpsrldq         \$8,$D2,$T2
+       vpaddq          $T3,$D3,$D3
+       vpaddq          $T4,$D4,$D4
+       vpaddq          $T0,$D0,$D0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$D2,$D2
+
+       ################################################################
        # lazy reduction
 
        vpsrlq          \$26,$D3,$H3
        # lazy reduction
 
        vpsrlq          \$26,$D3,$H3
@@ -1231,25 +1245,11 @@ $code.=<<___;
        vpand           $MASK,$D3,$D3
        vpaddq          $H3,$D4,$D4             # h3 -> h4
 
        vpand           $MASK,$D3,$D3
        vpaddq          $H3,$D4,$D4             # h3 -> h4
 
-       ################################################################
-       # horizontal addition
-
-       vpsrldq         \$8,$D2,$T2
-       vpsrldq         \$8,$D0,$T0
-       vpsrldq         \$8,$D1,$T1
-       vpsrldq         \$8,$D3,$T3
-       vpsrldq         \$8,$D4,$T4
-       vpaddq          $T2,$D2,$H2
-       vpaddq          $T0,$D0,$H0
-       vpaddq          $T1,$D1,$H1
-       vpaddq          $T3,$D3,$H3
-       vpaddq          $T4,$D4,$H4
-
-       vmovd           $H0,`4*0-48-64`($ctx)   # save partially reduced
-       vmovd           $H1,`4*1-48-64`($ctx)
-       vmovd           $H2,`4*2-48-64`($ctx)
-       vmovd           $H3,`4*3-48-64`($ctx)
-       vmovd           $H4,`4*4-48-64`($ctx)
+       vmovd           $D0,`4*0-48-64`($ctx)   # save partially reduced
+       vmovd           $D1,`4*1-48-64`($ctx)
+       vmovd           $D2,`4*2-48-64`($ctx)
+       vmovd           $D3,`4*3-48-64`($ctx)
+       vmovd           $D4,`4*4-48-64`($ctx)
 ___
 $code.=<<___   if ($win64);
        vmovdqa         0x50(%r11),%xmm6
 ___
 $code.=<<___   if ($win64);
        vmovdqa         0x50(%r11),%xmm6
@@ -1888,6 +1888,31 @@ $code.=<<___;
        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
 
        ################################################################
        vpaddq          $H0,$D0,$H0             # h0 = d0 + h1*s4
 
        ################################################################
+       # horizontal addition
+
+       vpsrldq         \$8,$D1,$T1
+       vpsrldq         \$8,$H2,$T2
+       vpsrldq         \$8,$H3,$T3
+       vpsrldq         \$8,$H4,$T4
+       vpsrldq         \$8,$H0,$T0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$H2,$H2
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+       vpaddq          $T0,$H0,$H0
+
+       vpermq          \$0x2,$H3,$T3
+       vpermq          \$0x2,$H4,$T4
+       vpermq          \$0x2,$H0,$T0
+       vpermq          \$0x2,$D1,$T1
+       vpermq          \$0x2,$H2,$T2
+       vpaddq          $T3,$H3,$H3
+       vpaddq          $T4,$H4,$H4
+       vpaddq          $T0,$H0,$H0
+       vpaddq          $T1,$D1,$D1
+       vpaddq          $T2,$H2,$H2
+
+       ################################################################
        # lazy reduction
 
        vpsrlq          \$26,$H3,$D3
        # lazy reduction
 
        vpsrlq          \$26,$H3,$D3
@@ -1921,31 +1946,6 @@ $code.=<<___;
        vpand           $MASK,$H3,$H3
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
        vpand           $MASK,$H3,$H3
        vpaddq          $D3,$H4,$H4             # h3 -> h4
 
-       ################################################################
-       # horizontal addition
-
-       vpsrldq         \$8,$H2,$T2
-       vpsrldq         \$8,$H0,$T0
-       vpsrldq         \$8,$H1,$T1
-       vpsrldq         \$8,$H3,$T3
-       vpsrldq         \$8,$H4,$T4
-       vpaddq          $T2,$H2,$H2
-       vpaddq          $T0,$H0,$H0
-       vpaddq          $T1,$H1,$H1
-       vpaddq          $T3,$H3,$H3
-       vpaddq          $T4,$H4,$H4
-
-       vpermq          \$0x2,$H2,$T2
-       vpermq          \$0x2,$H0,$T0
-       vpermq          \$0x2,$H1,$T1
-       vpermq          \$0x2,$H3,$T3
-       vpermq          \$0x2,$H4,$T4
-       vpaddq          $T2,$H2,$H2
-       vpaddq          $T0,$H0,$H0
-       vpaddq          $T1,$H1,$H1
-       vpaddq          $T3,$H3,$H3
-       vpaddq          $T4,$H4,$H4
-
        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
        vmovd           %x#$H1,`4*1-48-64`($ctx)
        vmovd           %x#$H2,`4*2-48-64`($ctx)
        vmovd           %x#$H0,`4*0-48-64`($ctx)# save partially reduced
        vmovd           %x#$H1,`4*1-48-64`($ctx)
        vmovd           %x#$H2,`4*2-48-64`($ctx)
index 7c9f302..303822e 100644 (file)
@@ -668,6 +668,20 @@ static const struct poly1305_test poly1305_tests[] = {
      "f248312e578d9d58f8b7bb4d19105431"
     },
     /*
      "f248312e578d9d58f8b7bb4d19105431"
     },
     /*
+     * AVX2 in poly1305-x86.pl failed this with 176+32 split
+     */
+    {
+    "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd"
+    "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8"
+    "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c"
+    "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a"
+    "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37"
+    "09894e4eb0a4eedc4ae19468e66b81f2"
+    "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb",
+    "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000",
+    "bc939bc5281480fa99c6d68c258ec42f"
+    },
+    /*
      * test vectors from Google
      */
     {
      * test vectors from Google
      */
     {
@@ -844,6 +858,23 @@ int main()
                 printf("\n");
                 return 1;
             }
                 printf("\n");
                 return 1;
             }
+
+            for (half = 16; half < inlen; half += 16) {
+                Poly1305_Init(&poly1305, key);
+                Poly1305_Update(&poly1305, in, half);
+                Poly1305_Update(&poly1305, in+half, inlen-half);
+                Poly1305_Final(&poly1305, out);
+
+                if (memcmp(out, expected, sizeof(expected)) != 0) {
+                    printf("Poly1305 test #%d/%d failed.\n", i, half);
+                    printf("got:      ");
+                    hexdump(out, sizeof(out));
+                    printf("\nexpected: ");
+                    hexdump(expected, sizeof(expected));
+                    printf("\n");
+                    return 1;
+                }
+            }
         }
 
         free(in);
         }
 
         free(in);