From: Andy Polyakov
Date: Sun, 28 Feb 2016 20:48:43 +0000 (+0100)
Subject: poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.
X-Git-Tag: OpenSSL_1_1_0-pre4~414
X-Git-Url: https://git.openssl.org/?p=openssl.git;a=commitdiff_plain;h=1ea8ae5090f557fea2e5b4d5758b10566825d74b;hp=bdbd3aea590e45d52c7b120ea6eaff38295b5011

poly1305/asm/poly1305-*.pl: flip horizontal add and reduction.

Formally only the 32-bit AVX2 code path needs this, but I chose to
harmonize all vector code paths.

RT#4346

Reviewed-by: Richard Levitte
---

diff --git a/crypto/poly1305/asm/poly1305-armv4.pl b/crypto/poly1305/asm/poly1305-armv4.pl
index 86a6070bf4..06301aa2e1 100755
--- a/crypto/poly1305/asm/poly1305-armv4.pl
+++ b/crypto/poly1305/asm/poly1305-armv4.pl
@@ -1056,6 +1056,15 @@ poly1305_blocks_neon:
 	vmlal.u32	$D2,$H3#lo,$S4
 
 .Lshort_tail:
+	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
+	@ horizontal addition
+
+	vadd.i64	$D3#lo,$D3#lo,$D3#hi
+	vadd.i64	$D0#lo,$D0#lo,$D0#hi
+	vadd.i64	$D4#lo,$D4#lo,$D4#hi
+	vadd.i64	$D1#lo,$D1#lo,$D1#hi
+	vadd.i64	$D2#lo,$D2#lo,$D2#hi
+
 	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
 	@ lazy reduction, but without narrowing
 
@@ -1086,15 +1095,6 @@ poly1305_blocks_neon:
 	vadd.i64	$D1,$D1,$T0		@ h0 -> h1
 	vadd.i64	$D4,$D4,$T1		@ h3 -> h4
 
-	@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@
-	@ horizontal addition
-
-	vadd.i64	$D2#lo,$D2#lo,$D2#hi
-	vadd.i64	$D0#lo,$D0#lo,$D0#hi
-	vadd.i64	$D3#lo,$D3#lo,$D3#hi
-	vadd.i64	$D1#lo,$D1#lo,$D1#hi
-	vadd.i64	$D4#lo,$D4#lo,$D4#hi
-
 	cmp	$len,#0
 	bne	.Leven
 
diff --git a/crypto/poly1305/asm/poly1305-armv8.pl b/crypto/poly1305/asm/poly1305-armv8.pl
index 79185d2bdd..f1359fd44a 100755
--- a/crypto/poly1305/asm/poly1305-armv8.pl
+++ b/crypto/poly1305/asm/poly1305-armv8.pl
@@ -790,6 +790,19 @@ poly1305_blocks_neon:
 	umlal	$ACC2,$IN01_4,${S3}
 
 .Lshort_tail:
+	////////////////////////////////////////////////////////////////
+	// horizontal add
+
+	addp	$ACC3,$ACC3,$ACC3
+	ldp	d8,d9,[sp,#16]		// meet ABI requirements
+	addp	$ACC0,$ACC0,$ACC0
+	ldp	d10,d11,[sp,#32]
+	addp	$ACC4,$ACC4,$ACC4
+	ldp	d12,d13,[sp,#48]
+	addp	$ACC1,$ACC1,$ACC1
+	ldp	d14,d15,[sp,#64]
+	addp	$ACC2,$ACC2,$ACC2
+
 	////////////////////////////////////////////////////////////////
 	// lazy reduction, but without narrowing
 
@@ -821,19 +834,6 @@ poly1305_blocks_neon:
 	add	$ACC1,$ACC1,$T0.2d	// h0 -> h1
 	add	$ACC4,$ACC4,$T1.2d	// h3 -> h4
 
-	////////////////////////////////////////////////////////////////
-	// horizontal add
-
-	addp	$ACC2,$ACC2,$ACC2
-	ldp	d8,d9,[sp,#16]		// meet ABI requirements
-	addp	$ACC0,$ACC0,$ACC0
-	ldp	d10,d11,[sp,#32]
-	addp	$ACC1,$ACC1,$ACC1
-	ldp	d12,d13,[sp,#48]
-	addp	$ACC3,$ACC3,$ACC3
-	ldp	d14,d15,[sp,#64]
-	addp	$ACC4,$ACC4,$ACC4
-
 	////////////////////////////////////////////////////////////////
 	// write the result, can be partially reduced
 
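Both ARM paths above now fold the per-lane 64-bit accumulators first
(vadd.i64 on NEON, addp on ARMv8) and only then run the lazy reduction,
so the single reduction pass bounds every limb before anything is
narrowed or stored.  A minimal scalar C sketch of that order, assuming
two 64-bit lane accumulators per base-2^26 limb and the usual
2^130 = 5 (mod p) carry trick; the names are illustrative, not the
perlasm's:

    #include <stdint.h>

    /* Fold lanes first, then one lazy reduction pass; the result is
     * only partially reduced (limbs ~2^26), matching the "can be
     * partially reduced" note in the diff. */
    static void fold_then_reduce(const uint64_t d[5][2], uint32_t h[5])
    {
        uint64_t t[5];
        int i;

        /* horizontal addition: fold the per-lane accumulators */
        for (i = 0; i < 5; i++)
            t[i] = d[i][0] + d[i][1];

        /* lazy reduction: carry chain, 2^130 = 5 (mod p) */
        t[1] += t[0] >> 26; t[0] &= 0x3ffffff;
        t[2] += t[1] >> 26; t[1] &= 0x3ffffff;
        t[3] += t[2] >> 26; t[2] &= 0x3ffffff;
        t[4] += t[3] >> 26; t[3] &= 0x3ffffff;    /* h3 -> h4 */
        t[0] += (t[4] >> 26) * 5; t[4] &= 0x3ffffff;
        t[1] += t[0] >> 26; t[0] &= 0x3ffffff;    /* h0 -> h1 */

        for (i = 0; i < 5; i++)
            h[i] = (uint32_t)t[i];
    }

With the old order the lanes were summed *after* the reduction, so the
folded limbs were never brought back under 2^26 before being narrowed.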
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
index 7c1aee5fe0..fb9fa2bc34 100755
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -536,6 +536,8 @@ my $base = shift; $base = "esp" if (!defined($base));
 },"edx");
 
 sub lazy_reduction {
+my $extra = shift;
+
 ################################################################
 # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
 # and P. Schwabe
@@ -543,6 +545,7 @@ sub lazy_reduction {
 	&movdqa	($T0,$D3);
 	&pand	($D3,$MASK);
 	&psrlq	($T0,26);
+	&$extra	()		if (defined($extra));
 	&paddq	($T0,$D4);			# h3 -> h4
 	&movdqa	($T1,$D0);
 	&pand	($D0,$MASK);
@@ -1091,21 +1094,21 @@ my $addr = shift;
 
 &set_label("short_tail");
 
-	&lazy_reduction	();
-
 	################################################################
 	# horizontal addition
 
+	&pshufd	($T1,$D4,0b01001110);
+	&pshufd	($T0,$D3,0b01001110);
+	&paddq	($D4,$T1);
+	&paddq	($D3,$T0);
 	&pshufd	($T1,$D0,0b01001110);
 	&pshufd	($T0,$D1,0b01001110);
-	&paddd	($D0,$T1);
+	&paddq	($D0,$T1);
+	&paddq	($D1,$T0);
 	&pshufd	($T1,$D2,0b01001110);
-	&paddd	($D1,$T0);
-	&pshufd	($T0,$D3,0b01001110);
-	&paddd	($D2,$T1);
-	&pshufd	($T1,$D4,0b01001110);
-	&paddd	($D3,$T0);
-	&paddd	($D4,$T1);
+	#&paddq	($D2,$T1);
+
+	&lazy_reduction	(sub { &paddq ($D2,$T1) });
 
 &set_label("done");
 	&movd	(&DWP(-16*3+4*0,"edi"),$D0);	# store hash value
@@ -1113,8 +1116,8 @@ my $addr = shift;
 	&movd	(&DWP(-16*3+4*2,"edi"),$D2);
 	&movd	(&DWP(-16*3+4*3,"edi"),$D3);
 	&movd	(&DWP(-16*3+4*4,"edi"),$D4);
-&set_label("nodata");
 	&mov	("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_sse2");
 
 &align	(32);
@@ -1435,7 +1438,7 @@ sub X { my $reg=shift; $reg=~s/^ymm/xmm/; $reg; }
 	&test	("eax","eax");			# is_base2_26?
 	&jz	(&label("enter_blocks"));
 
-&set_label("enter_avx2",16);
+&set_label("enter_avx2");
 	&vzeroupper	();
 
 	&call	(&label("pic_point"));
@@ -1731,31 +1734,31 @@ sub vlazy_reduction {
 
 	&vpmuladd	(sub {	my $i=shift; &QWP(4+32*$i-128,"edx");	});
 
-	&vlazy_reduction();
-
 	################################################################
 	# horizontal addition
 
+	&vpsrldq	($T0,$D4,8);
+	&vpsrldq	($T1,$D3,8);
+	&vpaddq		($D4,$D4,$T0);
 	&vpsrldq	($T0,$D0,8);
+	&vpaddq		($D3,$D3,$T1);
 	&vpsrldq	($T1,$D1,8);
 	&vpaddq		($D0,$D0,$T0);
 	&vpsrldq	($T0,$D2,8);
 	&vpaddq		($D1,$D1,$T1);
-	&vpsrldq	($T1,$D3,8);
+	&vpermq		($T1,$D4,2);		# keep folding
 	&vpaddq		($D2,$D2,$T0);
-	&vpsrldq	($T0,$D4,8);
-	&vpaddq		($D3,$D3,$T1);
-	&vpermq		($T1,$D0,2);		# keep folding
-	&vpaddq		($D4,$D4,$T0);
+	&vpermq		($T0,$D3,2);
+	&vpaddq		($D4,$D4,$T1);
+	&vpermq		($T1,$D0,2);
+	&vpaddq		($D3,$D3,$T0);
 	&vpermq		($T0,$D1,2);
 	&vpaddq		($D0,$D0,$T1);
 	&vpermq		($T1,$D2,2);
 	&vpaddq		($D1,$D1,$T0);
-	&vpermq		($T0,$D3,2);
 	&vpaddq		($D2,$D2,$T1);
-	&vpermq		($T1,$D4,2);
-	&vpaddq		($D3,$D3,$T0);
-	&vpaddq		($D4,$D4,$T1);
+
+	&vlazy_reduction();
 
 	&cmp	("ecx",0);
 	&je	(&label("done"));
@@ -1772,14 +1775,14 @@ sub vlazy_reduction {
 	&jmp	(&label("even"));
 
 &set_label("done",16);
-	&vmovd	(&DWP(-16*3+4*0,"edi"),"xmm0");	# store hash value
-	&vmovd	(&DWP(-16*3+4*1,"edi"),"xmm1");
-	&vmovd	(&DWP(-16*3+4*2,"edi"),"xmm2");
-	&vmovd	(&DWP(-16*3+4*3,"edi"),"xmm3");
-	&vmovd	(&DWP(-16*3+4*4,"edi"),"xmm4");
+	&vmovd	(&DWP(-16*3+4*0,"edi"),&X($D0));# store hash value
+	&vmovd	(&DWP(-16*3+4*1,"edi"),&X($D1));
+	&vmovd	(&DWP(-16*3+4*2,"edi"),&X($D2));
+	&vmovd	(&DWP(-16*3+4*3,"edi"),&X($D3));
+	&vmovd	(&DWP(-16*3+4*4,"edi"),&X($D4));
 	&vzeroupper	();
-&set_label("nodata");
 	&mov	("esp","ebp");
+&set_label("nodata");
 &function_end("_poly1305_blocks_avx2");
 }
 &set_label("const_sse2",64);
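In the SSE2 tail above, pshufd with 0b01001110 (0x4e) swaps the two
qword halves of an xmm register, and the fold now uses paddq where the
removed code used 32-bit paddd, which drops carries out of the low
dword of each lane sum; the reorder also lets the last fold be threaded
into lazy_reduction via the new $extra hook.  A hedged intrinsics
sketch of the fold for one limb (fold_lanes is an illustrative name,
not part of the source):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Fold the two 64-bit lane accumulators held in one xmm register.
     * 0x4e == 0b01001110 swaps the qword halves, matching the pshufd
     * immediate in the diff; the add must be 64-bit (paddq). */
    static uint64_t fold_lanes(__m128i d)
    {
        __m128i t = _mm_shuffle_epi32(d, 0x4e); /* pshufd: swap qwords   */
        __m128i s = _mm_add_epi64(d, t);        /* paddq: carry-safe add */
        uint64_t lo;

        _mm_storel_epi64((__m128i *)&lo, s);    /* low qword = lane sum  */
        return lo;
    }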
diff --git a/crypto/poly1305/asm/poly1305-x86_64.pl b/crypto/poly1305/asm/poly1305-x86_64.pl
index b827d24b1a..2265664180 100755
--- a/crypto/poly1305/asm/poly1305-x86_64.pl
+++ b/crypto/poly1305/asm/poly1305-x86_64.pl
@@ -1197,6 +1197,20 @@ $code.=<<___;
 	vpaddq		$T3,$D0,$D0		# d0 += h1*s4
 
 .Lshort_tail_avx:
+	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D4,$T4
+	vpsrldq		\$8,$D3,$T3
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$D0,$T0
+	vpsrldq		\$8,$D2,$T2
+	vpaddq		$T3,$D3,$D3
+	vpaddq		$T4,$D4,$D4
+	vpaddq		$T0,$D0,$D0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$D2,$D2
+
 	################################################################
 	# lazy reduction
 
@@ -1231,25 +1245,11 @@ $code.=<<___;
 	vpand		$MASK,$D3,$D3
 	vpaddq		$H3,$D4,$D4		# h3 -> h4
 
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$D2,$T2
-	vpsrldq		\$8,$D0,$T0
-	vpsrldq		\$8,$D1,$T1
-	vpsrldq		\$8,$D3,$T3
-	vpsrldq		\$8,$D4,$T4
-	vpaddq		$T2,$D2,$H2
-	vpaddq		$T0,$D0,$H0
-	vpaddq		$T1,$D1,$H1
-	vpaddq		$T3,$D3,$H3
-	vpaddq		$T4,$D4,$H4
-
-	vmovd		$H0,`4*0-48-64`($ctx)	# save partially reduced
-	vmovd		$H1,`4*1-48-64`($ctx)
-	vmovd		$H2,`4*2-48-64`($ctx)
-	vmovd		$H3,`4*3-48-64`($ctx)
-	vmovd		$H4,`4*4-48-64`($ctx)
+	vmovd		$D0,`4*0-48-64`($ctx)	# save partially reduced
+	vmovd		$D1,`4*1-48-64`($ctx)
+	vmovd		$D2,`4*2-48-64`($ctx)
+	vmovd		$D3,`4*3-48-64`($ctx)
+	vmovd		$D4,`4*4-48-64`($ctx)
 ___
 $code.=<<___ if ($win64);
 	vmovdqa		0x50(%r11),%xmm6
@@ -1887,6 +1887,31 @@ $code.=<<___;
 	vpaddq		$H4,$D4,$H4		# h4 = d4 + h0*r4
 	vpaddq		$H0,$D0,$H0		# h0 = d0 + h1*s4
 
+	################################################################
+	# horizontal addition
+
+	vpsrldq		\$8,$D1,$T1
+	vpsrldq		\$8,$H2,$T2
+	vpsrldq		\$8,$H3,$T3
+	vpsrldq		\$8,$H4,$T4
+	vpsrldq		\$8,$H0,$T0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+
+	vpermq		\$0x2,$H3,$T3
+	vpermq		\$0x2,$H4,$T4
+	vpermq		\$0x2,$H0,$T0
+	vpermq		\$0x2,$D1,$T1
+	vpermq		\$0x2,$H2,$T2
+	vpaddq		$T3,$H3,$H3
+	vpaddq		$T4,$H4,$H4
+	vpaddq		$T0,$H0,$H0
+	vpaddq		$T1,$D1,$D1
+	vpaddq		$T2,$H2,$H2
+
 	################################################################
 	# lazy reduction
 
@@ -1921,31 +1946,6 @@ $code.=<<___;
 	vpand		$MASK,$H3,$H3
 	vpaddq		$D3,$H4,$H4		# h3 -> h4
 
-	################################################################
-	# horizontal addition
-
-	vpsrldq		\$8,$H2,$T2
-	vpsrldq		\$8,$H0,$T0
-	vpsrldq		\$8,$H1,$T1
-	vpsrldq		\$8,$H3,$T3
-	vpsrldq		\$8,$H4,$T4
-	vpaddq		$T2,$H2,$H2
-	vpaddq		$T0,$H0,$H0
-	vpaddq		$T1,$H1,$H1
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-
-	vpermq		\$0x2,$H2,$T2
-	vpermq		\$0x2,$H0,$T0
-	vpermq		\$0x2,$H1,$T1
-	vpermq		\$0x2,$H3,$T3
-	vpermq		\$0x2,$H4,$T4
-	vpaddq		$T2,$H2,$H2
-	vpaddq		$T0,$H0,$H0
-	vpaddq		$T1,$H1,$H1
-	vpaddq		$T3,$H3,$H3
-	vpaddq		$T4,$H4,$H4
-
 	vmovd		%x#$H0,`4*0-48-64`($ctx)# save partially reduced
 	vmovd		%x#$H1,`4*1-48-64`($ctx)
 	vmovd		%x#$H2,`4*2-48-64`($ctx)
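In the AVX2 tails above, vpsrldq \$8 folds the two qwords within each
128-bit half, and vpermq \$0x2 routes the upper half's sum down so a
final vpaddq leaves the total in lane 0 (the "keep folding" step); the
lazy reduction and the 32-bit vmovd stores then see fully folded limbs.
A hedged intrinsics sketch for one limb's four lane accumulators (fold4
is an illustrative name, and the return assumes a 64-bit target):

    #include <immintrin.h>
    #include <stdint.h>

    /* Fold the four 64-bit lane accumulators in one ymm register.
     * vpermq with immediate 0x2 places qword 2 (low qword of the
     * upper 128-bit half) into lane 0, as in the diff. */
    static uint64_t fold4(__m256i d)
    {
        __m256i t = _mm256_bsrli_epi128(d, 8);  /* vpsrldq $8: q0+=q1, q2+=q3 */
        d = _mm256_add_epi64(d, t);
        t = _mm256_permute4x64_epi64(d, 0x2);   /* vpermq $0x2: bring q2 down */
        d = _mm256_add_epi64(d, t);             /* q0 now holds the total     */
        return (uint64_t)_mm_cvtsi128_si64(_mm256_castsi256_si128(d));
    }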
diff --git a/crypto/poly1305/poly1305.c b/crypto/poly1305/poly1305.c
index 7c9f302bfc..303822e62a 100644
--- a/crypto/poly1305/poly1305.c
+++ b/crypto/poly1305/poly1305.c
@@ -667,6 +667,20 @@ static const struct poly1305_test poly1305_tests[] = {
         "95d5c005503e510d8cd0aa072c4a4d06""6eabc52d11653df47fbf63ab198bcc26",
         "f248312e578d9d58f8b7bb4d19105431"
     },
+    /*
+     * AVX2 in poly1305-x86.pl failed this with 176+32 split
+     */
+    {
+        "248ac31085b6c2adaaa38259a0d7192c5c35d1bb4ef39ad94c38d1c82479e2dd"
+        "2159a077024b0589bc8a20101b506f0a1ad0bbab76e83a83f1b94be6beae74e8"
+        "74cab692c5963a75436b776121ec9f62399a3e66b2d22707dae81933b6277f3c"
+        "8516bcbe26dbbd86f373103d7cf4cad1888c952118fbfbd0d7b4bedc4ae4936a"
+        "ff91157e7aa47c54442ea78d6ac251d324a0fbe49d89cc3521b66d16e9c66a37"
+        "09894e4eb0a4eedc4ae19468e66b81f2"
+        "71351b1d921ea551047abcc6b87a901fde7db79fa1818c11336dbc07244a40eb",
+        "000102030405060708090a0b0c0d0e0f""00000000000000000000000000000000",
+        "bc939bc5281480fa99c6d68c258ec42f"
+    },
     /*
      * test vectors from Google
      */
@@ -844,6 +858,23 @@ int main()
             printf("\n");
             return 1;
         }
+
+        for (half = 16; half < inlen; half += 16) {
+            Poly1305_Init(&poly1305, key);
+            Poly1305_Update(&poly1305, in, half);
+            Poly1305_Update(&poly1305, in+half, inlen-half);
+            Poly1305_Final(&poly1305, out);
+
+            if (memcmp(out, expected, sizeof(expected)) != 0) {
+                printf("Poly1305 test #%d/%d failed.\n", i, half);
+                printf("got:      ");
+                hexdump(out, sizeof(out));
+                printf("\nexpected: ");
+                hexdump(expected, sizeof(expected));
+                printf("\n");
+                return 1;
+            }
+        }
     }
 
     free(in);
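The new loop re-hashes every test vector at all 16-byte-aligned split
points, which covers the 176+32 case named in the test-vector comment:
176 bytes drive the vector path and leave the state in base 2^26 (the
is_base2_26 flag visible in the x86 diff), and the 32-byte tail must
then be absorbed from that partially reduced state.  A standalone
sketch of that failing shape, assuming the internal Poly1305_* API and
POLY1305 type used by this test driver (digest_split is an illustrative
helper, not part of the source):

    #include <stddef.h>
    #include "internal/poly1305.h"   /* POLY1305, Poly1305_* (assumed path) */

    /* Hash `in` as a 176-byte call followed by the remaining tail,
     * mirroring the split that tripped the old 32-bit AVX2 path;
     * inlen is assumed to be at least 177 bytes. */
    static void digest_split(const unsigned char key[32],
                             const unsigned char *in, size_t inlen,
                             unsigned char mac[16])
    {
        POLY1305 ctx;

        Poly1305_Init(&ctx, key);
        Poly1305_Update(&ctx, in, 176);               /* 11 blocks: vector path */
        Poly1305_Update(&ctx, in + 176, inlen - 176); /* tail from base 2^26    */
        Poly1305_Final(&ctx, mac);
    }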