poly1305/asm/poly1305-x86.pl: don't lose the 59th bit.
diff --git a/crypto/poly1305/asm/poly1305-x86.pl b/crypto/poly1305/asm/poly1305-x86.pl
index 4307c9978a2da3ed303a9a6247825ddbbc789781..01c3cbcda93f05aecc2a187b29ec3955b96fbd4c 100755
--- a/crypto/poly1305/asm/poly1305-x86.pl
+++ b/crypto/poly1305/asm/poly1305-x86.pl
@@ -540,6 +540,7 @@ my $base = shift; $base = "esp" if (!defined($base));
 
 sub lazy_reduction {
 my $extra = shift;
+my $paddx = defined($extra) ? \&paddq : \&paddd;
 
        ################################################################
        # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
@@ -563,12 +564,12 @@ my $extra = shift;
                                                        # possible, because
                                                        # paddq is "broken"
                                                        # on Atom
-       &pand           ($D1,$MASK);
-       &paddq          ($T1,$D2);                      # h1 -> h2
         &psllq         ($T0,2);
+       &paddq          ($T1,$D2);                      # h1 -> h2
+        &$paddx        ($T0,$D0);                      # h4 -> h0
+       &pand           ($D1,$MASK);
        &movdqa         ($D2,$T1);
        &psrlq          ($T1,26);
-        &paddd         ($T0,$D0);                      # h4 -> h0
        &pand           ($D2,$MASK);
        &paddd          ($T1,$D3);                      # h2 -> h3
         &movdqa        ($D0,$T0);
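
Why the width of the h4 -> h0 fold matters: after a block multiplication a 64-bit lane can, per the commit title, carry significant bits up around bit 58/59, so the carry c4 = h4 >> 26 and the folded term 5*c4 no longer fit in 32 bits. paddd adds the two 32-bit halves of each 64-bit lane independently and drops any carry out of bit 31 of the low half, while paddq adds the whole lane. A minimal plain-Perl model of one lane (not perlasm; the (1 << 59) - 1 input is a deliberately extreme illustration and assumes a 64-bit perl):

    use strict;
    use warnings;

    sub paddd_lane {    # one 64-bit lane under paddd: per-32-bit-half add
        my ($a, $b) = @_;
        my $lo = (($a & 0xffffffff) + ($b & 0xffffffff)) & 0xffffffff;
        my $hi = ((($a >> 32) + ($b >> 32)) & 0xffffffff) << 32;
        return $hi | $lo;
    }

    my $h4 = (1 << 59) - 1;                # lane with the problem bit set
    my $c4 = $h4 >> 26;                    # carry out of h4, ~2^33
    my $t  = $c4 + ($c4 << 2);             # 5*c4, the h4 -> h0 fold
    my $h0 = (1 << 26) - 1;                # already-reduced limb, < 2^26

    my $bad  = paddd_lane($h0, $t);        # loses the carry out of bit 31
    my $good = $h0 + $t;                   # paddq keeps the full 64-bit sum
    printf "paddd=%#x\npaddq=%#x\nlost =%#x\n", $bad, $good, $good - $bad;

With $extra defined, lazy_reduction now selects paddq for exactly this add; other call sites keep the paddd form that the in-line comment prefers because paddq is "broken" (slow) on Atom.
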
@@ -1708,18 +1709,18 @@ sub vlazy_reduction {
        &vpsrlq         ($T1,$D1,26);
        &vpand          ($D1,$D1,$MASK);
        &vpaddq         ($D2,$D2,$T1);                  # h1 -> h2
-        &vpaddd        ($D0,$D0,$T0);
+        &vpaddq        ($D0,$D0,$T0);
         &vpsllq        ($T0,$T0,2);
        &vpsrlq         ($T1,$D2,26);
        &vpand          ($D2,$D2,$MASK);
-        &vpaddd        ($D0,$D0,$T0);                  # h4 -> h0
-       &vpaddd         ($D3,$D3,$T1);                  # h2 -> h3
+        &vpaddq        ($D0,$D0,$T0);                  # h4 -> h0
+       &vpaddq         ($D3,$D3,$T1);                  # h2 -> h3
        &vpsrlq         ($T1,$D3,26);
         &vpsrlq        ($T0,$D0,26);
         &vpand         ($D0,$D0,$MASK);
        &vpand          ($D3,$D3,$MASK);
-        &vpaddd        ($D1,$D1,$T0);                  # h0 -> h1
-       &vpaddd         ($D4,$D4,$T1);                  # h3 -> h4
+        &vpaddq        ($D1,$D1,$T0);                  # h0 -> h1
+       &vpaddq         ($D4,$D4,$T1);                  # h3 -> h4
 }
        &vlazy_reduction();
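
For reference, a scalar model of the complete carry chain these two hunks implement (a sketch reconstructed from the in-line h* -> h* comments; the SIMD code interleaves the steps for scheduling, and the starting limb values here are hypothetical). The factor 5 in the h4 -> h0 step folds the carry out of bit 130 back in, since 2^130 is congruent to 5 mod 2^130 - 5:

    use strict;
    use warnings;

    my $M = (1 << 26) - 1;                             # 26-bit limb mask
    my @h = map { (1 << 59) - 1 - $_ } 0 .. 4;         # unreduced lanes
    my $c;
    $c = $h[3] >> 26; $h[3] &= $M; $h[4] += $c;        # h3 -> h4
    $c = $h[0] >> 26; $h[0] &= $M; $h[1] += $c;        # h0 -> h1
    $c = $h[4] >> 26; $h[4] &= $M;
    $h[0] += $c + ($c << 2);                           # h4 -> h0: h0 += 5*c
    $c = $h[1] >> 26; $h[1] &= $M; $h[2] += $c;        # h1 -> h2
    $c = $h[2] >> 26; $h[2] &= $M; $h[3] += $c;        # h2 -> h3
    $c = $h[0] >> 26; $h[0] &= $M; $h[1] += $c;        # h0 -> h1
    $c = $h[3] >> 26; $h[3] &= $M; $h[4] += $c;        # h3 -> h4
    printf "h%d=%#x\n", $_, $h[$_] for 0 .. 4;

Every += above is one vector add in the AVX2 hunk. Because vpaddd adds the two 32-bit halves of each 64-bit lane separately, any carry out of bit 31 of the low half is silently dropped, and with lanes reaching the title's 59th bit that can happen on any of these adds; hence all of them are now vpaddq, not just the h1 -> h2 add that already was.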