bn/asm/x86_64-mont5.pl: fix carry bug in bn_sqr8x_internal.

[openssl.git] / crypto / bn / asm / x86_64-mont5.pl
diff --git a/crypto/bn/asm/x86_64-mont5.pl b/crypto/bn/asm/x86_64-mont5.pl

index 8f49391727f5f3a894fbd9756a82dde30a23563e..d041d738cfd5e3f49eae6d16274aa7af2b315de1 100755 (executable)
--- a/crypto/bn/asm/x86_64-mont5.pl
+++ b/crypto/bn/asm/x86_64-mont5.pl
@@ -1934,6 +1934,7 @@ __bn_sqr8x_reduction:
  
  .align 32
  .L8x_tail_done:
+       xor     %rax,%rax
         add     (%rdx),%r8              # can this overflow?
         adc     \$0,%r9
         adc     \$0,%r10
@@ -1941,10 +1942,8 @@ __bn_sqr8x_reduction:
         adc     \$0,%r12
         adc     \$0,%r13
         adc     \$0,%r14
-       adc     \$0,%r15                # can't overflow, because we
-                                       # started with "overhung" part
-                                       # of multiplication
-       xor     %rax,%rax
+       adc     \$0,%r15
+       adc     \$0,%rax
  
         neg     $carry
  .L8x_no_tail:
@@ -3384,6 +3383,7 @@ __bn_sqrx8x_reduction:
  
  .align 32
  .Lsqrx8x_tail_done:
+       xor     %rax,%rax
         add     24+8(%rsp),%r8          # can this overflow?
         adc     \$0,%r9
         adc     \$0,%r10
@@ -3391,10 +3391,8 @@ __bn_sqrx8x_reduction:
         adc     \$0,%r12
         adc     \$0,%r13
         adc     \$0,%r14
-       adc     \$0,%r15                # can't overflow, because we
-                                       # started with "overhung" part
-                                       # of multiplication
-       mov     $carry,%rax             # xor   %rax,%rax
+       adc     \$0,%r15
+       adc     \$0,%rax
  
         sub     16+8(%rsp),$carry       # mov 16(%rsp),%cf
  .Lsqrx8x_no_tail:                      # %cf is 0 if jumped here
@@ -3409,7 +3407,7 @@ __bn_sqrx8x_reduction:
         adc     8*5($tptr),%r13
         adc     8*6($tptr),%r14
         adc     8*7($tptr),%r15
-       adc     %rax,%rax               # top-most carry
+       adc     \$0,%rax                # top-most carry
  
         mov     32+8(%rsp),%rbx         # n0
         mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"