bn/asm/x86_64-mont5.pl: fix carry bug in bn_sqr8x_internal.
[openssl.git] / crypto / bn / asm / x86_64-mont5.pl
index 8f49391727f5f3a894fbd9756a82dde30a23563e..d041d738cfd5e3f49eae6d16274aa7af2b315de1 100755 (executable)
@@ -1934,6 +1934,7 @@ __bn_sqr8x_reduction:
 
 .align 32
 .L8x_tail_done:
 
 .align 32
 .L8x_tail_done:
+       xor     %rax,%rax
        add     (%rdx),%r8              # can this overflow?
        adc     \$0,%r9
        adc     \$0,%r10
        add     (%rdx),%r8              # can this overflow?
        adc     \$0,%r9
        adc     \$0,%r10
@@ -1941,10 +1942,8 @@ __bn_sqr8x_reduction:
        adc     \$0,%r12
        adc     \$0,%r13
        adc     \$0,%r14
        adc     \$0,%r12
        adc     \$0,%r13
        adc     \$0,%r14
-       adc     \$0,%r15                # can't overflow, because we
-                                       # started with "overhung" part
-                                       # of multiplication
-       xor     %rax,%rax
+       adc     \$0,%r15
+       adc     \$0,%rax
 
        neg     $carry
 .L8x_no_tail:
 
        neg     $carry
 .L8x_no_tail:
@@ -3384,6 +3383,7 @@ __bn_sqrx8x_reduction:
 
 .align 32
 .Lsqrx8x_tail_done:
 
 .align 32
 .Lsqrx8x_tail_done:
+       xor     %rax,%rax
        add     24+8(%rsp),%r8          # can this overflow?
        adc     \$0,%r9
        adc     \$0,%r10
        add     24+8(%rsp),%r8          # can this overflow?
        adc     \$0,%r9
        adc     \$0,%r10
@@ -3391,10 +3391,8 @@ __bn_sqrx8x_reduction:
        adc     \$0,%r12
        adc     \$0,%r13
        adc     \$0,%r14
        adc     \$0,%r12
        adc     \$0,%r13
        adc     \$0,%r14
-       adc     \$0,%r15                # can't overflow, because we
-                                       # started with "overhung" part
-                                       # of multiplication
-       mov     $carry,%rax             # xor   %rax,%rax
+       adc     \$0,%r15
+       adc     \$0,%rax
 
        sub     16+8(%rsp),$carry       # mov 16(%rsp),%cf
 .Lsqrx8x_no_tail:                      # %cf is 0 if jumped here
 
        sub     16+8(%rsp),$carry       # mov 16(%rsp),%cf
 .Lsqrx8x_no_tail:                      # %cf is 0 if jumped here
@@ -3409,7 +3407,7 @@ __bn_sqrx8x_reduction:
        adc     8*5($tptr),%r13
        adc     8*6($tptr),%r14
        adc     8*7($tptr),%r15
        adc     8*5($tptr),%r13
        adc     8*6($tptr),%r14
        adc     8*7($tptr),%r15
-       adc     %rax,%rax               # top-most carry
+       adc     \$0,%rax                # top-most carry
 
        mov     32+8(%rsp),%rbx         # n0
        mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"
 
        mov     32+8(%rsp),%rbx         # n0
        mov     8*8($tptr,%rcx),%rdx    # modulo-scheduled "%r8"