- imulq 16($tptr,$j),$m0 # m0=t[0]*n0 # modsched #
- xor $A1[0],$A1[0]
- mov 8($nptr,$j),$Ni[1] # n[1] # modsched #
- add $A0[0],$A1[1]
- mov 16($tptr,$j),$A0[0] # t[0] # modsched #
- adc \$0,$A1[0]
- mul $m1 # n[7]*m1
- add %rax,$A1[1] # n[7]*m1+"t[8]"
- mov $Ni[0],%rax # # modsched #
- adc %rdx,$A1[0]
- mov $A1[1],($tptr) # "t[8]"
-
- xor $topbit,$topbit
- add 8($tptr),$A1[0] # +t[9]
- adc $topbit,$topbit
- add $A0[1],$A1[0]
- lea 16($tptr),$tptr # "t[$num]>>128"
- adc \$0,$topbit
- mov $A1[0],-8($tptr) # "t[9]"
- cmp 8(%rsp),$tptr # are we done?
- jb .Lsqr4x_mont_outer
-
- mov 0(%rsp),$num # restore $num
- mov $topbit,($tptr) # save $topbit
+ mulq $m0
+ add %rax,%r14
+ mov 8*7($nptr),%rax
+ adc \$0,%rdx
+ add %r14,%r13
+ mov %rdx,%r14
+ adc \$0,%r14
+
+ mulq $m0
+ mov 64-16(%rsp,%rcx,8),$m0 # pull n0*a[i]
+ add %rax,%r15
+ adc \$0,%rdx
+ add %r15,%r14
+ mov 8*0($nptr),%rax # pull n[0]
+ mov %rdx,%r15
+ adc \$0,%r15
+
+ dec %ecx
+ jnz .L8x_tail
+
+ lea 8*8($nptr),$nptr
+ mov 8(%rsp),%rdx # pull end of t[]
+ cmp 0(%rsp),$nptr # end of n[]?
+ jae .L8x_tail_done # break out of loop
+
+ mov 64+56(%rsp),$m0 # pull n0*a[0]
+ neg $carry
+ mov 8*0($nptr),%rax # pull n[0]
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry # top carry
+
+ mov \$8,%ecx
+ jmp .L8x_tail
+
+.align 32
+.L8x_tail_done:
+ add (%rdx),%r8 # can this overflow?
+ adc \$0,%r9
+ adc \$0,%r10
+ adc \$0,%r11
+ adc \$0,%r12
+ adc \$0,%r13
+ adc \$0,%r14
+ adc \$0,%r15
+ sbb %rax,%rax
+
+.L8x_no_tail:
+ neg $carry
+ adc 8*0($tptr),%r8
+ adc 8*1($tptr),%r9
+ adc 8*2($tptr),%r10
+ adc 8*3($tptr),%r11
+ adc 8*4($tptr),%r12
+ adc 8*5($tptr),%r13
+ adc 8*6($tptr),%r14
+ adc 8*7($tptr),%r15
+ sbb $carry,$carry
+ neg %rax
+ sub $carry,%rax # top-most carry
+
+ mov 40(%rsp),$nptr # restore $nptr
+
+ mov %r8,8*0($tptr) # store top 512 bits
+ mov %r9,8*1($tptr)
+ mov $nptr,$num # $num is %r9, can't be moved upwards
+ mov %r10,8*2($tptr)
+ sub 0(%rsp),$num # -$num
+ mov %r11,8*3($tptr)
+ mov %r12,8*4($tptr)
+ mov %r13,8*5($tptr)
+ mov %r14,8*6($tptr)
+ mov %r15,8*7($tptr)
+ lea 8*8($tptr),$tptr
+ mov %rax,(%rdx) # store top-most carry
+
+ cmp %rdx,$tptr # end of t[]?
+ jb .L8x_reduction_loop
+
+ neg $num # restore $num