-{
-my ($shift,$carry)=($a0,$a1);
-my @S=(@A1,$ai,$n0);
-$code.=<<___;
- add \$16,$i
- xor $shift,$shift
- sub $num,$i # $i=16-$num
- xor $carry,$carry
-
- add $A1[0],%rax # t[5]
- adc \$0,%rdx
- mov %rax,8($tptr) # t[5]
- mov %rdx,16($tptr) # t[6]
- mov $carry,24($tptr) # t[7]
-
- mov -16($aptr,$i),%rax # a[0]
- lea 64(%rsp,$num,2),$tptr
- xor $A0[0],$A0[0] # t[0]
- mov -24($tptr,$i,2),$A0[1] # t[1]
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],-24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 0($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],-16($tptr,$i,2)
- adc %rdx,$S[3]
- lea 16($i),$i
- mov $S[3],-40($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- jmp .Lsqr4x_shift_n_add
-
-.align 16
-.Lsqr4x_shift_n_add:
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],-24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 0($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 8($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 0($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],-16($tptr,$i,2)
- adc %rdx,$S[3]
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- mov $S[3],-8($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov 16($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 24($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov 8($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[0],0($tptr,$i,2)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1 | shift
- mov $S[1],8($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mov 32($tptr,$i,2),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov 40($tptr,$i,2),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[2]
- mov 16($aptr,$i),%rax # a[i+1] # prefetch
- mov $S[2],16($tptr,$i,2)
- adc %rdx,$S[3]
- mov $S[3],24($tptr,$i,2)
- sbb $carry,$carry # mov cf,$carry
- add \$32,$i
- jnz .Lsqr4x_shift_n_add
-
- lea ($shift,$A0[0],2),$S[0] # t[2*i]<<1 | shift
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[1] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[1] # | t[2*i]>>63
- mov -16($tptr),$A0[0] # t[2*i+2] # prefetch
- mov $A0[1],$shift # shift=t[2*i+1]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- mov -8($tptr),$A0[1] # t[2*i+2+1] # prefetch
- adc %rax,$S[0]
- mov -8($aptr),%rax # a[i+1] # prefetch
- mov $S[0],-32($tptr)
- adc %rdx,$S[1]
-
- lea ($shift,$A0[0],2),$S[2] # t[2*i]<<1|shift
- mov $S[1],-24($tptr)
- sbb $carry,$carry # mov cf,$carry
- shr \$63,$A0[0]
- lea ($j,$A0[1],2),$S[3] # t[2*i+1]<<1 |
- shr \$63,$A0[1]
- or $A0[0],$S[3] # | t[2*i]>>63
- mul %rax # a[i]*a[i]
- neg $carry # mov $carry,cf
- adc %rax,$S[2]
- adc %rdx,$S[3]
- mov $S[2],-16($tptr)
- mov $S[3],-8($tptr)