# NOTE(review): in this section every statement group appears twice in a row
# (the add, lea, unshift, mov/lea and MUL_128x128t128 comment lines below).
# If both copies are really present in the file, rdi is advanced by 8*10 bytes
# twice and the X array is rotated four times instead of two. Confirm against
# the upstream file before relying on this section.
# do first part (X2 = Xh * M2)
add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
# Xh is actually { [rdi+8*1], rbp }
# do first part (X2 = Xh * M2)
add \$8*10, %rdi # rdi -> pXh ; 128 bits, 2 qwords
# Xh is actually { [rdi+8*1], rbp }
# rcx = rsp-relative address of the X2 scratch slot (no memory access, lea only)
lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
___
# Perl-side bookkeeping: each unshift/pop pair moves the last element of the
# register-name list to the front, so this line rotates the list by two.
unshift(@X,pop(@X)); unshift(@X,pop(@X));
# NOTE(review): no heredoc opener is visible between the rotation above and the
# assembly text below; presumably lines are missing from this view -- confirm.
lea (+$Reduce_Data_offset+$X2_offset+$STACK_DEPTH)(%rsp), %rcx # rcx -> pX2 ; 641 bits, 11 qwords
___
unshift(@X,pop(@X)); unshift(@X,pop(@X));
# Store rax into the rsp-relative Carries slot, then point rdi at the Q slot.
# What rax holds at this point is set by code outside this view.
mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
mov %rax, (+$Reduce_Data_offset+$Carries_offset+$STACK_DEPTH)(%rsp)
lea (+$Reduce_Data_offset+$Q_offset+$STACK_DEPTH)(%rsp), %rdi # rdi -> pQ ; 128 bits, 2 qwords
# MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
# B1:B0 = rsi[1:0] = K1[1:0]
# MUL_128x128t128 rdi, rcx, rsi ; Q = X2 * K1 (bottom half)
# B1:B0 = rsi[1:0] = K1[1:0]