$code .= " mov %edx, %r12d /* (NEXT STEP) z' = %edx */\n" if ($pos == -1);
$code .= <<EOF;
not %r11d /* not z */
- lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $x, %r12d /* x & z */
+ lea $T_i($dst,%r10d),$dst /* Const + dst + ... */
and $y, %r11d /* y & (not z) */
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
or %r11d, %r12d /* (y & (not z)) | (x & z) */
# %r10d = X[k_next]
# %r11d = y' (copy of y for the next step)
# Each round3_step() takes about 4.2 clocks (8 instructions, 1.9 IPC)
+{ my $round3_alter=0;
sub round3_step
{
my ($pos, $dst, $x, $y, $z, $k_next, $T_i, $s) = @_;
mov $k_next*4(%rsi),%r10d /* (NEXT STEP) X[$k_next] */
xor $x, %r11d /* x ^ ... */
add %r11d, $dst /* dst += ... */
+EOF
+ $code .= <<EOF if ($round3_alter);
rol \$$s, $dst /* dst <<< s */
mov $x, %r11d /* (NEXT STEP) y' = $x */
+EOF
+ $code .= <<EOF if (!$round3_alter);
+ mov $x, %r11d /* (NEXT STEP) y' = $x */
+ rol \$$s, $dst /* dst <<< s */
+EOF
+ $code .= <<EOF;
add $x, $dst /* dst += x */
EOF
+ $round3_alter^=1;
+}
}
# round4_step() does:
die "can't locate x86_64-xlate.pl";
no warnings qw(uninitialized);
-open STDOUT,"| $^X $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour $output";
+*STDOUT=*OUT;
$code .= <<EOF;
.text