#
# (*) I've sketched even non-MMX assembler, but for the record
# I've failed to beat the Intel compiler on P4, without using
-* MMX that is...
+# MMX that is...
# (**) ... on AMD on the other hand non-MMX assembler was observed
# to perform significantly better, but I figured this MMX
# implementation is even faster anyway, so why bother? As for
for($i=0;$i<8;$i++) { &movq(@mm[$i],&QWP($i*8,"esi")); } # L=H
&set_label("outerloop");
- for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); } # K=H
+ for($i=0;$i<8;$i++) { &movq(&QWP($i*8,"esp"),@mm[$i]); } # K=L
for($i=0;$i<8;$i++) { &pxor(@mm[$i],&QWP($i*8,"edi")); } # L^=inp
- for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } #S=L
+ for($i=0;$i<8;$i++) { &movq(&QWP(64+$i*8,"esp"),@mm[$i]); } # S=L
&xor ("esi","esi"); # esi = 0; stored as the round counter below and used as the scaled index of the rc[r] load
&mov (&DWP(12,"ebx"),"esi"); # zero round counter (dword at offset 12 from ebx)
&set_label("round",16);
- &movq (@mm[0],&DWP(2048*$SCALE,$tbl,"esi",8)); # rc[r]
+ &movq (@mm[0],&QWP(2048*$SCALE,$tbl,"esi",8)); # rc[r]
&mov ("eax",&DWP(0,"esp"));
&mov ("ebx",&DWP(4,"esp"));
for($i=0;$i<8;$i++) {
&L(0xfb,0xee,0x7c,0x66,0xdd,0x17,0x47,0x9e);
&L(0xca,0x2d,0xbf,0x07,0xad,0x5a,0x83,0x33);
-&function_end_B("whrilpool_block_mmx");
+&function_end_B("whirlpool_block_mmx");
&asm_finish();