+&set_label("10rounds_alt",16);
+ &movdqa ("xmm5",&QWP(0x00,"ebx"));
+ &mov ($rounds,8);
+ &movdqa ("xmm4",&QWP(0x20,"ebx"));
+ &movdqa ("xmm2","xmm0");
+ &movdqu (&DWP(-16,$key),"xmm0");
+
+&set_label("loop_key128");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+ &lea ($key,&DWP(16,$key));
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(-16,$key),"xmm0");
+ &movdqa ("xmm2","xmm0");
+
+ &dec ($rounds);
+ &jnz (&label("loop_key128"));
+
+ &movdqa ("xmm4",&QWP(0x30,"ebx"));
+
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+ &pslld ("xmm4",1);
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(0,$key),"xmm0");
+
+ &movdqa ("xmm2","xmm0");
+ &pshufb ("xmm0","xmm5");
+ &aesenclast ("xmm0","xmm4");
+
+ &movdqa ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm3","xmm2");
+ &pslldq ("xmm2",4);
+ &pxor ("xmm2","xmm3");
+
+ &pxor ("xmm0","xmm2");
+ &movdqu (&QWP(16,$key),"xmm0");
+
+ &mov ($rounds,9);
+ &mov (&DWP(96,$key),$rounds);
+
+ &jmp (&label("good_key"));
+