+ # enc_skip_in_place: zero-fill the remainder of the tail block, then jump
+ # back into slow_enc_loop_x86 with len forced to 16 so that the padded block
+ # is processed with the output buffer serving as its own input.
+ &set_label("enc_skip_in_place");
+ &mov ($s2,$s1); # count for the fill (rep stosb counts in ecx; presumably $s2 is ecx)
+ &xor ($s0,$s0); # fill byte = 0 (rep stosb stores al; presumably $s0 is eax)
+ &align (4);
+ &data_word(0xAAF3F689); # rep stosb # zero tail (bytes 89 F6 F3 AA: mov esi,esi; rep stosb)
+
+ # Reload the IV pointer from its saved slot. It must not be derived from
+ # $s3 (the output pointer): nothing here guarantees that a previously
+ # written block sits 16 bytes below it.
+ &mov ($key,$_ivp); # restore ivp
+ &mov ($acc,$s3); # output as input
+ &mov ($s0,&DWP(0,$key)); # first two IV dwords; NOTE(review): the other two are presumably loaded at the loop head - confirm
+ &mov ($s1,&DWP(4,$key));
+ &mov ($_len,16); # len=16
+ &jmp (&label("slow_enc_loop_x86")); # one more spin...
+
+#--------------------------- SLOW DECRYPT ---------------------------#
+&set_label("slow_decrypt",16); # decrypt entry: choose between the mm-register loop and the integer loop
+ if (!$x86only) { # generator-time switch: this path is emitted only when $x86only is false
+ &bt ($_tmp,25); # check for SSE bit
+ &jnc (&label("slow_dec_loop_x86")); # bit clear: use the integer loop instead
+ # Per pass: decrypt one 16-byte block held in mm0/mm4, XOR it with the IV, and make the input block the next IV.
+ &set_label("slow_dec_loop_sse",4);
+ &movq ("mm0",&QWP(0,$acc)); # read input
+ &movq ("mm4",&QWP(8,$acc));
+
+ &mov ($key,$_key); # load key
+ &call ("_sse_AES_decrypt_compact"); # presumably takes and returns the block in mm0/mm4 - confirm against the callee
+ # Reload pointers and length from their saved copies after the call.
+ &mov ($acc,$_inp); # load inp
+ &lea ($s0,$ivec); # address of the $ivec temp area; only the partial path below stores through it
+ &mov ($s1,$_out); # load out
+ &mov ($s2,$_len); # load len
+ &mov ($key,$_ivp); # load ivp
+
+ &movq ("mm1",&QWP(0,$acc)); # re-read input
+ &movq ("mm5",&QWP(8,$acc));
+
+ &pxor ("mm0",&QWP(0,$key)); # xor iv
+ &pxor ("mm4",&QWP(8,$key));
+ # The IV is replaced before the output is stored, using the copy of the input taken just above.
+ &movq (&QWP(0,$key),"mm1"); # copy input to iv
+ &movq (&QWP(8,$key),"mm5");
+
+ &sub ($s2,16); # decrease len
+ &jc (&label("slow_dec_partial_sse")); # borrow: fewer than 16 bytes were left
+
+ &movq (&QWP(0,$s1),"mm0"); # write output
+ &movq (&QWP(8,$s1),"mm4");
+
+ &lea ($s1,&DWP(16,$s1)); # advance out
+ &mov ($_out,$s1); # save out
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ &mov ($_len,$s2); # save len
+ &jnz (&label("slow_dec_loop_sse")); # ZF still comes from the sub above; movq/lea/mov do not write flags
+ &emms (); # clear MMX state now that the mm registers are no longer needed
+ &mov ("esp",$_esp); # switch back to the stack pointer kept in $_esp
+ &popf (); # NOTE(review): presumably pairs with a pushf earlier in the function - not visible here
+ &function_end_A(); # NOTE(review): presumably emits the shared register-restore/return sequence - confirm
+ &pushf (); # kludge, never executed
+ # Tail of fewer than 16 bytes: park the whole decrypted block in the temp area, then copy out only the remaining bytes.
+ &set_label("slow_dec_partial_sse",16);
+ &movq (&QWP(0,$s0),"mm0"); # save output to temp
+ &movq (&QWP(8,$s0),"mm4");
+ &emms (); # done with the mm registers
+
+ &add ($s2 eq "ecx" ? "ecx":"",16); # undo the sub so ecx = bytes remaining; the operand is empty unless $s2 is ecx, which rep movsb requires
+ &mov ("edi",$s1); # out
+ &mov ("esi",$s0); # temp
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp); # same exit sequence as the full-block path above
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+ } # end if (!$x86only)
+ # slow_dec_loop_x86: integer-register decrypt loop, one 16-byte block per pass.
+ # Each pass stashes the input block in the $ivec temp area, decrypts it, XORs
+ # the result with the current IV, writes it out, and then copies the stashed
+ # input over the IV for the next block. A remainder of fewer than 16 bytes
+ # leaves through slow_dec_partial_x86.
+ &set_label("slow_dec_loop_x86",16);
+ &mov ($s0,&DWP(0,$acc)); # read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &lea ($key,$ivec);
+ &mov (&DWP(0,$key),$s0); # copy to temp
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($key,$_key); # load key
+ &call ("_x86_AES_decrypt_compact"); # presumably takes and returns the block in $s0..$s3 - confirm against the callee
+
+ &mov ($key,$_ivp); # load ivp
+ &mov ($acc,$_len); # load len
+ &xor ($s0,&DWP(0,$key)); # xor iv
+ &xor ($s1,&DWP(4,$key));
+ &xor ($s2,&DWP(8,$key));
+ &xor ($s3,&DWP(12,$key));
+
+ &sub ($acc,16); # CF set: fewer than 16 bytes were left; ZF set: this was the last full block
+ &jc (&label("slow_dec_partial_x86"));
+
+ &mov ($_len,$acc); # save len
+ &mov ($acc,$_out); # load out
+
+ &mov (&DWP(0,$acc),$s0); # write output
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &lea ($acc,&DWP(16,$acc)); # advance out
+ &mov ($_out,$acc); # save out
+
+ &lea ($acc,$ivec);
+ &mov ($s0,&DWP(0,$acc)); # read temp
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ($acc,$_inp); # load inp
+ &lea ($acc,&DWP(16,$acc)); # advance inp
+ &mov ($_inp,$acc); # save inp
+ # $_len already holds the updated length (stored right after the sub).
+ # $s2 carries block data at this point, so it must not be written to $_len.
+ # mov and lea leave EFLAGS untouched, so ZF below is still the sub's result.
+ &jnz (&label("slow_dec_loop_x86"));
+ &mov ("esp",$_esp); # switch back to the stack pointer kept in $_esp
+ &popf ();
+ &function_end_A();
+ &pushf (); # kludge, never executed
+
+ &set_label("slow_dec_partial_x86",16);
+ &lea ($acc,$ivec);
+ &mov (&DWP(0,$acc),$s0); # save output to temp
+ &mov (&DWP(4,$acc),$s1);
+ &mov (&DWP(8,$acc),$s2);
+ &mov (&DWP(12,$acc),$s3);
+
+ &mov ($acc,$_inp);
+ &mov ($s0,&DWP(0,$acc)); # re-read input
+ &mov ($s1,&DWP(4,$acc));
+ &mov ($s2,&DWP(8,$acc));
+ &mov ($s3,&DWP(12,$acc));
+
+ &mov (&DWP(0,$key),$s0); # copy it to iv
+ &mov (&DWP(4,$key),$s1);
+ &mov (&DWP(8,$key),$s2);
+ &mov (&DWP(12,$key),$s3);
+
+ &mov ("ecx",$_len);
+ &mov ("edi",$_out);
+ &lea ("esi",$ivec);
+ &align (4);
+ &data_word(0xA4F3F689); # rep movsb # copy partial output
+
+ &mov ("esp",$_esp);
+ &popf ();