+ # allocate aligned stack frame: the candidate address is %esp-44 rounded down to a 64-byte boundary; it stays in $key until %esp is switched
+ &lea ($key,&DWP(-44,"esp"));
+ &and ($key,-64);
+
+ # ... and make sure it doesn't alias with AES_Te modulo 4096 (%ebp is treated as the AES_Te address; it is set up outside this fragment)
+ &mov ($s1,"ebp");
+ &mov ($s3,$key);
+ &and ($s1,0xfff); # t = %ebp&0xfff
+ &and ($s3,0xfff); # p = $key&0xfff, i.e. the candidate %esp modulo 4096
+
+ &cmp ($s3,$s1); # if (p<t) goto ok (unsigned compare)
+ &jb (&label("te_ok"));
+ &lea ($acc,&DWP(2048,$s1)); # $acc = t+2048
+ &cmp ($s3,$acc); # if (p>=(t+2048)) goto ok
+ &jae (&label("te_ok"));
+ &sub ($s1,$s3); # t -= p (reached only when t<=p<t+2048)
+ &lea ($key,&DWP(-64,$key,$s1));# %esp -= (p-t)+64, which puts the candidate frame at (t-64) modulo 4096
+ &set_label("te_ok");
+
+ &mov ($s0,&wparam(0)); # load inp (all arguments are fetched before %esp is replaced below)
+ &mov ($s1,&wparam(1)); # load out
+ &mov ($s3,&wparam(3)); # load key (argument 2 is not read here; $s2 is stored as len below, so it is presumably loaded before this fragment)
+ &mov ($acc,&wparam(4)); # load ivp
+
+ &exch ("esp",$key); # %esp = new frame address, $key = previous %esp
+ &add ("esp",4); # reserve for return address! NOTE(review): presumably so a later call's pushed return address lands on the 64-byte boundary -- confirm
+ &mov ($_esp,$key); # save the pre-switch %esp; $_esp and the other $_* operands are defined outside this fragment (presumably slots in the new frame)
+
+ &mov ($_inp,$s0); # save copy of inp
+ &mov ($_out,$s1); # save copy of out
+ &mov ($_len,$s2); # save copy of len ($s2 is not written anywhere in this fragment before this point)
+ &mov ($_key,$s3); # save copy of key
+ &mov ($_ivp,$acc); # save copy of ivp
+
+ &mov ($acc,$s0); # keep inp in $acc: $s0..$s3 are overwritten by the table reads below
+ &mov ($key,16); # loop counter: 16 iterations x 128 bytes = 2048 bytes starting at %ebp
+ &align (4);
+ &set_label("prefetch_te");
+ &mov ($s0,&DWP(0,"ebp")); # read one dword every 32 bytes; the loaded values are not used in this fragment (presumably the reads only warm the cache)
+ &mov ($s1,&DWP(32,"ebp"));
+ &mov ($s2,&DWP(64,"ebp"));
+ &mov ($s3,&DWP(96,"ebp"));
+ &lea ("ebp",&DWP(128,"ebp")); # advance %ebp to the next 128-byte chunk
+ &dec ($key);
+ &jnz (&label("prefetch_te"));
+ &sub ("ebp",2048); # undo the 16*128 advance so %ebp is back at its pre-loop value
+
+ &mov ($s2,$_len); # reload len, which the loop above clobbered in $s2
+ &mov ($key,$_ivp); # $key = saved ivp