Pedantic polish to aes-586.pl:-)
[openssl.git] / crypto / aes / asm / aes-586.pl
index 0015a32..e279420 100755 (executable)
@@ -6,7 +6,7 @@
 # forms are granted according to the OpenSSL license.
 # ====================================================================
 #
-# Version 3.2.
+# Version 3.5.
 #
 # You might fail to appreciate this module performance from the first
 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
 # misaligned, which unfortunately has negative impact on elder IA-32
 # implementations, Pentium suffered 30% penalty, PIII - 10%.
 #
+# Version 3.3 avoids L1 cache aliasing between stack frame and
+# S-boxes, and 3.4 - L1 cache aliasing even between key schedule. The
+# latter is achieved by copying the key schedule to controlled place in
+# stack. This unfortunately has rather strong impact on small block CBC
+# performance, ~2x deterioration on 16-byte block if compared to 3.3.
+#
+# Version 3.5 checks if there is L1 cache aliasing between user-supplied
+# key schedule and S-boxes and abstains from copying the former if
+# there is none. This allows end-user to consciously retain small block
+# performance by aligning key schedule in specific manner.
+#
 # Current ECB performance numbers for 128-bit key in CPU cycles per
 # processed byte [measure commonly used by AES benchmarkers] are:
 #
@@ -81,6 +92,12 @@ $s3="edx";
 $key="edi";
 $acc="esi";
 
+$compromise=0;         # $compromise=128 abstains from copying key
+                       # schedule to stack when encrypting inputs
+                       # shorter than 128 bytes at the cost of
+                       # risking aliasing with S-boxes. In return
+                       # you get way better, up to +70%, small block
+                       # performance.
 $small_footprint=1;    # $small_footprint=1 code is ~5% slower [on
                        # recent µ-archs], but ~5 times smaller!
                        # I favor compact code to minimize cache
@@ -104,9 +121,9 @@ sub encvert()
   my $v0 = $acc, $v1 = $key;
 
        &mov    ($v0,$s[3]);                            # copy s3
-       &mov    (&DWP(0,"esp"),$s[2]);                  # save s2
+       &mov    (&DWP(4,"esp"),$s[2]);                  # save s2
        &mov    ($v1,$s[0]);                            # copy s0
-       &mov    (&DWP(4,"esp"),$s[1]);                  # save s1
+       &mov    (&DWP(8,"esp"),$s[1]);                  # save s1
 
        &movz   ($s[2],&HB($s[0]));
        &and    ($s[0],0xFF);
@@ -127,7 +144,7 @@ sub encvert()
        &movz   ($v0,&HB($v1));
        &and    ($v1,0xFF);
        &xor    ($s[1],&DWP(2,$te,$v1,8));              # s3>>16
-        &mov   ($v1,&DWP(0,"esp"));                    # restore s2
+        &mov   ($v1,&DWP(4,"esp"));                    # restore s2
        &xor    ($s[0],&DWP(1,$te,$v0,8));              # s3>>24
 
        &mov    ($v0,$v1);
@@ -139,7 +156,7 @@ sub encvert()
        &movz   ($v1,&HB($v0));
        &and    ($v0,0xFF);
        &xor    ($s[0],&DWP(2,$te,$v0,8));              # s2>>16
-        &mov   ($v0,&DWP(4,"esp"));                    # restore s1
+        &mov   ($v0,&DWP(8,"esp"));                    # restore s1
        &xor    ($s[3],&DWP(1,$te,$v1,8));              # s2>>24
 
        &mov    ($v1,$v0);
@@ -172,19 +189,19 @@ sub encstep()
                        &movz   ($tmp,&HB($s[1]));
                        &xor    ($out,&DWP(3,$te,$tmp,8));
 
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx
+       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
        else        {   &mov    ($tmp,$s[2]);
                        &shr    ($tmp,16);                      }
        if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
                        &and    ($tmp,0xFF);
                        &xor    ($out,&DWP(2,$te,$tmp,8));
 
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx
+       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
        elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
        else        {   &mov    ($tmp,$s[3]); 
                        &shr    ($tmp,24)                       }
                        &xor    ($out,&DWP(1,$te,$tmp,8));
-       if ($i<2)   {   &mov    (&DWP(4*$i,"esp"),$out);        }
+       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
        if ($i==3)  {   &mov    ($s[3],$acc);                   }
                        &comment();
 }
@@ -208,7 +225,7 @@ sub enclast()
                        &and    ($tmp,0x0000ff00);
                        &xor    ($out,$tmp);
 
-       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],&DWP(0,"esp")); }##%ebx
+       if ($i==3)  {   $tmp=$s[2]; &mov ($s[1],&DWP(4,"esp")); }##%ebx
        else        {   mov     ($tmp,$s[2]);
                        &shr    ($tmp,16);                      }
        if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
@@ -217,14 +234,14 @@ sub enclast()
                        &and    ($tmp,0x00ff0000);
                        &xor    ($out,$tmp);
 
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }##%ecx
+       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }##%ecx
        elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
        else        {   &mov    ($tmp,$s[3]);
                        &shr    ($tmp,24);                      }
                        &mov    ($tmp,&DWP(2,$te,$tmp,8));
                        &and    ($tmp,0xff000000);
                        &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4*$i,"esp"),$out);        }
+       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
        if ($i==3)  {   &mov    ($s[3],$acc);                   }
 }
 
@@ -238,13 +255,8 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
                &mov    ($s2="esi",$acc="ecx");
        }
 
-       # allocate aligned stack frame
-       &mov    ($acc,"esp");
-       &sub    ("esp",20);
-       &and    ("esp",-16);
-
+       # note that caller is expected to allocate stack frame for me!
        &mov    (&DWP(12,"esp"),$key);          # save key
-       &mov    (&DWP(16,"esp"),$acc);          # save %esp
 
        &xor    ($s0,&DWP(0,$key));             # xor with key
        &xor    ($s1,&DWP(4,$key));
@@ -256,7 +268,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
        if ($small_footprint) {
            &lea        ($acc,&DWP(-2,$acc,$acc));
            &lea        ($acc,&DWP(0,$key,$acc,8));
-           &mov        (&DWP(8,"esp"),$acc);   # end of key schedule
+           &mov        (&DWP(16,"esp"),$acc);  # end of key schedule
            &align      (4);
            &set_label("loop");
                if ($vertical_spin) {
@@ -267,12 +279,12 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
                    &encstep(2,"ebp",$s2,$s3,$s0,$s1);
                    &encstep(3,"ebp",$s3,$s0,$s1,$s2);
                }
-               &add    ($key,16);                      # advance rd_key
+               &add    ($key,16);              # advance rd_key
                &xor    ($s0,&DWP(0,$key));
                &xor    ($s1,&DWP(4,$key));
                &xor    ($s2,&DWP(8,$key));
                &xor    ($s3,&DWP(12,$key));
-           &cmp        ($key,&DWP(8,"esp"));
+           &cmp        ($key,&DWP(16,"esp"));
            &mov        (&DWP(12,"esp"),$key);
            &jb         (&label("loop"));
        }
@@ -298,7 +310,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
                &xor    ($s3,&DWP(16*$i+12,$key));
            }
            &add        ($key,32);
-           &mov        (&DWP(12,"esp"),$key);          # advance rd_key
+           &mov        (&DWP(12,"esp"),$key);  # advance rd_key
        &set_label("12rounds");
            for ($i=1;$i<3;$i++) {
                if ($vertical_spin) {
@@ -315,7 +327,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
                &xor    ($s3,&DWP(16*$i+12,$key));
            }
            &add        ($key,32);
-           &mov        (&DWP(12,"esp"),$key);          # advance rd_key
+           &mov        (&DWP(12,"esp"),$key);  # advance rd_key
        &set_label("10rounds");
            for ($i=1;$i<10;$i++) {
                if ($vertical_spin) {
@@ -343,7 +355,6 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
        &enclast(2,"ebp",$s2,$s3,$s0,$s1);
        &enclast(3,"ebp",$s3,$s0,$s1,$s2);
 
-       &mov    ("esp",&DWP(16,"esp"));         # restore %esp
        &add    ($key,$small_footprint?16:160);
        &xor    ($s0,&DWP(0,$key));
        &xor    ($s1,&DWP(4,$key));
@@ -420,7 +431,7 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
 #rcon:
        &data_word(0x00000001, 0x00000002, 0x00000004, 0x00000008);
        &data_word(0x00000010, 0x00000020, 0x00000040, 0x00000080);
-       &data_word(0x0000001b, 0x00000036);
+       &data_word(0x0000001b, 0x00000036, 0, 0, 0, 0, 0, 0);
 &function_end_B("_x86_AES_encrypt");
 
 # void AES_encrypt (const void *inp,void *out,const AES_KEY *key);
@@ -429,6 +440,12 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
        &mov    ($acc,&wparam(0));              # load inp
        &mov    ($key,&wparam(2));              # load key
 
+       &mov    ($s0,"esp");
+       &sub    ("esp",24);
+       &and    ("esp",-64);
+       &add    ("esp",4);
+       &mov    (&DWP(16,"esp"),$s0);
+
        &call   (&label("pic_point"));          # make it PIC!
        &set_label("pic_point");
        &blindpop("ebp");
@@ -441,6 +458,8 @@ sub _data_word() { my $i; while(defined($i=shift)) { &data_word($i,$i); } }
 
        &call   ("_x86_AES_encrypt");
 
+       &mov    ("esp",&DWP(16,"esp"));
+
        &mov    ($acc,&wparam(1));              # load out
        &mov    (&DWP(0,$acc),$s0);             # write output data
        &mov    (&DWP(4,$acc),$s1);
@@ -474,12 +493,12 @@ sub decstep()
                        &and    ($tmp,0xFF);
                        &xor    ($out,&DWP(2,$td,$tmp,8));
 
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
+       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
        else        {   &mov    ($tmp,$s[3]);                   }
                        &shr    ($tmp,24);
                        &xor    ($out,&DWP(1,$td,$tmp,8));
-       if ($i<2)   {   &mov    (&DWP(4*$i,"esp"),$out);        }
-       if ($i==3)  {   &mov    ($s[3],&DWP(0,"esp"));          }
+       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
+       if ($i==3)  {   &mov    ($s[3],&DWP(4,"esp"));          }
                        &comment();
 }
 
@@ -508,25 +527,20 @@ sub declast()
                        &and    ($tmp,0x00ff0000);
                        &xor    ($out,$tmp);
 
-       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(4,"esp")); }
+       if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
        else        {   &mov    ($tmp,$s[3]);                   }
                        &shr    ($tmp,24);
                        &mov    ($tmp,&DWP(2048,$td,$tmp,4));
                        &and    ($tmp,0xff000000);
                        &xor    ($out,$tmp);
-       if ($i<2)   {   &mov    (&DWP(4*$i,"esp"),$out);        }
-       if ($i==3)  {   &mov    ($s[3],&DWP(0,"esp"));          }
+       if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
+       if ($i==3)  {   &mov    ($s[3],&DWP(4,"esp"));          }
 }
 
 &public_label("AES_Td");
 &function_begin_B("_x86_AES_decrypt");
-       # allocate aligned stack frame
-       &mov    ($acc,"esp");
-       &sub    ("esp",20);
-       &and    ("esp",-16);
-
+       # note that caller is expected to allocate stack frame for me!
        &mov    (&DWP(12,"esp"),$key);          # save key
-       &mov    (&DWP(16,"esp"),$acc);          # save %esp
 
        &xor    ($s0,&DWP(0,$key));             # xor with key
        &xor    ($s1,&DWP(4,$key));
@@ -538,19 +552,19 @@ sub declast()
        if ($small_footprint) {
            &lea        ($acc,&DWP(-2,$acc,$acc));
            &lea        ($acc,&DWP(0,$key,$acc,8));
-           &mov        (&DWP(8,"esp"),$acc);   # end of key schedule
+           &mov        (&DWP(16,"esp"),$acc);  # end of key schedule
            &align      (4);
            &set_label("loop");
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
                &decstep(1,"ebp",$s1,$s0,$s3,$s2);
                &decstep(2,"ebp",$s2,$s1,$s0,$s3);
                &decstep(3,"ebp",$s3,$s2,$s1,$s0);
-               &add    ($key,16);                      # advance rd_key
+               &add    ($key,16);              # advance rd_key
                &xor    ($s0,&DWP(0,$key));
                &xor    ($s1,&DWP(4,$key));
                &xor    ($s2,&DWP(8,$key));
                &xor    ($s3,&DWP(12,$key));
-           &cmp        ($key,&DWP(8,"esp"));
+           &cmp        ($key,&DWP(16,"esp"));
            &mov        (&DWP(12,"esp"),$key);
            &jb         (&label("loop"));
        }
@@ -572,7 +586,7 @@ sub declast()
                &xor    ($s3,&DWP(16*$i+12,$key));
            }
            &add        ($key,32);
-           &mov        (&DWP(12,"esp"),$key);          # advance rd_key
+           &mov        (&DWP(12,"esp"),$key);  # advance rd_key
        &set_label("12rounds");
            for ($i=1;$i<3;$i++) {
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
@@ -585,7 +599,7 @@ sub declast()
                &xor    ($s3,&DWP(16*$i+12,$key));
            }
            &add        ($key,32);
-           &mov        (&DWP(12,"esp"),$key);          # advance rd_key
+           &mov        (&DWP(12,"esp"),$key);  # advance rd_key
        &set_label("10rounds");
            for ($i=1;$i<10;$i++) {
                &decstep(0,"ebp",$s0,$s3,$s2,$s1);
@@ -604,7 +618,6 @@ sub declast()
        &declast(2,"ebp",$s2,$s1,$s0,$s3);
        &declast(3,"ebp",$s3,$s2,$s1,$s0);
 
-       &mov    ("esp",&DWP(16,"esp"));         # restore %esp
        &add    ($key,$small_footprint?16:160);
        &xor    ($s0,&DWP(0,$key));
        &xor    ($s1,&DWP(4,$key));
@@ -751,6 +764,12 @@ sub declast()
        &mov    ($acc,&wparam(0));              # load inp
        &mov    ($key,&wparam(2));              # load key
 
+       &mov    ($s0,"esp");
+       &sub    ("esp",24);
+       &and    ("esp",-64);
+       &add    ("esp",4);
+       &mov    (&DWP(16,"esp"),$s0);
+
        &call   (&label("pic_point"));          # make it PIC!
        &set_label("pic_point");
        &blindpop("ebp");
@@ -763,6 +782,8 @@ sub declast()
 
        &call   ("_x86_AES_decrypt");
 
+       &mov    ("esp",&DWP(16,"esp"));
+
        &mov    ($acc,&wparam(1));              # load out
        &mov    (&DWP(0,$acc),$s0);             # write output data
        &mov    (&DWP(4,$acc),$s1);
@@ -772,7 +793,25 @@ sub declast()
 
 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
 #                      size_t length, const AES_KEY *key,
-#                      unsigned char *ivp,const int enc); 
+#                      unsigned char *ivp,const int enc);
+{
+# stack frame layout
+# -4(%esp)     0(%esp)         return address
+# 0(%esp)      4(%esp)         tmp1
+# 4(%esp)      8(%esp)         tmp2
+# 8(%esp)      12(%esp)        key
+# 12(%esp)     16(%esp)        end of key schedule
+my $_esp=&DWP(16,"esp");       #saved %esp
+my $_inp=&DWP(20,"esp");       #copy of wparam(0)
+my $_out=&DWP(24,"esp");       #copy of wparam(1)
+my $_len=&DWP(28,"esp");       #copy of wparam(2)
+my $_key=&DWP(32,"esp");       #copy of wparam(3)
+my $_ivp=&DWP(36,"esp");       #copy of wparam(4)
+my $_tmp=&DWP(40,"esp");       #volatile variable
+my $ivec=&DWP(44,"esp");       #ivec[16]
+my $aes_key=&DWP(60,"esp");    #copy of aes_key
+my $mark=&DWP(60+240,"esp");   #copy of aes_key->rounds
+
 &public_label("AES_Te");
 &public_label("AES_Td");
 &function_begin("AES_cbc_encrypt");
@@ -780,29 +819,98 @@ sub declast()
        &cmp    ($s2,0);
        &je     (&label("enc_out"));
 
-       &call   (&label("pic_point"));          # make it PIC!
+       &call   (&label("pic_point"));          # make it PIC!
        &set_label("pic_point");
        &blindpop("ebp");
 
+       &pushf  ();
+       &cld    ();
+
        &cmp    (&wparam(5),0);
        &je     (&label("DECRYPT"));
 
        &lea    ("ebp",&DWP(&label("AES_Te")."-".&label("pic_point"),"ebp"));
 
-       &mov    ($acc,&wparam(0));              # load inp
-       &mov    ($key,&wparam(4));              # load ivp
+       # allocate aligned stack frame: pick a candidate frame address in $key...
+       &lea    ($key,&DWP(-64-244,"esp"));
+       &and    ($key,-64);
+
+       # ... and make sure it doesn't alias with AES_Te modulo 4096
+       &mov    ($s0,"ebp");
+       &lea    ($s1,&DWP(2048,"ebp"));
+       &mov    ($s3,$key);
+       &and    ($s0,0xfff);            # s = %ebp&0xfff, table start within page
+       &and    ($s1,0xfff);            # e = (%ebp+2048)&0xfff, table end
+       &and    ($s3,0xfff);            # p = candidate frame ($key)&0xfff
+
+       &cmp    ($s3,$s1);              # if (p>=e) frame -= (p-e);
+       &jb     (&label("te_break_out"));
+       &sub    ($s3,$s1);
+       &sub    ($key,$s3);
+       &jmp    (&label("te_ok"));
+       &set_label("te_break_out");     # else frame -= ((p-s)&0xfff) + framesz;
+       &sub    ($s3,$s0);
+       &and    ($s3,0xfff);
+       &add    ($s3,64+256);
+       &sub    ($key,$s3);
+       &align  (4);
+       &set_label("te_ok");
+
+       &mov    ($s0,&wparam(0));       # load inp
+       &mov    ($s1,&wparam(1));       # load out
+       &mov    ($s3,&wparam(3));       # load key
+       &mov    ($acc,&wparam(4));      # load ivp
+
+       &exch   ("esp",$key);
+       &add    ("esp",4);              # reserve for return address!
+       &mov    ($_esp,$key);           # save %esp
+
+       &mov    ($_inp,$s0);            # save copy of inp
+       &mov    ($_out,$s1);            # save copy of out
+       &mov    ($_len,$s2);            # save copy of len
+       &mov    ($_key,$s3);            # save copy of key
+       &mov    ($_ivp,$acc);           # save copy of ivp
+
+       &mov    ($mark,0);              # copy of aes_key->rounds = 0;
+       if ($compromise) {
+               &cmp    ($s2,$compromise);
+               &jb     (&label("skip_ecopy"));
+       }
+       # do we copy key schedule to stack?
+       &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
+       &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
+       &sub    ($s1,"ebp");
+       &mov    ("esi",$s3);
+       &and    ($s1,0xfff);
+       &lea    ("edi",$aes_key);
+       &cmp    ($s1,2048);
+       &jb     (&label("do_ecopy"));
+       &cmp    ($s1,4096-244);
+       &jb     (&label("skip_ecopy"));
+       &align  (4);
+       &set_label("do_ecopy");
+               &mov    ($_key,"edi");
+               &data_word(0xA5F3F689); # rep movsd
+       &set_label("skip_ecopy");
 
+       &mov    ($acc,$s0);
+       &mov    ($key,16);
+       &align  (4);
+       &set_label("prefetch_te");
+               &mov    ($s0,&DWP(0,"ebp"));
+               &mov    ($s1,&DWP(32,"ebp"));
+               &mov    ($s2,&DWP(64,"ebp"));
+               &mov    ($s3,&DWP(96,"ebp"));
+               &lea    ("ebp",&DWP(128,"ebp"));
+               &dec    ($key);
+       &jnz    (&label("prefetch_te"));
+       &sub    ("ebp",2048);
+
+       &mov    ($s2,$_len);
+       &mov    ($key,$_ivp);
        &test   ($s2,0xFFFFFFF0);
        &jz     (&label("enc_tail"));           # short input...
 
-       # prefetch AES_Te
-       for ($i=0;$i<2048;$i+=128)
-       {       &mov    ($s0,&DWP($i+0,"ebp"));
-               &mov    ($s1,&DWP($i+32,"ebp"));
-               &mov    ($s2,&DWP($i+64,"ebp"));
-               &mov    ($s3,&DWP($i+96,"ebp"));
-       }
-
        &mov    ($s0,&DWP(0,$key));             # load iv
        &mov    ($s1,&DWP(4,$key));
 
@@ -811,141 +919,210 @@ sub declast()
                &mov    ($s2,&DWP(8,$key));
                &mov    ($s3,&DWP(12,$key));
 
-               &xor    ($s0,&DWP(0,$acc));             # xor input data
+               &xor    ($s0,&DWP(0,$acc));     # xor input data
                &xor    ($s1,&DWP(4,$acc));
                &xor    ($s2,&DWP(8,$acc));
                &xor    ($s3,&DWP(12,$acc));
 
-               &mov    ($key,&wparam(3));              # load key
+               &mov    ($key,$_key);           # load key
                &call   ("_x86_AES_encrypt");
 
-               &mov    ($acc,&wparam(0));              # load inp
-               &mov    ($key,&wparam(1));              # load out
+               &mov    ($acc,$_inp);           # load inp
+               &mov    ($key,$_out);           # load out
 
-               &mov    (&DWP(0,$key),$s0);             # save output data
+               &mov    (&DWP(0,$key),$s0);     # save output data
                &mov    (&DWP(4,$key),$s1);
                &mov    (&DWP(8,$key),$s2);
                &mov    (&DWP(12,$key),$s3);
 
-               &mov    ($s2,&wparam(2));               # load len
+               &mov    ($s2,$_len);            # load len
 
                &lea    ($acc,&DWP(16,$acc));
-               &mov    (&wparam(0),$acc);              # save inp
+               &mov    ($_inp,$acc);           # save inp
 
                &lea    ($s3,&DWP(16,$key));
-               &mov    (&wparam(1),$s3);               # save out
+               &mov    ($_out,$s3);            # save out
 
                &sub    ($s2,16);
                &test   ($s2,0xFFFFFFF0);
-               &mov    (&wparam(2),$s2);               # save len
+               &mov    ($_len,$s2);            # save len
        &jnz    (&label("enc_loop"));
        &test   ($s2,15);
        &jnz    (&label("enc_tail"));
-       &mov    ($acc,&wparam(4));              # load ivp
-       &mov    ($s2,&DWP(8,$key));             # restore last dwords
+       &mov    ($acc,$_ivp);           # load ivp
+       &mov    ($s2,&DWP(8,$key));     # restore last dwords
        &mov    ($s3,&DWP(12,$key));
-       &mov    (&DWP(0,$acc),$s0);             # save iv
+       &mov    (&DWP(0,$acc),$s0);     # save ivec
        &mov    (&DWP(4,$acc),$s1);
        &mov    (&DWP(8,$acc),$s2);
        &mov    (&DWP(12,$acc),$s3);
+
+       &cmp    ($mark,0);              # was the key schedule copied? (0 = no)
+       &mov    ("edi",$_key);          # frame-relative, so read before %esp changes
+       &mov    ("esp",$_esp);          # restore caller's %esp; mov keeps cmp flags
+       &je     (&label("skip_ezero"));
+       # zero copy of key schedule
+       &mov    ("ecx",240/4);
+       &xor    ("eax","eax");
+       &align  (4);
+       &data_word(0xABF3F689); # rep stosd
+       &set_label("skip_ezero");       # ';' needed, else parsed as "... & popf()"
+       &popf   ();
     &set_label("enc_out");
        &function_end_A();
+       &pushf  ();                     # kludge, never executed
 
     &align     (4);
     &set_label("enc_tail");
        &push   ($key eq "edi" ? $key : "");    # push ivp
-       &pushf  ();
-       &mov    ($key,&wparam(1));              # load out
+       &mov    ($key,$_out);                   # load out
        &mov    ($s1,16);
        &sub    ($s1,$s2);
        &cmp    ($key,$acc);                    # compare with inp
        &je     (&label("enc_in_place"));
-       &data_word(0x90A4F3FC); # cld; rep movsb; nop   # copy input
+       &align  (4);
+       &data_word(0xA4F3F689); # rep movsb     # copy input
        &jmp    (&label("enc_skip_in_place"));
     &set_label("enc_in_place");
        &lea    ($key,&DWP(0,$key,$s2));
     &set_label("enc_skip_in_place");
        &mov    ($s2,$s1);
        &xor    ($s0,$s0);
-       &data_word(0x90AAF3FC); # cld; rep stosb; nop   # zero tail
-       &popf   ();
+       &align  (4);
+       &data_word(0xAAF3F689); # rep stosb     # zero tail
        &pop    ($key);                         # pop ivp
 
-       # prefetch AES_Te
-       for ($i=0;$i<2048;$i+=128)
-       {       &mov    ($s0,&DWP($i+0,"ebp"));
-               &mov    ($s1,&DWP($i+32,"ebp"));
-               &mov    ($s2,&DWP($i+64,"ebp"));
-               &mov    ($s3,&DWP($i+96,"ebp"));
-       }
-
-       &mov    ($acc,&wparam(1));              # output as input
+       &mov    ($acc,$_out);                   # output as input
        &mov    ($s0,&DWP(0,$key));
        &mov    ($s1,&DWP(4,$key));
-       &mov    (&wparam(2),16);                # len=16
+       &mov    ($_len,16);                     # len=16
        &jmp    (&label("enc_loop"));           # one more spin...
 
 #----------------------------- DECRYPT -----------------------------#
 &align (4);
 &set_label("DECRYPT");
-    &stack_push(5);                            # allocate temp + ivp
-
        &lea    ("ebp",&DWP(&label("AES_Td")."-".&label("pic_point"),"ebp"));
 
-       # prefetch AES_Td
-       for ($i=0;$i<3072;$i+=128)
-       {       &mov    ($s0,&DWP($i+0,"ebp"));
-               &mov    ($s1,&DWP($i+32,"ebp"));
-               &mov    ($s2,&DWP($i+64,"ebp"));
-               &mov    ($s3,&DWP($i+96,"ebp"));
+       # allocate aligned stack frame: pick a candidate frame address in $key...
+       &lea    ($key,&DWP(-64-244,"esp"));
+       &and    ($key,-64);
+
+       # ... and make sure it doesn't alias with AES_Td modulo 4096
+       &mov    ($s0,"ebp");
+       &lea    ($s1,&DWP(3072,"ebp"));
+       &mov    ($s3,$key);
+       &and    ($s0,0xfff);            # s = %ebp&0xfff, table start within page
+       &and    ($s1,0xfff);            # e = (%ebp+3072)&0xfff, table end
+       &and    ($s3,0xfff);            # p = candidate frame ($key)&0xfff
+
+       &cmp    ($s3,$s1);              # if (p>=e) frame -= (p-e);
+       &jb     (&label("td_break_out"));
+       &sub    ($s3,$s1);
+       &sub    ($key,$s3);
+       &jmp    (&label("td_ok"));
+       &set_label("td_break_out");     # else frame -= ((p-s)&0xfff) + framesz;
+       &sub    ($s3,$s0);
+       &and    ($s3,0xfff);
+       &add    ($s3,64+256);
+       &sub    ($key,$s3);
+       &align  (4);
+       &set_label("td_ok");
+
+       &mov    ($s0,&wparam(0));       # load inp
+       &mov    ($s1,&wparam(1));       # load out
+       &mov    ($s3,&wparam(3));       # load key
+       &mov    ($acc,&wparam(4));      # load ivp
+
+       &exch   ("esp",$key);
+       &add    ("esp",4);              # reserve for return address!
+       &mov    ($_esp,$key);           # save %esp
+
+       &mov    ($_inp,$s0);            # save copy of inp
+       &mov    ($_out,$s1);            # save copy of out
+       &mov    ($_len,$s2);            # save copy of len
+       &mov    ($_key,$s3);            # save copy of key
+       &mov    ($_ivp,$acc);           # save copy of ivp
+
+       &mov    ($mark,0);              # copy of aes_key->rounds = 0;
+       if ($compromise) {
+               &cmp    ($s2,$compromise);
+               &jb     (&label("skip_dcopy"));
        }
+       # do we copy key schedule to stack?
+       &mov    ($s1 eq "ebx" ? $s1 : "",$s3);
+       &mov    ($s2 eq "ecx" ? $s2 : "",244/4);
+       &sub    ($s1,"ebp");
+       &mov    ("esi",$s3);
+       &and    ($s1,0xfff);
+       &lea    ("edi",$aes_key);
+       &cmp    ($s1,3072);
+       &jb     (&label("do_dcopy"));
+       &cmp    ($s1,4096-244);
+       &jb     (&label("skip_dcopy"));
+       &align  (4);
+       &set_label("do_dcopy");
+               &mov    ($_key,"edi");
+               &data_word(0xA5F3F689); # rep movsd
+       &set_label("skip_dcopy");
 
-       &mov    ($acc,&wparam(0));              # load inp
-       &cmp    ($acc,&wparam(1));
+       &mov    ($acc,$s0);
+       &mov    ($key,24);
+       &align  (4);
+       &set_label("prefetch_td");
+               &mov    ($s0,&DWP(0,"ebp"));
+               &mov    ($s1,&DWP(32,"ebp"));
+               &mov    ($s2,&DWP(64,"ebp"));
+               &mov    ($s3,&DWP(96,"ebp"));
+               &lea    ("ebp",&DWP(128,"ebp"));
+               &dec    ($key);
+       &jnz    (&label("prefetch_td"));
+       &sub    ("ebp",3072);
+
+       &cmp    ($acc,$_out);
        &je     (&label("dec_in_place"));       # in-place processing...
 
-       &mov    ($key,&wparam(4));              # load ivp
-       &mov    (&swtmp(4),$key);
+       &mov    ($key,$_ivp);           # load ivp
+       &mov    ($_tmp,$key);
 
        &align  (4);
        &set_label("dec_loop");
-               &mov    ($s0,&DWP(0,$acc));             # read input
+               &mov    ($s0,&DWP(0,$acc));     # read input
                &mov    ($s1,&DWP(4,$acc));
                &mov    ($s2,&DWP(8,$acc));
                &mov    ($s3,&DWP(12,$acc));
 
-               &mov    ($key,&wparam(3));              # load key
+               &mov    ($key,$_key);           # load key
                &call   ("_x86_AES_decrypt");
 
-               &mov    ($key,&swtmp(4));               # load ivp
-               &mov    ($acc,&wparam(2));              # load len
-               &xor    ($s0,&DWP(0,$key));             # xor iv
+               &mov    ($key,$_tmp);           # load ivp
+               &mov    ($acc,$_len);           # load len
+               &xor    ($s0,&DWP(0,$key));     # xor iv
                &xor    ($s1,&DWP(4,$key));
                &xor    ($s2,&DWP(8,$key));
                &xor    ($s3,&DWP(12,$key));
 
                &sub    ($acc,16);
                &jc     (&label("dec_partial"));
-               &mov    (&wparam(2),$acc);              # save len
-               &mov    ($acc,&wparam(0));              # load inp
-               &mov    ($key,&wparam(1));              # load out
+               &mov    ($_len,$acc);           # save len
+               &mov    ($acc,$_inp);           # load inp
+               &mov    ($key,$_out);           # load out
 
-               &mov    (&DWP(0,$key),$s0);             # write output
+               &mov    (&DWP(0,$key),$s0);     # write output
                &mov    (&DWP(4,$key),$s1);
                &mov    (&DWP(8,$key),$s2);
                &mov    (&DWP(12,$key),$s3);
 
-               &mov    (&swtmp(4),$acc);               # save ivp
+               &mov    ($_tmp,$acc);           # save ivp
                &lea    ($acc,&DWP(16,$acc));
-               &mov    (&wparam(0),$acc);              # save inp
+               &mov    ($_inp,$acc);           # save inp
 
                &lea    ($key,&DWP(16,$key));
-               &mov    (&wparam(1),$key);              # save out
+               &mov    ($_out,$key);           # save out
 
        &jnz    (&label("dec_loop"));
-       &mov    ($key,&swtmp(4));       # load temp ivp
+       &mov    ($key,$_tmp);           # load temp ivp
     &set_label("dec_end");
-       &mov    ($acc,&wparam(4));      # load user ivp
+       &mov    ($acc,$_ivp);           # load user ivp
        &mov    ($s0,&DWP(0,$key));     # load iv
        &mov    ($s1,&DWP(4,$key));
        &mov    ($s2,&DWP(8,$key));
@@ -958,91 +1135,98 @@ sub declast()
 
     &align     (4);
     &set_label("dec_partial");
-       &lea    ($key,&swtmp(0));
+       &lea    ($key,$ivec);
        &mov    (&DWP(0,$key),$s0);     # dump output to stack
        &mov    (&DWP(4,$key),$s1);
        &mov    (&DWP(8,$key),$s2);
        &mov    (&DWP(12,$key),$s3);
        &lea    ($s2 eq "ecx" ? $s2 : "",&DWP(16,$acc));
        &mov    ($acc eq "esi" ? $acc : "",$key);
-       &mov    ($key eq "edi" ? $key : "",&wparam(1));
-       &pushf  ();
-       &data_word(0x90A4F3FC); # cld; rep movsb; nop   # copy output
-       &popf   ();
-       &mov    ($key,&wparam(0));      # load temp ivp
+       &mov    ($key eq "edi" ? $key : "",$_out);      # load out
+       &data_word(0xA4F3F689); # rep movsb             # copy output
+       &mov    ($key,$_inp);                           # use inp as temp ivp
        &jmp    (&label("dec_end"));
 
     &align     (4);
     &set_label("dec_in_place");
        &set_label("dec_in_place_loop");
-               &lea    ($key,&swtmp(0));
-               &mov    ($s0,&DWP(0,$acc));             # read input
+               &lea    ($key,$ivec);
+               &mov    ($s0,&DWP(0,$acc));     # read input
                &mov    ($s1,&DWP(4,$acc));
                &mov    ($s2,&DWP(8,$acc));
                &mov    ($s3,&DWP(12,$acc));
 
-               &mov    (&DWP(0,$key),$s0);             # copy to temp
+               &mov    (&DWP(0,$key),$s0);     # copy to temp
                &mov    (&DWP(4,$key),$s1);
                &mov    (&DWP(8,$key),$s2);
                &mov    (&DWP(12,$key),$s3);
 
-               &mov    ($key,&wparam(3));              # load key
+               &mov    ($key,$_key);           # load key
                &call   ("_x86_AES_decrypt");
 
-               &mov    ($key,&wparam(4));              # load ivp
-               &mov    ($acc,&wparam(1));              # load out
-               &xor    ($s0,&DWP(0,$key));             # xor iv
+               &mov    ($key,$_ivp);           # load ivp
+               &mov    ($acc,$_out);           # load out
+               &xor    ($s0,&DWP(0,$key));     # xor iv
                &xor    ($s1,&DWP(4,$key));
                &xor    ($s2,&DWP(8,$key));
                &xor    ($s3,&DWP(12,$key));
 
-               &mov    (&DWP(0,$acc),$s0);             # write output
+               &mov    (&DWP(0,$acc),$s0);     # write output
                &mov    (&DWP(4,$acc),$s1);
                &mov    (&DWP(8,$acc),$s2);
                &mov    (&DWP(12,$acc),$s3);
 
                &lea    ($acc,&DWP(16,$acc));
-               &mov    (&wparam(1),$acc);              # save out
+               &mov    ($_out,$acc);           # save out
 
-               &lea    ($acc,&swtmp(0));
-               &mov    ($s0,&DWP(0,$acc));             # read temp
+               &lea    ($acc,$ivec);
+               &mov    ($s0,&DWP(0,$acc));     # read temp
                &mov    ($s1,&DWP(4,$acc));
                &mov    ($s2,&DWP(8,$acc));
                &mov    ($s3,&DWP(12,$acc));
 
-               &mov    (&DWP(0,$key),$s0);             # copy iv
+               &mov    (&DWP(0,$key),$s0);     # copy iv
                &mov    (&DWP(4,$key),$s1);
                &mov    (&DWP(8,$key),$s2);
                &mov    (&DWP(12,$key),$s3);
 
-               &mov    ($acc,&wparam(0));              # load inp
+               &mov    ($acc,$_inp);           # load inp
 
                &lea    ($acc,&DWP(16,$acc));
-               &mov    (&wparam(0),$acc);              # save inp
+               &mov    ($_inp,$acc);           # save inp
 
-               &mov    ($s2,&wparam(2));               # load len
+               &mov    ($s2,$_len);            # load len
                &sub    ($s2,16);
                &jc     (&label("dec_in_place_partial"));
-               &mov    (&wparam(2),$s2);               # save len
+               &mov    ($_len,$s2);            # save len
        &jnz    (&label("dec_in_place_loop"));
        &jmp    (&label("dec_out"));
 
     &align     (4);
     &set_label("dec_in_place_partial");
        # one can argue if this is actually required...
-       &mov    ($key eq "edi" ? $key : "",&wparam(1));
-       &lea    ($acc eq "esi" ? $acc : "",&swtmp(0));
+       &mov    ($key eq "edi" ? $key : "",$_out);
+       &lea    ($acc eq "esi" ? $acc : "",$ivec);
        &lea    ($key,&DWP(0,$key,$s2));
        &lea    ($acc,&DWP(16,$acc,$s2));
        &neg    ($s2 eq "ecx" ? $s2 : "");
-       &pushf  ();
-       &data_word(0x90A4F3FC); # cld; rep movsb; nop   # restore tail
-       &popf   ();
+       &data_word(0xA4F3F689); # rep movsb     # restore tail
 
     &align     (4);
     &set_label("dec_out");
-    &stack_pop(5);
+    &cmp       ($mark,0);              # was the key schedule copied?
+    &mov       ("edi",$_key);
+    &mov       ("esp",$_esp);
+    &je                (&label("skip_dzero"));
+    # zero copy of key schedule
+    &mov       ("ecx",240/4);
+    &xor       ("eax","eax");
+    &align     (4);
+    &data_word(0xABF3F689);    # rep stosd
+    &set_label("skip_dzero");  # reached directly when $mark is 0, i.e. zeroing is bypassed
+    &popf      ();
 &function_end("AES_cbc_encrypt");
+}
 
 #------------------------------------------------------------------#