Switch to compact S-box when generating AES key schedule.
author Andy Polyakov <appro@openssl.org>
Wed, 2 Aug 2006 07:46:56 +0000 (07:46 +0000)
committer Andy Polyakov <appro@openssl.org>
Wed, 2 Aug 2006 07:46:56 +0000 (07:46 +0000)
crypto/aes/asm/aes-586.pl

index 07243ab4482cf08b39525eae19bdfe6cfca6d1e0..f6e487b1578252c7711a9656454250fafd112690 100755 (executable)
@@ -6,7 +6,7 @@
 # forms are granted according to the OpenSSL license.
 # ====================================================================
 #
-# Version 4.0.
+# Version 4.1.
 #
 # You might fail to appreciate this module performance from the first
 # try. If compared to "vanilla" linux-ia32-icc target, i.e. considered
 # byte for 128-bit key.
 #
 #              ECB encrypt     ECB decrypt     CBC large chunk
-# P4           57[60]          84[100]         23
+# P4           56[60]          84[100]         23
 # AMD K8       48[44]          70[79]          18
 # PIII         41[50]          61[91]          24
 # Pentium      120             160             77
+#
+# Version 4.1 switches to compact S-box even in key schedule setup.
 
 push(@INC,"perlasm","../../perlasm");
 require "x86asm.pl";
@@ -263,56 +265,56 @@ sub enchoriz()
 # *all* references to stack, it's not faster...
 sub mmx_encbody()
 {
-       &movz   ("esi",&LB("eax"));             #  0
-       &mov    ("ecx",&DWP(0,$tbl,"esi",8));   #  0
+       &movz   ($acc,&LB("eax"));              #  0
+       &mov    ("ecx",&DWP(0,$tbl,$acc,8));    #  0
        &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
        &movz   ("edx",&HB("eax"));             #  1
        &mov    ("edx",&DWP(3,$tbl,"edx",8));   #  1
        &shr    ("eax",16);                     #  5, 4
 
-       &movz   ("esi",&LB("ebx"));             # 10
-       &xor    ("ecx",&DWP(2,$tbl,"esi",8));   # 10
+       &movz   ($acc,&LB("ebx"));              # 10
+       &xor    ("ecx",&DWP(2,$tbl,$acc,8));    # 10
        &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &movz   ("esi",&HB("ebx"));             # 11
-       &xor    ("edx",&DWP(1,$tbl,"esi",8));   # 11
+       &movz   ($acc,&HB("ebx"));              # 11
+       &xor    ("edx",&DWP(1,$tbl,$acc,8));    # 11
        &shr    ("ebx",16);                     # 15,14
 
-       &movz   ("esi",&HB("eax"));             #  5
-       &xor    ("ecx",&DWP(3,$tbl,"esi",8));   #  5
+       &movz   ($acc,&HB("eax"));              #  5
+       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    #  5
        &movq   ("mm3",QWP(16,$key));
-       &movz   ("esi",&HB("ebx"));             # 15
-       &xor    ("ecx",&DWP(1,$tbl,"esi",8));   # 15
+       &movz   ($acc,&HB("ebx"));              # 15
+       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    # 15
        &movd   ("mm0","ecx");                  # t[0] collected
 
-       &movz   ("esi",&LB("eax"));             #  4
-       &mov    ("ecx",&DWP(0,$tbl,"esi",8));   #  4
+       &movz   ($acc,&LB("eax"));              #  4
+       &mov    ("ecx",&DWP(0,$tbl,$acc,8));    #  4
        &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movz   ("esi",&LB("ebx"));             # 14
-       &xor    ("ecx",&DWP(2,$tbl,"esi",8));   # 14
+       &movz   ($acc,&LB("ebx"));              # 14
+       &xor    ("ecx",&DWP(2,$tbl,$acc,8));    # 14
        &movd   ("ebx","mm6");                  # 13,12, 9, 8
 
-       &movz   ("esi",&HB("eax"));             #  3
-       &xor    ("ecx",&DWP(1,$tbl,"esi",8));   #  3
-       &movz   ("esi",&HB("ebx"));             #  9
-       &xor    ("ecx",&DWP(3,$tbl,"esi",8));   #  9
+       &movz   ($acc,&HB("eax"));              #  3
+       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    #  3
+       &movz   ($acc,&HB("ebx"));              #  9
+       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    #  9
        &movd   ("mm1","ecx");                  # t[1] collected
 
-       &movz   ("esi",&LB("eax"));             #  2
-       &mov    ("ecx",&DWP(2,$tbl,"esi",8));   #  2
+       &movz   ($acc,&LB("eax"));              #  2
+       &mov    ("ecx",&DWP(2,$tbl,$acc,8));    #  2
        &shr    ("eax",16);                     #  7, 6
        &punpckldq      ("mm0","mm1");          # t[0,1] collected
-       &movz   ("esi",&LB("ebx"));             #  8
-       &xor    ("ecx",&DWP(0,$tbl,"esi",8));   #  8
+       &movz   ($acc,&LB("ebx"));              #  8
+       &xor    ("ecx",&DWP(0,$tbl,$acc,8));    #  8
        &shr    ("ebx",16);                     # 13,12
 
-       &movz   ("esi",&HB("eax"));             #  7
-       &xor    ("ecx",&DWP(1,$tbl,"esi",8));   #  7
+       &movz   ($acc,&HB("eax"));              #  7
+       &xor    ("ecx",&DWP(1,$tbl,$acc,8));    #  7
        &pxor   ("mm0","mm3");
        &movz   ("eax",&LB("eax"));             #  6
        &xor    ("edx",&DWP(2,$tbl,"eax",8));   #  6
        &pshufw ("mm1","mm0",0x08);             #  5, 4, 1, 0
-       &movz   ("esi",&HB("ebx"));             # 13
-       &xor    ("ecx",&DWP(3,$tbl,"esi",8));   # 13
+       &movz   ($acc,&HB("ebx"));              # 13
+       &xor    ("ecx",&DWP(3,$tbl,$acc,8));    # 13
        &xor    ("ecx",&DWP(24,$key));          # t[2]
        &movd   ("mm4","ecx");                  # t[2] collected
        &movz   ("ebx",&LB("ebx"));             # 12
@@ -347,11 +349,11 @@ sub enccompact()
                        &and    ($out,0xFF);
        if ($i==1)  {   &shr    ($s[0],16);                     }#%ebx[1]
        if ($i==2)  {   &shr    ($s[0],24);                     }#%ecx[2]
-                       &movz   ($out,&DWP(-128,$te,$out,1));
+                       &movz   ($out,&BP(-128,$te,$out,1));
 
        if ($i==3)  {   $tmp=$s[1];                             }##%eax
                        &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&DWP(-128,$te,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
                        &shl    ($tmp,8);
                        &xor    ($out,$tmp);
 
@@ -360,7 +362,7 @@ sub enccompact()
                        &shr    ($tmp,16);                      }
        if ($i==2)  {   &and    ($s[1],0xFF);                   }#%edx[2]
                        &and    ($tmp,0xFF);
-                       &movz   ($tmp,&DWP(-128,$te,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
                        &shl    ($tmp,16);
                        &xor    ($out,$tmp);
 
@@ -368,7 +370,7 @@ sub enccompact()
        elsif($i==2){   &movz   ($tmp,&HB($s[3]));              }#%ebx[2]
        else        {   &mov    ($tmp,$s[3]);
                        &shr    ($tmp,24);                      }
-                       &movz   ($tmp,&DWP(-128,$te,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$te,$tmp,1));
                        &shl    ($tmp,24);
                        &xor    ($out,$tmp);
        if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
@@ -469,9 +471,9 @@ sub enctransform()
 #
 # Performance is not actually extraordinary in comparison to pure
 # x86 code. In particular encrypt performance is virtually the same.
-# same. Decrypt performance on the other hand is 15-20% better on
-# newer µ-archs [but we're thankful for *any* improvement here], and
-# ~50% better on PIII:-) And additionally on the pros side this code
+# Decrypt performance on the other hand is 15-20% better on newer
+# µ-archs [but we're thankful for *any* improvement here], and ~50%
+# better on PIII:-) And additionally on the pros side this code
 # eliminates redundant references to stack and thus relieves/
 # minimizes the pressure on the memory bus.
 #
@@ -516,80 +518,80 @@ sub mmx_enccompact()
        &movd   ("eax","mm1");                  #  5, 4, 1, 0
        &movd   ("ebx","mm5");                  # 15,14,11,10
 
-       &movz   ("esi",&LB("eax"));             #  0
-       &movz   ("ecx",&DWP(-128,$tbl,"esi",1));#  0
+       &movz   ($acc,&LB("eax"));              #  0
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
        &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
        &movz   ("edx",&HB("eax"));             #  1
-       &movz   ("edx",&DWP(-128,$tbl,"edx",1));#  1
+       &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
        &shl    ("edx",8);                      #  1
        &shr    ("eax",16);                     #  5, 4
 
-       &movz   ("esi",&LB("ebx"));             # 10
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 10
-       &shl    ("esi",16);                     # 10
-       &or     ("ecx","esi");                  # 10
+       &movz   ($acc,&LB("ebx"));              # 10
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &shl    ($acc,16);                      # 10
+       &or     ("ecx",$acc);                   # 10
        &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &movz   ("esi",&HB("ebx"));             # 11
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 11
-       &shl    ("esi",24);                     # 11
-       &or     ("edx","esi");                  # 11
+       &movz   ($acc,&HB("ebx"));              # 11
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &shl    ($acc,24);                      # 11
+       &or     ("edx",$acc);                   # 11
        &shr    ("ebx",16);                     # 15,14
 
-       &movz   ("esi",&HB("eax"));             #  5
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  5
-       &shl    ("esi",8);                      #  5
-       &or     ("ecx","esi");                  #  5
-       &movz   ("esi",&HB("ebx"));             # 15
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 15
-       &shl    ("esi",24);                     # 15
-       &or     ("ecx","esi");                  # 15
+       &movz   ($acc,&HB("eax"));              #  5
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  5
+       &shl    ($acc,8);                       #  5
+       &or     ("ecx",$acc);                   #  5
+       &movz   ($acc,&HB("ebx"));              # 15
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
+       &shl    ($acc,24);                      # 15
+       &or     ("ecx",$acc);                   # 15
        &movd   ("mm0","ecx");                  # t[0] collected
 
-       &movz   ("esi",&LB("eax"));             #  4
-       &movz   ("ecx",&DWP(-128,$tbl,"esi",1));#  4
+       &movz   ($acc,&LB("eax"));              #  4
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  4
        &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movz   ("esi",&LB("ebx"));             # 14
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 14
-       &shl    ("esi",16);                     # 14
-       &or     ("ecx","esi");                  # 14
+       &movz   ($acc,&LB("ebx"));              # 14
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
+       &shl    ($acc,16);                      # 14
+       &or     ("ecx",$acc);                   # 14
 
        &movd   ("ebx","mm6");                  # 13,12, 9, 8
-       &movz   ("esi",&HB("eax"));             #  3
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  3
-       &shl    ("esi",24);                     #  3
-       &or     ("ecx","esi");                  #  3
-       &movz   ("esi",&HB("ebx"));             #  9
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  9
-       &shl    ("esi",8);                      #  9
-       &or     ("ecx","esi");                  #  9
+       &movz   ($acc,&HB("eax"));              #  3
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  3
+       &shl    ($acc,24);                      #  3
+       &or     ("ecx",$acc);                   #  3
+       &movz   ($acc,&HB("ebx"));              #  9
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &shl    ($acc,8);                       #  9
+       &or     ("ecx",$acc);                   #  9
        &movd   ("mm1","ecx");                  # t[1] collected
 
-       &movz   ("esi",&LB("ebx"));             #  8
-       &movz   ("ecx",&DWP(-128,$tbl,"esi",1));#  8
+       &movz   ($acc,&LB("ebx"));              #  8
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  8
        &shr    ("ebx",16);                     # 13,12
-       &movz   ("esi",&LB("eax"));             #  2
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  2
-       &shl    ("esi",16);                     #  2
-       &or     ("ecx","esi");                  #  2
+       &movz   ($acc,&LB("eax"));              #  2
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
+       &shl    ($acc,16);                      #  2
+       &or     ("ecx",$acc);                   #  2
        &shr    ("eax",16);                     #  7, 6
 
        &punpckldq      ("mm0","mm1");          # t[0,1] collected
 
-       &movz   ("esi",&HB("eax"));             #  7
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  7
-       &shl    ("esi",24);                     #  7
-       &or     ("ecx","esi");                  #  7
+       &movz   ($acc,&HB("eax"));              #  7
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &shl    ($acc,24);                      #  7
+       &or     ("ecx",$acc);                   #  7
        &and    ("eax",0xff);                   #  6
-       &movz   ("eax",&DWP(-128,$tbl,"eax",1));#  6
+       &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  6
        &shl    ("eax",16);                     #  6
        &or     ("edx","eax");                  #  6
-       &movz   ("esi",&HB("ebx"));             # 13
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 13
-       &shl    ("esi",8);                      # 13
-       &or     ("ecx","esi");                  # 13
+       &movz   ($acc,&HB("ebx"));              # 13
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
+       &shl    ($acc,8);                       # 13
+       &or     ("ecx",$acc);                   # 13
        &movd   ("mm4","ecx");                  # t[2] collected
        &and    ("ebx",0xff);                   # 12
-       &movz   ("ebx",&DWP(-128,$tbl,"ebx",1));# 12
+       &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
        &or     ("edx","ebx");                  # 12
        &movd   ("mm5","edx");                  # t[3] collected
 
@@ -632,24 +634,22 @@ sub mmx_enccompact()
                &movq   ("mm1","mm0");          &movq   ("mm5","mm4");  # r0
                &pcmpgtb("mm3","mm0");          &pcmpgtb("mm7","mm4");
                &pand   ("mm3","mm2");          &pand   ("mm7","mm2");
-               &movq   ("mm2","mm0");          &movq   ("mm6","mm4");  # r0
+               &pshufw ("mm2","mm0",0xb1);     &pshufw ("mm6","mm4",0xb1);# ROTATE(r0,16)
                &paddb  ("mm0","mm0");          &paddb  ("mm4","mm4");
                &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # = r2
-               &movq   ("mm3","mm2");          &movq   ("mm7","mm6");
-               &pxor   ("mm1","mm0");          &pxor   ("mm5","mm4");  # r2^r0
+               &pshufw ("mm3","mm2",0xb1);     &pshufw ("mm7","mm6",0xb1);# r0
+               &pxor   ("mm1","mm0");          &pxor   ("mm5","mm4");  # r0^r2
+               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= ROTATE(r0,16)
 
+               &movq   ("mm2","mm3");          &movq   ("mm6","mm7");
                &pslld  ("mm3",8);              &pslld  ("mm7",8);
-               &psrld  ("mm2",16);             &psrld  ("mm6",16);
+               &psrld  ("mm2",24);             &psrld  ("mm6",24);
                &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= r0<<8
-               &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= r0>>16
-               &pslld  ("mm3",8);              &pslld  ("mm7",8);
-               &psrld  ("mm2",8);              &psrld  ("mm6",8);
-               &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= r0<<16
-               &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
                &pxor   ("mm0","mm2");          &pxor   ("mm4","mm6");  # ^= r0>>24
 
-               &psrld  ("mm1",8);              &psrld  ("mm5",8);
+               &movq   ("mm3","mm1");          &movq   ("mm7","mm5");
                &movq   ("mm2",&QWP(0,$key));   &movq   ("mm6",&QWP(8,$key));
+               &psrld  ("mm1",8);              &psrld  ("mm5",8);
                &pslld  ("mm3",24);             &pslld  ("mm7",24);
                &pxor   ("mm0","mm1");          &pxor   ("mm4","mm5");  # ^= (r2^r0)<<8
                &pxor   ("mm0","mm3");          &pxor   ("mm4","mm7");  # ^= (r2^r0)>>24
@@ -1105,7 +1105,7 @@ sub enclast()
        &mov    (&DWP(4,$acc),$s1);
        &mov    (&DWP(8,$acc),$s2);
        &mov    (&DWP(12,$acc),$s3);
-       &jmp    (&label("ret"));
+       &function_end_A();
 
        &set_label("mmx",16);
        &movq   ("mm0",&QWP(0,$acc));
@@ -1116,8 +1116,6 @@ sub enclast()
        &movq   (&QWP(0,$acc),"mm0");           # write output data
        &movq   (&QWP(8,$acc),"mm4");
        &emms   ();
-
-&set_label("ret",4);
 &function_end("AES_encrypt");
 
 #--------------------------------------------------------------------#
@@ -1140,11 +1138,11 @@ sub deccompact()
        if($i==3)   {   &$Fn    ($key,&DWP(20,"esp"));          }
        else        {   &mov    ($out,$s[0]);                   }
                        &and    ($out,0xFF);
-                       &movz   ($out,&DWP(-128,$td,$out,1));
+                       &movz   ($out,&BP(-128,$td,$out,1));
 
        if ($i==3)  {   $tmp=$s[1];                             }
                        &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&DWP(-128,$td,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
                        &shl    ($tmp,8);
                        &xor    ($out,$tmp);
 
@@ -1152,14 +1150,14 @@ sub deccompact()
        else        {   mov     ($tmp,$s[2]);                   }
                        &shr    ($tmp,16);
                        &and    ($tmp,0xFF);
-                       &movz   ($tmp,&DWP(-128,$td,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
                        &shl    ($tmp,16);
                        &xor    ($out,$tmp);
 
        if ($i==3)  {   $tmp=$s[3]; &$Fn ($s[2],&DWP(8,"esp")); }
        else        {   &mov    ($tmp,$s[3]);                   }
                        &shr    ($tmp,24);
-                       &movz   ($tmp,&DWP(-128,$td,$tmp,1));
+                       &movz   ($tmp,&BP(-128,$td,$tmp,1));
                        &shl    ($tmp,24);
                        &xor    ($out,$tmp);
        if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
@@ -1301,80 +1299,80 @@ sub mmx_deccompact()
        &movd   ("eax","mm1");                  #  7, 6, 1, 0
 
        &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
-       &movz   ("esi",&LB("eax"));             #  0
-       &movz   ("ecx",&DWP(-128,$tbl,"esi",1));#  0
+       &movz   ($acc,&LB("eax"));              #  0
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
        &movd   ("ebx","mm5");                  # 13,12,11,10
        &movz   ("edx",&HB("eax"));             #  1
-       &movz   ("edx",&DWP(-128,$tbl,"edx",1));#  1
+       &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
        &shl    ("edx",8);                      #  1
 
        &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
-       &movz   ("esi",&LB("ebx"));             # 10
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 10
-       &shl    ("esi",16);                     # 10
-       &or     ("ecx","esi");                  # 10
+       &movz   ($acc,&LB("ebx"));              # 10
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &shl    ($acc,16);                      # 10
+       &or     ("ecx",$acc);                   # 10
        &shr    ("eax",16);                     #  7, 6
-       &movz   ("esi",&HB("ebx"));             # 11
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 11
-       &shl    ("esi",24);                     # 11
-       &or     ("edx","esi");                  # 11
+       &movz   ($acc,&HB("ebx"));              # 11
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &shl    ($acc,24);                      # 11
+       &or     ("edx",$acc);                   # 11
        &shr    ("ebx",16);                     # 13,12
 
        &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
-       &movz   ("esi",&HB("eax"));             #  7
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  7
-       &shl    ("esi",24);                     #  7
-       &or     ("ecx","esi");                  #  7
-       &movz   ("esi",&HB("ebx"));             # 13
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 13
-       &shl    ("esi",8);                      # 13
-       &or     ("ecx","esi");                  # 13
+       &movz   ($acc,&HB("eax"));              #  7
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &shl    ($acc,24);                      #  7
+       &or     ("ecx",$acc);                   #  7
+       &movz   ($acc,&HB("ebx"));              # 13
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
+       &shl    ($acc,8);                       # 13
+       &or     ("ecx",$acc);                   # 13
        &movd   ("mm0","ecx");                  # t[0] collected
 
-       &movz   ("esi",&LB("eax"));             #  6
+       &movz   ($acc,&LB("eax"));              #  6
        &movd   ("eax","mm2");                  #  3, 2, 5, 4
-       &movz   ("ecx",&DWP(-128,$tbl,"esi",1));#  6
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  6
        &shl    ("ecx",16);                     #  6
-       &movz   ("esi",&LB("ebx"));             # 12
+       &movz   ($acc,&LB("ebx"));              # 12
        &movd   ("ebx","mm6");                  #  9, 8,15,14
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 12
-       &or     ("ecx","esi");                  # 12
-
-       &movz   ("esi",&LB("eax"));             #  4
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  4
-       &or     ("edx","esi");                  #  4
-       &movz   ("esi",&LB("ebx"));             # 14
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 14
-       &shl    ("esi",16);                     # 14
-       &or     ("edx","esi");                  # 14
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 12
+       &or     ("ecx",$acc);                   # 12
+
+       &movz   ($acc,&LB("eax"));              #  4
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  4
+       &or     ("edx",$acc);                   #  4
+       &movz   ($acc,&LB("ebx"));              # 14
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
+       &shl    ($acc,16);                      # 14
+       &or     ("edx",$acc);                   # 14
        &movd   ("mm1","edx");                  # t[1] collected
 
-       &movz   ("esi",&HB("eax"));             #  5
-       &movz   ("edx",&DWP(-128,$tbl,"esi",1));#  5
+       &movz   ($acc,&HB("eax"));              #  5
+       &movz   ("edx",&BP(-128,$tbl,$acc,1));  #  5
        &shl    ("edx",8);                      #  5
-       &movz   ("esi",&HB("ebx"));             # 15
+       &movz   ($acc,&HB("ebx"));              # 15
        &shr    ("eax",16);                     #  3, 2
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));# 15
-       &shl    ("esi",24);                     # 15
-       &or     ("edx","esi");                  # 15
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
+       &shl    ($acc,24);                      # 15
+       &or     ("edx",$acc);                   # 15
        &shr    ("ebx",16);                     #  9, 8
 
        &punpckldq      ("mm0","mm1");          # t[0,1] collected
 
-       &movz   ("esi",&HB("ebx"));             #  9
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  9
-       &shl    ("esi",8);                      #  9
-       &or     ("ecx","esi");                  #  9
+       &movz   ($acc,&HB("ebx"));              #  9
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &shl    ($acc,8);                       #  9
+       &or     ("ecx",$acc);                   #  9
        &and    ("ebx",0xff);                   #  8
-       &movz   ("ebx",&DWP(-128,$tbl,"ebx",1));#  8
+       &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); #  8
        &or     ("edx","ebx");                  #  8
-       &movz   ("esi",&LB("eax"));             #  2
-       &movz   ("esi",&DWP(-128,$tbl,"esi",1));#  2
-       &shl    ("esi",16);                     #  2
-       &or     ("edx","esi");                  #  2
+       &movz   ($acc,&LB("eax"));              #  2
+       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
+       &shl    ($acc,16);                      #  2
+       &or     ("edx",$acc);                   #  2
        &movd   ("mm4","edx");                  # t[2] collected
        &movz   ("eax",&HB("eax"));             #  3
-       &movz   ("eax",&DWP(-128,$tbl,"eax",1));#  3
+       &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  3
        &shl    ("eax",24);                     #  3
        &or     ("ecx","eax");                  #  3
        &movd   ("mm5","ecx");                  # t[3] collected
@@ -1407,7 +1405,6 @@ sub mmx_deccompact()
        &mov    ($s2,&DWP(192-128,$tbl));
        &mov    ($s3,&DWP(224-128,$tbl));
 
-       &align  (4);
        &set_label("loop",16);
                &mmx_deccompact();
                &add    ($key,16);
@@ -1536,11 +1533,11 @@ sub declast()
        if($i==3)   {   &mov    ($key,&DWP(20,"esp"));          }
        else        {   &mov    ($out,$s[0]);                   }
                        &and    ($out,0xFF);
-                       &movz   ($out,&DWP(0,$td,$out,1));
+                       &movz   ($out,&BP(0,$td,$out,1));
 
        if ($i==3)  {   $tmp=$s[1];                             }
                        &movz   ($tmp,&HB($s[1]));
-                       &movz   ($tmp,&DWP(0,$td,$tmp,1));
+                       &movz   ($tmp,&BP(0,$td,$tmp,1));
                        &shl    ($tmp,8);
                        &xor    ($out,$tmp);
 
@@ -1548,14 +1545,14 @@ sub declast()
        else        {   mov     ($tmp,$s[2]);                   }
                        &shr    ($tmp,16);
                        &and    ($tmp,0xFF);
-                       &movz   ($tmp,&DWP(0,$td,$tmp,1));
+                       &movz   ($tmp,&BP(0,$td,$tmp,1));
                        &shl    ($tmp,16);
                        &xor    ($out,$tmp);
 
        if ($i==3)  {   $tmp=$s[3]; &mov ($s[2],&DWP(8,"esp")); }
        else        {   &mov    ($tmp,$s[3]);                   }
                        &shr    ($tmp,24);
-                       &movz   ($tmp,&DWP(0,$td,$tmp,1));
+                       &movz   ($tmp,&BP(0,$td,$tmp,1));
                        &shl    ($tmp,24);
                        &xor    ($out,$tmp);
        if ($i<2)   {   &mov    (&DWP(4+4*$i,"esp"),$out);      }
@@ -1895,7 +1892,7 @@ sub declast()
        &mov    (&DWP(4,$acc),$s1);
        &mov    (&DWP(8,$acc),$s2);
        &mov    (&DWP(12,$acc),$s3);
-       &jmp    (&label("ret"));
+       &function_end_A();
 
        &set_label("mmx",16);
        &movq   ("mm0",&QWP(0,$acc));
@@ -1906,8 +1903,6 @@ sub declast()
        &movq   (&QWP(0,$acc),"mm0");           # write output data
        &movq   (&QWP(8,$acc),"mm4");
        &emms   ();
-
-&set_label("ret",4);
 &function_end("AES_decrypt");
 
 # void AES_cbc_encrypt (const unsigned char *inp, unsigned char *out,
@@ -2357,27 +2352,26 @@ my $mark=&DWP(72+240,"esp");    #copy of aes_key->rounds
 sub enckey()
 {
        &movz   ("esi",&LB("edx"));             # rk[i]>>0
-       &mov    ("ebx",&DWP(2,$tbl,"esi",8));
+       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
        &movz   ("esi",&HB("edx"));             # rk[i]>>8
-       &and    ("ebx",0xFF000000);
+       &shl    ("ebx",24);
        &xor    ("eax","ebx");
 
-       &mov    ("ebx",&DWP(2,$tbl,"esi",8));
+       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
        &shr    ("edx",16);
-       &and    ("ebx",0x000000FF);
        &movz   ("esi",&LB("edx"));             # rk[i]>>16
        &xor    ("eax","ebx");
 
-       &mov    ("ebx",&DWP(0,$tbl,"esi",8));
+       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
        &movz   ("esi",&HB("edx"));             # rk[i]>>24
-       &and    ("ebx",0x0000FF00);
+       &shl    ("ebx",8);
        &xor    ("eax","ebx");
 
-       &mov    ("ebx",&DWP(0,$tbl,"esi",8));
-       &and    ("ebx",0x00FF0000);
+       &movz   ("ebx",&BP(-128,$tbl,"esi",1));
+       &shl    ("ebx",16);
        &xor    ("eax","ebx");
 
-       &xor    ("eax",&DWP(2048+1024,$tbl,"ecx",4));   # rcon
+       &xor    ("eax",&BP(1024-128,$tbl,"ecx",4));     # rcon
 }
 
 # int AES_set_encrypt_key(const unsigned char *userKey, const int bits,
@@ -2396,6 +2390,17 @@ sub enckey()
        &set_label("pic_point");
        &blindpop($tbl);
        &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+       &lea    ($tbl,&DWP(2048+128,$tbl));
+
+       # prefetch Te4
+       &mov    ("eax",&DWP(0-128,$tbl));
+       &mov    ("ebx",&DWP(32-128,$tbl));
+       &mov    ("ecx",&DWP(64-128,$tbl));
+       &mov    ("edx",&DWP(96-128,$tbl));
+       &mov    ("eax",&DWP(128-128,$tbl));
+       &mov    ("ebx",&DWP(160-128,$tbl));
+       &mov    ("ecx",&DWP(192-128,$tbl));
+       &mov    ("edx",&DWP(224-128,$tbl));
 
        &mov    ("ecx",&wparam(1));             # number of bits in key
        &cmp    ("ecx",128);
@@ -2536,24 +2541,23 @@ sub enckey()
                &mov    ("edx","eax");
                &mov    ("eax",&DWP(16,"edi"));         # rk[4]
                &movz   ("esi",&LB("edx"));             # rk[11]>>0
-               &mov    ("ebx",&DWP(2,$tbl,"esi",8));
+               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
                &movz   ("esi",&HB("edx"));             # rk[11]>>8
-               &and    ("ebx",0x000000FF);
                &xor    ("eax","ebx");
 
-               &mov    ("ebx",&DWP(0,$tbl,"esi",8));
+               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
                &shr    ("edx",16);
-               &and    ("ebx",0x0000FF00);
+               &shl    ("ebx",8);
                &movz   ("esi",&LB("edx"));             # rk[11]>>16
                &xor    ("eax","ebx");
 
-               &mov    ("ebx",&DWP(0,$tbl,"esi",8));
+               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
                &movz   ("esi",&HB("edx"));             # rk[11]>>24
-               &and    ("ebx",0x00FF0000);
+               &shl    ("ebx",16);
                &xor    ("eax","ebx");
 
-               &mov    ("ebx",&DWP(2,$tbl,"esi",8));
-               &and    ("ebx",0xFF000000);
+               &movz   ("ebx",&BP(-128,$tbl,"esi",1));
+               &shl    ("ebx",24);
                &xor    ("eax","ebx");
 
                &mov    (&DWP(48,"edi"),"eax");         # rk[12]
@@ -2578,24 +2582,61 @@ sub enckey()
 &function_end("AES_set_encrypt_key");
 
 sub deckey()
-{ my ($i,$ptr,$te,$td) = @_;
+{ my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;       # $i: word index; $key: schedule pointer register; $tp1: register holding word $i; $tp2/$tp4/$tp8: work registers
+  my $tmp = $tbl;                              # the $tbl register is used as a scratch register here
 
-       &mov    ("eax",&DWP($i,$ptr));
-       &mov    ("edx","eax");
-       &movz   ("ebx",&HB("eax"));
-       &shr    ("edx",16);
-       &and    ("eax",0xFF);
-       &movz   ("eax",&BP(2,$te,"eax",8));
-       &movz   ("ebx",&BP(2,$te,"ebx",8));
-       &mov    ("eax",&DWP(0,$td,"eax",8));
-       &xor    ("eax",&DWP(3,$td,"ebx",8));
-       &movz   ("ebx",&HB("edx"));
-       &and    ("edx",0xFF);
-       &movz   ("edx",&BP(2,$te,"edx",8));
-       &movz   ("ebx",&BP(2,$te,"ebx",8));
-       &xor    ("eax",&DWP(2,$td,"edx",8));
-       &xor    ("eax",&DWP(1,$td,"ebx",8));
-       &mov    (&DWP($i,$ptr),"eax");
+       &mov    ($acc,$tp1);                    # step 1: tp2 = each byte of tp1 doubled, reduced with 0x1b
+       &and    ($acc,0x80808080);              # keep only the top bit of every byte
+       &mov    ($tmp,$acc);
+       &mov    ($tp2,$tp1);
+       &shr    ($tmp,7);                       # top bits moved down to bit 0 of each byte
+       &and    ($tp2,0x7f7f7f7f);              # drop top bits so the doubling cannot carry across bytes
+       &sub    ($acc,$tmp);                    # 0x80-0x01 = 0x7f in every byte whose top bit was set
+       &add    ($tp2,$tp2);                    # shift every byte left by one
+       &and    ($acc,0x1b1b1b1b);              # 0x7f -> 0x1b for those bytes, 0 elsewhere
+       &xor    ($acc,$tp2);
+       &mov    ($tp2,$acc);                    # tp2 = doubled tp1
+
+       &and    ($acc,0x80808080);              # step 2: same doubling applied to tp2, giving tp4
+       &mov    ($tmp,$acc);
+       &mov    ($tp4,$tp2);
+        &xor   ($tp2,$tp1);    # tp2^tp1
+       &shr    ($tmp,7);
+       &and    ($tp4,0x7f7f7f7f);
+       &sub    ($acc,$tmp);
+       &add    ($tp4,$tp4);
+       &and    ($acc,0x1b1b1b1b);
+       &xor    ($acc,$tp4);
+       &mov    ($tp4,$acc);                    # tp4 = doubled tp2
+
+       &and    ($acc,0x80808080);              # step 3: same doubling applied to tp4, giving tp8
+       &mov    ($tmp,$acc);
+       &mov    ($tp8,$tp4);
+        &xor   ($tp4,$tp1);    # tp4^tp1
+       &shr    ($tmp,7);
+       &and    ($tp8,0x7f7f7f7f);
+       &sub    ($acc,$tmp);
+       &add    ($tp8,$tp8);
+       &and    ($acc,0x1b1b1b1b);
+        &rotl  ($tp1,8);       # = ROTATE(tp1,8)
+       &xor    ($tp8,$acc);                    # tp8 = doubled tp4
+
+       &mov    ($tmp,&DWP(4*($i+1),$key));     # modulo-scheduled load
+
+       &xor    ($tp1,$tp2);                    # $tp2 holds tp2^tp1 at this point
+       &xor    ($tp2,$tp8);                    # $tp2 = tp8^tp2^tp1
+       &xor    ($tp1,$tp4);                    # $tp4 holds tp4^tp1 at this point
+       &rotl   ($tp2,24);
+       &xor    ($tp4,$tp8);                    # $tp4 = tp8^tp4^tp1
+       &xor    ($tp1,$tp8);    # ^= tp8^(tp4^tp1)^(tp2^tp1)
+       &rotl   ($tp4,16);
+       &xor    ($tp1,$tp2);    # ^= ROTATE(tp8^tp2^tp1,24)
+       &rotl   ($tp8,8);
+       &xor    ($tp1,$tp4);    # ^= ROTATE(tp8^tp4^tp1,16)
+       &mov    ($tp2,$tmp);                    # preloaded next word goes into the register the next deckey call receives as $tp1
+       &xor    ($tp1,$tp8);    # ^= ROTATE(tp8,8)
+
+       &mov    (&DWP(4*$i,$key),$tp1);         # write the transformed word back in place
 }
 
 # int AES_set_decrypt_key(const unsigned char *userKey, const int bits,
@@ -2627,8 +2668,7 @@ sub deckey()
        &lea    ("ecx",&DWP(0,"","ecx",4));
        &lea    ("edi",&DWP(0,"esi","ecx",4));  # pointer to last chunk
 
-       &align  (4);
-       &set_label("invert");                   # invert order of chunks
+       &set_label("invert",4);                 # invert order of chunks
                &mov    ("eax",&DWP(0,"esi"));
                &mov    ("ebx",&DWP(4,"esi"));
                &mov    ("ecx",&DWP(0,"edi"));
@@ -2650,24 +2690,21 @@ sub deckey()
                &cmp    ("esi","edi");
        &jne    (&label("invert"));
 
-       &call   (&label("pic_point"));
-       &set_label("pic_point");
-       blindpop($tbl);
-       &lea    ("edi",&DWP(&label("AES_Td")."-".&label("pic_point"),$tbl));
-       &lea    ($tbl,&DWP(&label("AES_Te")."-".&label("pic_point"),$tbl));
+       &mov    ($key,&wparam(2));
+       &mov    ($acc,&DWP(240,$key));          # pull number of rounds
+       &lea    ($acc,&DWP(-2,$acc,$acc));
+       &lea    ($acc,&DWP(0,$key,$acc,8));
+       &mov    (&wparam(2),$acc);
 
-       &mov    ("esi",&wparam(2));
-       &mov    ("ecx",&DWP(240,"esi"));        # pull number of rounds
-       &dec    ("ecx");
-       &align  (4);
-       &set_label("permute");                  # permute the key schedule
-               &add    ("esi",16);
-               &deckey (0,"esi",$tbl,"edi");
-               &deckey (4,"esi",$tbl,"edi");
-               &deckey (8,"esi",$tbl,"edi");
-               &deckey (12,"esi",$tbl,"edi");
-               &dec    ("ecx");
-       &jnz    (&label("permute"));
+       &mov    ($s0,&DWP(16,$key));            # modulo-scheduled load
+       &set_label("permute",4);                # permute the key schedule
+               &add    ($key,16);
+               &deckey (0,$key,$s0,$s1,$s2,$s3);
+               &deckey (1,$key,$s1,$s2,$s3,$s0);
+               &deckey (2,$key,$s2,$s3,$s0,$s1);
+               &deckey (3,$key,$s3,$s0,$s1,$s2);
+               &cmp    ($key,&wparam(2));
+       &jb     (&label("permute"));
 
        &xor    ("eax","eax");                  # return success
 &function_end("AES_set_decrypt_key");