aes-586.pl: Atom-specific optimization, +44/29%, minor improvement on others.
authorAndy Polyakov <appro@openssl.org>
Mon, 12 Nov 2012 17:50:19 +0000 (17:50 +0000)
committerAndy Polyakov <appro@openssl.org>
Mon, 12 Nov 2012 17:50:19 +0000 (17:50 +0000)
vpaes-x86.pl: minor performance squeeze.

crypto/aes/asm/aes-586.pl
crypto/aes/asm/vpaes-x86.pl

index 406cd222f56abbbc014a3a28b01f1c17eaf936da..1c1e23e575996d5730f11d7620fa8f7cb976d2ff 100755 (executable)
 # byte for 128-bit key.
 #
 #              ECB encrypt     ECB decrypt     CBC large chunk
-# P4           56[60]          84[100]         23
-# AMD K8       48[44]          70[79]          18
-# PIII         41[50]          61[91]          24
-# Core 2       32[38]          45[70]          18.5
-# Pentium      120             160             77
+# P4           52[54]          83[95]          23
+# AMD K8       46[41]          66[70]          18
+# PIII         41[50]          60[77]          24
+# Core 2       31[36]          45[64]          18.5
+# Atom         76[100]         96[138]         60
+# Pentium      115             150             77
 #
 # Version 4.1 switches to compact S-box even in key schedule setup.
 #
@@ -476,24 +477,25 @@ sub enctransform()
   my $tmp = $tbl;
   my $r2  = $key ;
 
-       &mov    ($acc,$s[$i]);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$s[$i]);
        &lea    ($r2,&DWP(0,$s[$i],$s[$i]));
-       &sub    ($acc,$tmp);
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
        &and    ($r2,0xfefefefe);
-       &and    ($acc,0x1b1b1b1b);
+       &sub    ($acc,$tmp);
        &mov    ($tmp,$s[$i]);
+       &and    ($acc,0x1b1b1b1b);
+       &rotr   ($tmp,16);
        &xor    ($acc,$r2);     # r2
+       &mov    ($r2,$s[$i]);
 
        &xor    ($s[$i],$acc);  # r0 ^ r2
+       &rotr   ($r2,16+8);
+       &xor    ($acc,$tmp);
        &rotl   ($s[$i],24);
+       &xor    ($acc,$r2);
+       &mov    ($tmp,0x80808080)       if ($i!=1);
        &xor    ($s[$i],$acc);  # ROTATE(r2^r0,24) ^ r2
-       &rotr   ($tmp,16);
-       &xor    ($s[$i],$tmp);
-       &rotr   ($tmp,8);
-       &xor    ($s[$i],$tmp);
 }
 
 &function_begin_B("_x86_AES_encrypt_compact");
@@ -526,6 +528,7 @@ sub enctransform()
                &enccompact(1,$tbl,$s1,$s2,$s3,$s0,1);
                &enccompact(2,$tbl,$s2,$s3,$s0,$s1,1);
                &enccompact(3,$tbl,$s3,$s0,$s1,$s2,1);
+               &mov    ($tbl,0x80808080);
                &enctransform(2);
                &enctransform(3);
                &enctransform(0);
@@ -607,82 +610,84 @@ sub sse_enccompact()
        &pshufw ("mm5","mm4",0x0d);             # 15,14,11,10
        &movd   ("eax","mm1");                  #  5, 4, 1, 0
        &movd   ("ebx","mm5");                  # 15,14,11,10
+       &mov    ($__key,$key);
 
        &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
        &movz   ("edx",&HB("eax"));             #  1
+       &pshufw ("mm2","mm0",0x0d);             #  7, 6, 3, 2
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
+       &movz   ($key,&LB("ebx"));              # 10
        &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
-       &shl    ("edx",8);                      #  1
        &shr    ("eax",16);                     #  5, 4
+       &shl    ("edx",8);                      #  1
 
-       &movz   ($acc,&LB("ebx"));              # 10
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
+       &movz   ($key,&HB("ebx"));              # 11
        &shl    ($acc,16);                      # 10
-       &or     ("ecx",$acc);                   # 10
        &pshufw ("mm6","mm4",0x08);             # 13,12, 9, 8
-       &movz   ($acc,&HB("ebx"));              # 11
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &or     ("ecx",$acc);                   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
+       &movz   ($key,&HB("eax"));              #  5
        &shl    ($acc,24);                      # 11
-       &or     ("edx",$acc);                   # 11
        &shr    ("ebx",16);                     # 15,14
+       &or     ("edx",$acc);                   # 11
 
-       &movz   ($acc,&HB("eax"));              #  5
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  5
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
+       &movz   ($key,&HB("ebx"));              # 15
        &shl    ($acc,8);                       #  5
        &or     ("ecx",$acc);                   #  5
-       &movz   ($acc,&HB("ebx"));              # 15
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 15
+       &movz   ($key,&LB("eax"));              #  4
        &shl    ($acc,24);                      # 15
        &or     ("ecx",$acc);                   # 15
-       &movd   ("mm0","ecx");                  # t[0] collected
 
-       &movz   ($acc,&LB("eax"));              #  4
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  4
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
+       &movz   ($key,&LB("ebx"));              # 14
        &movd   ("eax","mm2");                  #  7, 6, 3, 2
-       &movz   ($acc,&LB("ebx"));              # 14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
-       &shl    ($acc,16);                      # 14
+       &movd   ("mm0","ecx");                  # t[0] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 14
+       &movz   ($key,&HB("eax"));              #  3
+       &shl    ("ecx",16);                     # 14
+       &movd   ("ebx","mm6");                  # 13,12, 9, 8
        &or     ("ecx",$acc);                   # 14
 
-       &movd   ("ebx","mm6");                  # 13,12, 9, 8
-       &movz   ($acc,&HB("eax"));              #  3
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  3
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  3
+       &movz   ($key,&HB("ebx"));              #  9
        &shl    ($acc,24);                      #  3
        &or     ("ecx",$acc);                   #  3
-       &movz   ($acc,&HB("ebx"));              #  9
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
+       &movz   ($key,&LB("ebx"));              #  8
        &shl    ($acc,8);                       #  9
+       &shr    ("ebx",16);                     # 13,12
        &or     ("ecx",$acc);                   #  9
-       &movd   ("mm1","ecx");                  # t[1] collected
 
-       &movz   ($acc,&LB("ebx"));              #  8
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  8
-       &shr    ("ebx",16);                     # 13,12
-       &movz   ($acc,&LB("eax"));              #  2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
-       &shl    ($acc,16);                      #  2
-       &or     ("ecx",$acc);                   #  2
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  8
+       &movz   ($key,&LB("eax"));              #  2
        &shr    ("eax",16);                     #  7, 6
+       &movd   ("mm1","ecx");                  # t[1] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  #  2
+       &movz   ($key,&HB("eax"));              #  7
+       &shl    ("ecx",16);                     #  2
+       &and    ("eax",0xff);                   #  6
+       &or     ("ecx",$acc);                   #  2
 
        &punpckldq      ("mm0","mm1");          # t[0,1] collected
 
-       &movz   ($acc,&HB("eax"));              #  7
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
+       &movz   ($key,&HB("ebx"));              # 13
        &shl    ($acc,24);                      #  7
-       &or     ("ecx",$acc);                   #  7
-       &and    ("eax",0xff);                   #  6
+       &and    ("ebx",0xff);                   # 12
        &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  6
+       &or     ("ecx",$acc);                   #  7
        &shl    ("eax",16);                     #  6
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
        &or     ("edx","eax");                  #  6
-       &movz   ($acc,&HB("ebx"));              # 13
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
        &shl    ($acc,8);                       # 13
-       &or     ("ecx",$acc);                   # 13
-       &movd   ("mm4","ecx");                  # t[2] collected
-       &and    ("ebx",0xff);                   # 12
        &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); # 12
+       &or     ("ecx",$acc);                   # 13
        &or     ("edx","ebx");                  # 12
+       &mov    ($key,$__key);
+       &movd   ("mm4","ecx");                  # t[2] collected
        &movd   ("mm5","edx");                  # t[3] collected
 
        &punpckldq      ("mm4","mm5");          # t[2,3] collected
@@ -1270,30 +1275,30 @@ sub dectransform()
   my $tp4 = @s[($i+3)%4]; $tp4 = @s[3] if ($i==1);
   my $tp8 = $tbl;
 
-       &mov    ($acc,$s[$i]);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &mov    ($tmp,0x80808080);
+       &and    ($tmp,$s[$i]);
+       &mov    ($acc,$tmp);
        &shr    ($tmp,7);
        &lea    ($tp2,&DWP(0,$s[$i],$s[$i]));
        &sub    ($acc,$tmp);
        &and    ($tp2,0xfefefefe);
        &and    ($acc,0x1b1b1b1b);
-       &xor    ($acc,$tp2);
-       &mov    ($tp2,$acc);
+       &xor    ($tp2,$acc);
+       &mov    ($tmp,0x80808080);
 
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &and    ($tmp,$tp2);
+       &mov    ($acc,$tmp);
        &shr    ($tmp,7);
        &lea    ($tp4,&DWP(0,$tp2,$tp2));
        &sub    ($acc,$tmp);
        &and    ($tp4,0xfefefefe);
        &and    ($acc,0x1b1b1b1b);
         &xor   ($tp2,$s[$i]);  # tp2^tp1
-       &xor    ($acc,$tp4);
-       &mov    ($tp4,$acc);
+       &xor    ($tp4,$acc);
+       &mov    ($tmp,0x80808080);
 
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
+       &and    ($tmp,$tp4);
+       &mov    ($acc,$tmp);
        &shr    ($tmp,7);
        &lea    ($tp8,&DWP(0,$tp4,$tp4));
        &sub    ($acc,$tmp);
@@ -1305,13 +1310,13 @@ sub dectransform()
 
        &xor    ($s[$i],$tp2);
        &xor    ($tp2,$tp8);
-       &rotl   ($tp2,24);
        &xor    ($s[$i],$tp4);
        &xor    ($tp4,$tp8);
-       &rotl   ($tp4,16);
+       &rotl   ($tp2,24);
        &xor    ($s[$i],$tp8);  # ^= tp8^(tp4^tp1)^(tp2^tp1)
-       &rotl   ($tp8,8);
+       &rotl   ($tp4,16);
        &xor    ($s[$i],$tp2);  # ^= ROTATE(tp8^tp2^tp1,24)
+       &rotl   ($tp8,8);
        &xor    ($s[$i],$tp4);  # ^= ROTATE(tp8^tp4^tp1,16)
         &mov   ($s[0],$__s0)                   if($i==2); #prefetch $s0
         &mov   ($s[1],$__s1)                   if($i==3); #prefetch $s1
@@ -1389,85 +1394,87 @@ sub dectransform()
 sub sse_deccompact()
 {
        &pshufw ("mm1","mm0",0x0c);             #  7, 6, 1, 0
+       &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
        &movd   ("eax","mm1");                  #  7, 6, 1, 0
+       &movd   ("ebx","mm5");                  # 13,12,11,10
+       &mov    ($__key,$key);
 
-       &pshufw ("mm5","mm4",0x09);             # 13,12,11,10
        &movz   ($acc,&LB("eax"));              #  0
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
-       &movd   ("ebx","mm5");                  # 13,12,11,10
        &movz   ("edx",&HB("eax"));             #  1
+       &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
+       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  0
+       &movz   ($key,&LB("ebx"));              # 10
        &movz   ("edx",&BP(-128,$tbl,"edx",1)); #  1
+       &shr    ("eax",16);                     #  7, 6
        &shl    ("edx",8);                      #  1
 
-       &pshufw ("mm2","mm0",0x06);             #  3, 2, 5, 4
-       &movz   ($acc,&LB("ebx"));              # 10
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 10
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 10
+       &movz   ($key,&HB("ebx"));              # 11
        &shl    ($acc,16);                      # 10
+       &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
        &or     ("ecx",$acc);                   # 10
-       &shr    ("eax",16);                     #  7, 6
-       &movz   ($acc,&HB("ebx"));              # 11
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 11
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 11
+       &movz   ($key,&HB("eax"));              #  7
        &shl    ($acc,24);                      # 11
-       &or     ("edx",$acc);                   # 11
        &shr    ("ebx",16);                     # 13,12
+       &or     ("edx",$acc);                   # 11
 
-       &pshufw ("mm6","mm4",0x03);             # 9, 8,15,14
-       &movz   ($acc,&HB("eax"));              #  7
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  7
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  7
+       &movz   ($key,&HB("ebx"));              # 13
        &shl    ($acc,24);                      #  7
        &or     ("ecx",$acc);                   #  7
-       &movz   ($acc,&HB("ebx"));              # 13
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 13
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 13
+       &movz   ($key,&LB("eax"));              #  6
        &shl    ($acc,8);                       # 13
+       &movd   ("eax","mm2");                  #  3, 2, 5, 4
        &or     ("ecx",$acc);                   # 13
-       &movd   ("mm0","ecx");                  # t[0] collected
 
-       &movz   ($acc,&LB("eax"));              #  6
-       &movd   ("eax","mm2");                  #  3, 2, 5, 4
-       &movz   ("ecx",&BP(-128,$tbl,$acc,1));  #  6
-       &shl    ("ecx",16);                     #  6
-       &movz   ($acc,&LB("ebx"));              # 12
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  6
+       &movz   ($key,&LB("ebx"));              # 12
+       &shl    ($acc,16);                      #  6
        &movd   ("ebx","mm6");                  #  9, 8,15,14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 12
+       &movd   ("mm0","ecx");                  # t[0] collected
+       &movz   ("ecx",&BP(-128,$tbl,$key,1));  # 12
+       &movz   ($key,&LB("eax"));              #  4
        &or     ("ecx",$acc);                   # 12
 
-       &movz   ($acc,&LB("eax"));              #  4
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  4
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  4
+       &movz   ($key,&LB("ebx"));              # 14
        &or     ("edx",$acc);                   #  4
-       &movz   ($acc,&LB("ebx"));              # 14
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 14
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   # 14
+       &movz   ($key,&HB("eax"));              #  5
        &shl    ($acc,16);                      # 14
+       &shr    ("eax",16);                     #  3, 2
        &or     ("edx",$acc);                   # 14
-       &movd   ("mm1","edx");                  # t[1] collected
 
-       &movz   ($acc,&HB("eax"));              #  5
-       &movz   ("edx",&BP(-128,$tbl,$acc,1));  #  5
-       &shl    ("edx",8);                      #  5
-       &movz   ($acc,&HB("ebx"));              # 15
-       &shr    ("eax",16);                     #  3, 2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   # 15
-       &shl    ($acc,24);                      # 15
-       &or     ("edx",$acc);                   # 15
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  5
+       &movz   ($key,&HB("ebx"));              # 15
        &shr    ("ebx",16);                     #  9, 8
+       &shl    ($acc,8);                       #  5
+       &movd   ("mm1","edx");                  # t[1] collected
+       &movz   ("edx",&BP(-128,$tbl,$key,1));  # 15
+       &movz   ($key,&HB("ebx"));              #  9
+       &shl    ("edx",24);                     # 15
+       &and    ("ebx",0xff);                   #  8
+       &or     ("edx",$acc);                   # 15
 
        &punpckldq      ("mm0","mm1");          # t[0,1] collected
 
-       &movz   ($acc,&HB("ebx"));              #  9
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  9
+       &movz   ($key,&LB("eax"));              #  2
        &shl    ($acc,8);                       #  9
-       &or     ("ecx",$acc);                   #  9
-       &and    ("ebx",0xff);                   #  8
+       &movz   ("eax",&HB("eax"));             #  3
        &movz   ("ebx",&BP(-128,$tbl,"ebx",1)); #  8
+       &or     ("ecx",$acc);                   #  9
+       &movz   ($acc,&BP(-128,$tbl,$key,1));   #  2
        &or     ("edx","ebx");                  #  8
-       &movz   ($acc,&LB("eax"));              #  2
-       &movz   ($acc,&BP(-128,$tbl,$acc,1));   #  2
        &shl    ($acc,16);                      #  2
-       &or     ("edx",$acc);                   #  2
-       &movd   ("mm4","edx");                  # t[2] collected
-       &movz   ("eax",&HB("eax"));             #  3
        &movz   ("eax",&BP(-128,$tbl,"eax",1)); #  3
+       &or     ("edx",$acc);                   #  2
        &shl    ("eax",24);                     #  3
        &or     ("ecx","eax");                  #  3
+       &mov    ($key,$__key);
+       &movd   ("mm4","edx");                  # t[2] collected
        &movd   ("mm5","ecx");                  # t[3] collected
 
        &punpckldq      ("mm4","mm5");          # t[2,3] collected
@@ -2865,32 +2872,32 @@ sub deckey()
 { my ($i,$key,$tp1,$tp2,$tp4,$tp8) = @_;
   my $tmp = $tbl;
 
-       &mov    ($acc,$tp1);
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &mov    ($tmp,0x80808080);
+       &and    ($tmp,$tp1);
        &lea    ($tp2,&DWP(0,$tp1,$tp1));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
        &sub    ($acc,$tmp);
        &and    ($tp2,0xfefefefe);
        &and    ($acc,0x1b1b1b1b);
-       &xor    ($acc,$tp2);
-       &mov    ($tp2,$acc);
+       &xor    ($tp2,$acc);
+       &mov    ($tmp,0x80808080);
 
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$tp2);
        &lea    ($tp4,&DWP(0,$tp2,$tp2));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
        &sub    ($acc,$tmp);
        &and    ($tp4,0xfefefefe);
        &and    ($acc,0x1b1b1b1b);
         &xor   ($tp2,$tp1);    # tp2^tp1
-       &xor    ($acc,$tp4);
-       &mov    ($tp4,$acc);
+       &xor    ($tp4,$acc);
+       &mov    ($tmp,0x80808080);
 
-       &and    ($acc,0x80808080);
-       &mov    ($tmp,$acc);
-       &shr    ($tmp,7);
+       &and    ($tmp,$tp4);
        &lea    ($tp8,&DWP(0,$tp4,$tp4));
+       &mov    ($acc,$tmp);
+       &shr    ($tmp,7);
         &xor   ($tp4,$tp1);    # tp4^tp1
        &sub    ($acc,$tmp);
        &and    ($tp8,0xfefefefe);
index 1533e2c3042c7f8e0112cd9e30a676babf3c6b47..433912ff57ddf74abfb061a8c67c74ced4fbd77d 100644 (file)
@@ -27,9 +27,9 @@
 #
 #              aes-586.pl              vpaes-x86.pl
 #
-# Core 2(**)   29.1/42.3/18.3          22.0/25.6(***)
-# Nehalem      27.9/40.4/18.1          10.3/12.0
-# Atom         102./119./60.1          64.5/85.3(***)
+# Core 2(**)   28.1/41.4/18.3          21.9/25.2(***)
+# Nehalem      27.9/40.4/18.1          10.2/11.9
+# Atom         70.7/92.1/60.1          61.1/81.0(***)
 #
 # (*)  "Hyper-threading" in the context refers rather to cache shared
 #      among multiple cores, than to specifically Intel HTT. As vast
@@ -40,8 +40,8 @@
 # (**) "Core 2" refers to initial 65nm design, a.k.a. Conroe.
 #
 # (***)        Less impressive improvement on Core 2 and Atom is due to slow
-#      pshufb, yet it's respectable +32%/65%  improvement on Core 2
-#      and +58%/40% on Atom (as implied, over "hyper-threading-safe"
+#      pshufb, yet it's respectable +28%/64%  improvement on Core 2
+#      and +15% on Atom (as implied, over "hyper-threading-safe"
 #      code path).
 #
 #                                              <appro@openssl.org>
@@ -183,35 +183,35 @@ $k_dsbo=0x2c0;            # decryption sbox final output
        &movdqa ("xmm1","xmm6")
        &movdqa ("xmm2",&QWP($k_ipt,$const));
        &pandn  ("xmm1","xmm0");
-       &movdqu ("xmm5",&QWP(0,$key));
-       &psrld  ("xmm1",4);
        &pand   ("xmm0","xmm6");
+       &movdqu ("xmm5",&QWP(0,$key));
        &pshufb ("xmm2","xmm0");
        &movdqa ("xmm0",&QWP($k_ipt+16,$const));
-       &pshufb ("xmm0","xmm1");
        &pxor   ("xmm2","xmm5");
-       &pxor   ("xmm0","xmm2");
+       &psrld  ("xmm1",4);
        &add    ($key,16);
+       &pshufb ("xmm0","xmm1");
        &lea    ($base,&DWP($k_mc_backward,$const));
+       &pxor   ("xmm0","xmm2");
        &jmp    (&label("enc_entry"));
 
 
 &set_label("enc_loop",16);
        # middle of middle round
        &movdqa ("xmm4",&QWP($k_sb1,$const));   # 4 : sb1u
-       &pshufb ("xmm4","xmm2");                # 4 = sb1u
-       &pxor   ("xmm4","xmm5");                # 4 = sb1u + k
        &movdqa ("xmm0",&QWP($k_sb1+16,$const));# 0 : sb1t
+       &pshufb ("xmm4","xmm2");                # 4 = sb1u
        &pshufb ("xmm0","xmm3");                # 0 = sb1t
-       &pxor   ("xmm0","xmm4");                # 0 = A
+       &pxor   ("xmm4","xmm5");                # 4 = sb1u + k
        &movdqa ("xmm5",&QWP($k_sb2,$const));   # 4 : sb2u
-       &pshufb ("xmm5","xmm2");                # 4 = sb2u
+       &pxor   ("xmm0","xmm4");                # 0 = A
        &movdqa ("xmm1",&QWP(-0x40,$base,$magic));# .Lk_mc_forward[]
+       &pshufb ("xmm5","xmm2");                # 4 = sb2u
        &movdqa ("xmm2",&QWP($k_sb2+16,$const));# 2 : sb2t
-       &pshufb ("xmm2","xmm3");                # 2 = sb2t
-       &pxor   ("xmm2","xmm5");                # 2 = 2A
        &movdqa ("xmm4",&QWP(0,$base,$magic));  # .Lk_mc_backward[]
+       &pshufb ("xmm2","xmm3");                # 2 = sb2t
        &movdqa ("xmm3","xmm0");                # 3 = A
+       &pxor   ("xmm2","xmm5");                # 2 = 2A
        &pshufb ("xmm0","xmm1");                # 0 = B
        &add    ($key,16);                      # next key
        &pxor   ("xmm0","xmm2");                # 0 = 2A+B
@@ -220,30 +220,30 @@ $k_dsbo=0x2c0;            # decryption sbox final output
        &pxor   ("xmm3","xmm0");                # 3 = 2A+B+D
        &pshufb ("xmm0","xmm1");                # 0 = 2B+C
        &and    ($magic,0x30);                  # ... mod 4
-       &pxor   ("xmm0","xmm3");                # 0 = 2A+3B+C+D
        &sub    ($round,1);                     # nr--
+       &pxor   ("xmm0","xmm3");                # 0 = 2A+3B+C+D
 
 &set_label("enc_entry");
        # top of round
        &movdqa ("xmm1","xmm6");                # 1 : i
+       &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
        &pandn  ("xmm1","xmm0");                # 1 = i<<4
        &psrld  ("xmm1",4);                     # 1 = i
        &pand   ("xmm0","xmm6");                # 0 = k
-       &movdqa ("xmm5",&QWP($k_inv+16,$const));# 2 : a/k
        &pshufb ("xmm5","xmm0");                # 2 = a/k
-       &pxor   ("xmm0","xmm1");                # 0 = j
        &movdqa ("xmm3","xmm7");                # 3 : 1/i
+       &pxor   ("xmm0","xmm1");                # 0 = j
        &pshufb ("xmm3","xmm1");                # 3 = 1/i
-       &pxor   ("xmm3","xmm5");                # 3 = iak = 1/i + a/k
        &movdqa ("xmm4","xmm7");                # 4 : 1/j
+       &pxor   ("xmm3","xmm5");                # 3 = iak = 1/i + a/k
        &pshufb ("xmm4","xmm0");                # 4 = 1/j
-       &pxor   ("xmm4","xmm5");                # 4 = jak = 1/j + a/k
        &movdqa ("xmm2","xmm7");                # 2 : 1/iak
+       &pxor   ("xmm4","xmm5");                # 4 = jak = 1/j + a/k
        &pshufb ("xmm2","xmm3");                # 2 = 1/iak
-       &pxor   ("xmm2","xmm0");                # 2 = io
        &movdqa ("xmm3","xmm7");                # 3 : 1/jak
-       &movdqu ("xmm5",&QWP(0,$key));
+       &pxor   ("xmm2","xmm0");                # 2 = io
        &pshufb ("xmm3","xmm4");                # 3 = 1/jak
+       &movdqu ("xmm5",&QWP(0,$key));
        &pxor   ("xmm3","xmm1");                # 3 = jo
        &jnz    (&label("enc_loop"));
 
@@ -265,8 +265,8 @@ $k_dsbo=0x2c0;              # decryption sbox final output
 ##  Same API as encryption core.
 ##
 &function_begin_B("_vpaes_decrypt_core");
-       &mov    ($round,&DWP(240,$key));
        &lea    ($base,&DWP($k_dsbd,$const));
+       &mov    ($round,&DWP(240,$key));
        &movdqa ("xmm1","xmm6");
        &movdqa ("xmm2",&QWP($k_dipt-$k_dsbd,$base));
        &pandn  ("xmm1","xmm0");
@@ -292,62 +292,61 @@ $k_dsbo=0x2c0;            # decryption sbox final output
 ##  Inverse mix columns
 ##
        &movdqa ("xmm4",&QWP(-0x20,$base));     # 4 : sb9u
+       &movdqa ("xmm1",&QWP(-0x10,$base));     # 0 : sb9t
        &pshufb ("xmm4","xmm2");                # 4 = sb9u
+       &pshufb ("xmm1","xmm3");                # 0 = sb9t
        &pxor   ("xmm4","xmm0");
-       &movdqa ("xmm0",&QWP(-0x10,$base));     # 0 : sb9t
-       &pshufb ("xmm0","xmm3");                # 0 = sb9t
-       &pxor   ("xmm0","xmm4");                # 0 = ch
        &add    ($key,16);                      # next round key
+       &pxor   ("xmm1","xmm4");                # 0 = ch
 
-       &pshufb ("xmm0","xmm5");                # MC ch
        &movdqa ("xmm4",&QWP(0,$base));         # 4 : sbdu
+       &pshufb ("xmm1","xmm5");                # MC ch
        &pshufb ("xmm4","xmm2");                # 4 = sbdu
-       &pxor   ("xmm4","xmm0");                # 4 = ch
        &movdqa ("xmm0",&QWP(0x10,$base));      # 0 : sbdt
+       &pxor   ("xmm4","xmm1");                # 4 = ch
        &pshufb ("xmm0","xmm3");                # 0 = sbdt
-       &pxor   ("xmm0","xmm4");                # 0 = ch
        &sub    ($round,1);                     # nr--
+       &pxor   ("xmm0","xmm4");                # 0 = ch
 
-       &pshufb ("xmm0","xmm5");                # MC ch
        &movdqa ("xmm4",&QWP(0x20,$base));      # 4 : sbbu
+       &pshufb ("xmm0","xmm5");                # MC ch
+       &movdqa ("xmm1",&QWP(0x30,$base));      # 0 : sbbt
        &pshufb ("xmm4","xmm2");                # 4 = sbbu
+       &pshufb ("xmm1","xmm3");                # 0 = sbbt
        &pxor   ("xmm4","xmm0");                # 4 = ch
-       &movdqa ("xmm0",&QWP(0x30,$base));      # 0 : sbbt
-       &pshufb ("xmm0","xmm3");                # 0 = sbbt
-       &pxor   ("xmm0","xmm4");                # 0 = ch
+       &pxor   ("xmm1","xmm4");                # 0 = ch
 
-       &pshufb ("xmm0","xmm5");                # MC ch
        &movdqa ("xmm4",&QWP(0x40,$base));      # 4 : sbeu
-       &pshufb ("xmm4","xmm2");                # 4 = sbeu
-       &pxor   ("xmm4","xmm0");                # 4 = ch
+       &pshufb ("xmm1","xmm5");                # MC ch
        &movdqa ("xmm0",&QWP(0x50,$base));      # 0 : sbet
+       &pshufb ("xmm4","xmm2");                # 4 = sbeu
        &pshufb ("xmm0","xmm3");                # 0 = sbet
-       &pxor   ("xmm0","xmm4");                # 0 = ch
-
        &palignr("xmm5","xmm5",12);
+       &pxor   ("xmm4","xmm1");                # 4 = ch
+       &pxor   ("xmm0","xmm4");                # 0 = ch
 
 &set_label("dec_entry");
        # top of round
        &movdqa ("xmm1","xmm6");                # 1 : i
        &pandn  ("xmm1","xmm0");                # 1 = i<<4
+       &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
        &psrld  ("xmm1",4);                     # 1 = i
        &pand   ("xmm0","xmm6");                # 0 = k
-       &movdqa ("xmm2",&QWP($k_inv+16,$const));# 2 : a/k
        &pshufb ("xmm2","xmm0");                # 2 = a/k
-       &pxor   ("xmm0","xmm1");                # 0 = j
        &movdqa ("xmm3","xmm7");                # 3 : 1/i
+       &pxor   ("xmm0","xmm1");                # 0 = j
        &pshufb ("xmm3","xmm1");                # 3 = 1/i
-       &pxor   ("xmm3","xmm2");                # 3 = iak = 1/i + a/k
        &movdqa ("xmm4","xmm7");                # 4 : 1/j
+       &pxor   ("xmm3","xmm2");                # 3 = iak = 1/i + a/k
        &pshufb ("xmm4","xmm0");                # 4 = 1/j
        &pxor   ("xmm4","xmm2");                # 4 = jak = 1/j + a/k
        &movdqa ("xmm2","xmm7");                # 2 : 1/iak
        &pshufb ("xmm2","xmm3");                # 2 = 1/iak
-       &pxor   ("xmm2","xmm0");                # 2 = io
        &movdqa ("xmm3","xmm7");                # 3 : 1/jak
+       &pxor   ("xmm2","xmm0");                # 2 = io
        &pshufb ("xmm3","xmm4");                # 3 = 1/jak
-       &pxor   ("xmm3","xmm1");                # 3 = jo
        &movdqu ("xmm0",&QWP(0,$key));
+       &pxor   ("xmm3","xmm1");                # 3 = jo
        &jnz    (&label("dec_loop"));
 
        # middle of last round
@@ -542,12 +541,12 @@ $k_dsbo=0x2c0;            # decryption sbox final output
 ##    %xmm0: b+c+d  b+c  b  a
 ##
 &function_begin_B("_vpaes_schedule_192_smear");
-       &pshufd ("xmm0","xmm6",0x80);           # d c 0 0 -> c 0 0 0
-       &pxor   ("xmm6","xmm0");                # -> c+d c 0 0
+       &pshufd ("xmm1","xmm6",0x80);           # d c 0 0 -> c 0 0 0
        &pshufd ("xmm0","xmm7",0xFE);           # b a _ _ -> b b b a
+       &pxor   ("xmm6","xmm1");                # -> c+d c 0 0
+       &pxor   ("xmm1","xmm1");
        &pxor   ("xmm6","xmm0");                # -> b+c+d b+c b a
        &movdqa ("xmm0","xmm6");
-       &pxor   ("xmm1","xmm1");
        &movhlps("xmm6","xmm1");                # clobber low side with zeros
        &ret    ();
 &function_end_B("_vpaes_schedule_192_smear");