Improve ECB performance (48+14*rounds -> 18+13*rounds) and reserve for
authorAndy Polyakov <appro@openssl.org>
Mon, 24 Jan 2005 14:14:53 +0000 (14:14 +0000)
committerAndy Polyakov <appro@openssl.org>
Mon, 24 Jan 2005 14:14:53 +0000 (14:14 +0000)
hand-coded zero-copy AES_cbc_encrypt.

crypto/aes/asm/aes-ia64.S

index b7d0c9ca80bf870d66178d3fea38d5049d59d69a..542cf335e99558e75539a12bc421d09fc3bccbef 100644 (file)
 // much M-ports as there're I-ports on Itanium 2]. By sacrificing few
 // registers for small constants (255, 24 and 16) to be used with
 // 'shr' and 'and' instructions I can achieve better ILP, Instruction
-// Level Parallelism, and performance. This code outperforms gcc
-// generated code by almost factor of 2 (two). Improvement over HP C
-// is not that impressive, 20%...
+// Level Parallelism, and performance. This code outperforms GCC 3.3
+// generated code by over a factor of 2 (two), GCC 3.4 by 70% and
+// HP C by 40%. The measured best-case (i.e. aligned big-endian
+// input) ECB timing on Itanium 2 is (18 + 13*rounds) ticks per
+// block, or 9.25 CPU cycles per byte for a 128-bit key.
 
-.ident "aes-ia64.S, version 1.0"
+.ident "aes-ia64.S, version 1.1"
 .ident "IA-64 ISA artwork by Andy Polyakov <appro@fy.chalmers.se>"
 .explicit
 .text
@@ -48,129 +50,44 @@ te0=r40;    te1=r41;    te2=r42;    te3=r43;
 # define ADDP  add
 #endif
 
-// Why is the key schedule sparse on 64-bit architectures? When/if we fix
-// it in C, these are the lines to modify accordingly.
+// This implies that AES_KEY comprises 32-bit key schedule elements
+// even on LP64 platforms.
 #ifndef        KSZ
-# define KSZ   8
-# define LDKEY ld8
+# define KSZ   4
+# define LDKEY ld4
 #endif
 
-// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
-// measured timing on Itanium 2 is (48 + 14*rounds) cycles, or
-// 11.75 cycles per byte for 128 bit key...
-.global        AES_encrypt#
-.proc  AES_encrypt#
+.proc  _ia64_AES_encrypt#
+// Input:      rk0-rk1
+//             te0
+//             te3     as AES_KEY->rounds!!!
+//             s0-s3
+//             maskff,twenty4,sixteen
+// Output:     r16,r20,r24,r28 as s0-s3
+// Clobber:    r16-r31,rk0-rk1,r32-r43
 .align 32
-#if !defined(_HPUX_SOURCE)
-.skip  16
-#endif
-AES_encrypt:
-       .prologue
-       .fframe 0
-       .save   ar.pfs,r2
-       .save   ar.lc,r3
-{ .mii;        alloc   r2=ar.pfs,3,10,0,8
-       mov     r3=ar.lc        
-       mov     prsave=pr       };;
-
-       .body
-{ .mmi;        and     r40=3,r32
-       ADDP    r32=0,r32
-       mov     pr.rot=7<<16    };;
-#if defined(_HPUX_SOURCE)      // HPUX is big-endian, cut 15 cycles...
-{ .mib; cmp.ne p6,p0=r40,r0
-       add     r41=4,r32               // 1st arg, borrow teN
-(p6)   br.dpnt.many    .Le_unaligned   };;
-
-{ .mmi;        ld4     r19=[r32],8
-       mov     r44=r33                 // save 2nd arg
-       mov     twenty4=24      }
-{ .mmi;        ld4     r23=[r41],8
-       addl    te0=@ltoff(AES_Te#),gp
-       ADDP    r35=KSZ*60,r34  };;     // &AES_KEY->rounds, borrow s1
-{ .mmi;        ld8     te0=[te0]
-       ld4     r35=[r35]               // AES_KEY->rounds
-       ADDP    rk0=0,r34       }//;;   // 3rd arg
-{ .mmi;        ld4     r27=[r32]
-       ld4     r31=[r41]               
-       ADDP    rk1=KSZ,r34     };;
-
-{ .mfi; LDKEY  t0=[rk0],2*KSZ
-       mov     sixteen=16      }
-{ .mfi;        LDKEY   t1=[rk1],2*KSZ
-       mov     maskff=0xff     };;
-{ .mfi;        LDKEY   t2=[rk0],2*KSZ
-       add     te1=1024,te0    }
+_ia64_AES_encrypt:
+{ .mmi;        alloc   r16=ar.pfs,12,0,0,8
+       LDKEY   t0=[rk0],2*KSZ
+       mov     pr.rot=1<<16    }
+{ .mmi;        LDKEY   t1=[rk1],2*KSZ
+       add     te1=1024,te0
+       add     te3=-3,te3      };;
+{ .mib;        LDKEY   t2=[rk0],2*KSZ
+       mov     ar.ec=3         }
 { .mib;        LDKEY   t3=[rk1],2*KSZ
        add     te2=2048,te0
-       br.many .Le_common      };;
-#endif
-.Le_unaligned:
-{ .mfi;        ADDP    r40=0,r32               // 1st arg, borrow teN
-       ADDP    r41=1,r32       }
-{ .mfi;        ADDP    r42=2,r32
-       ADDP    r43=3,r32       };;
-{ .mmi;        ld1     r16=[r40],4
-       ld1     r17=[r41],4
-       mov     r44=r33         }//;;   // save 2nd arg
-{ .mmi;        ld1     r18=[r42],4
-       ld1     r19=[r43],4
-       ADDP    rk0=0,r34       };;     // 3rd arg
-{ .mmi;        ld1     r20=[r40],4
-       ld1     r21=[r41],4
-       ADDP    rk1=KSZ,r34     }//;;
-{ .mmi;        ld1     r22=[r42],4
-       ld1     r23=[r43],4
-       ADDP    r35=KSZ*60,r34  };;     // &AES_KEY->rounds, borrow s1
-{ .mmi;        ld1     r24=[r40],4
-       ld1     r25=[r41],4
-       mov     twenty4=24      }//;;
-{ .mmi;        ld1     r26=[r42],4
-       ld1     r27=[r43],4
-       mov     sixteen=16      };;
-{ .mmi;        ld1     r28=[r40]
-       ld1     r29=[r41]
-       mov     maskff=0xff     }//;;
-{ .mmi;        ld1     r30=[r42]
-       ld1     r31=[r43]
-       addl    te0=@ltoff(AES_Te#),gp  };;     // that was close...
+       brp.loop.imp    .Le_top,.Le_end-16      };;
 
-{ .mii;        ld8     te0=[te0]
-       dep     r19=r16,r19,24,8        //;;
-       dep     r23=r20,r23,24,8        }//;;
-{ .mii;        ld4     r35=[r35]               // AES_KEY->rounds
-       dep     r27=r24,r27,24,8        //;;
-       dep     r31=r28,r31,24,8        };;
-{ .mii;        LDKEY   t0=[rk0],2*KSZ
-       dep     r19=r17,r19,16,8        //;;
-       dep     r23=r21,r23,16,8        }//;;
-{ .mii;        LDKEY   t1=[rk1],2*KSZ
-       dep     r27=r25,r27,16,8        //;;
-       dep     r31=r29,r31,16,8        };;
-{ .mii;        LDKEY   t2=[rk0],2*KSZ
-       dep     r19=r18,r19,8,8         //;;
-       dep     r23=r22,r23,8,8         }//;;
-{ .mii;        LDKEY   t3=[rk1],2*KSZ
-       dep     r27=r26,r27,8,8         //;;
-       dep     r31=r30,r31,8,8         };;
-       
-{ .mib;        add     te1=1024,te0
-       add     te2=2048,te0    }
-.Le_common:
-{ .mib; add    te3=3072,te0
-       add     r35=-3,r35
-       brp.exit.imp    .Le_rounds_cexit,.Le_cexit_insn
-                               };;
-{ .mii;        mov     ar.lc=r35               // borrowed s1
-       mov     ar.ec=3         };;
-
-{ .mfi;        xor     s0=r19,t0
-       xor     s1=r23,t1       }
-{ .mfi;        xor     s2=r27,t2
-       xor     s3=r31,t3       };;
+{ .mmi;        xor     s0=s0,t0
+       xor     s1=s1,t1
+       mov     ar.lc=te3       }
+{ .mmi;        xor     s2=s2,t2
+       xor     s3=s3,t3
+       add     te3=3072,te0    };;
 
 .align 32
-.Le_rounds:
+.Le_top:
 { .mmi;        (p0)    LDKEY   t0=[rk0],2*KSZ          // 0/0:rk[0]
        (p0)    and     te33=s3,maskff          // 0/0:s3&0xff
        (p0)    extr.u  te22=s2,8,8     }       // 0/0:s2>>8&0xff
@@ -219,103 +136,187 @@ AES_encrypt:
        (p0)    and     te13=te13,maskff}       // 7/2:s3>>16&0xff
 { .mmi;        (p0)    ld4     te03=[te03]             // 7/3:te0[s3>>24]
        (p0)    shladd  te32=te32,2,te3         // 7/3:te3+s2
-       (p16)   cmp.eq  p0,p18=r0,r0    };;     // 7/clear (p18)
+       (p0)    xor     t0=t0,te33      };;     // 7/0:
 { .mmi;        (p0)    ld4     te31=[te31]             // 8/2:te3[s1]
        (p0)    shladd  te13=te13,2,te1         // 8/2:te1+s3>>16
-       (p17)   xor     t0=t0,te33      }       // 8/0:
+       (p0)    xor     t0=t0,te22      }       // 8/0:
 { .mmi;        (p0)    ld4     te32=[te32]             // 8/3:te3[s2]
        (p0)    shladd  te10=te10,2,te1         // 8/3:te1+s0>>16
-       (p17)   xor     t1=t1,te30      };;     // 8/1:
+       (p0)    xor     t1=t1,te30      };;     // 8/1:
 { .mmi;        (p0)    ld4     te13=[te13]             // 9/2:te1[s3>>16]
-       (p17)   xor     t0=t0,te22              // 9/0:
-       (p18)   add     te0=4096,te0    }       // 9/
-.Le_cexit_insn:
-{ .mmb;        (p0)    ld4     te10=[te10]             // 9/3:te1[s0>>16]
-       (p17)   xor     t1=t1,te23              // 9/1:
-       br.cexit.spnt.few       .Le_rounds_cexit
-                                       };;
-{ .mmi;        (p18)   xor     s2=s2,te20              // 10/2:
-       (p18)   xor     s0=s0,te00              // 10/0:
-       (p19)   add     te1=3072,te1    }       // 10/
-{ .mmi;        (p18)   xor     s3=s3,te21              // 10/3:
-       (p18)   xor     s1=s1,te01              // 10/1:
-       (p19)   add     te2=2048,te2    };;     // 10/
-{ .mfi;        (p18)   xor     s0=s0,te11              // 11/0:done!
-       (p18)   xor     s2=s2,te02      }       // 11/2:
-{ .mfi;        (p18)   xor     s1=s1,te12              // 11/1:done!
-       (p18)   xor     s3=s3,te03      };;     // 11/3:
-{ .mmi;        (p18)   xor     s2=s2,te31              // 12/2:
-       (p18)   xor     s3=s3,te32              // 12/3:
-       (p19)   add     te3=1024,te3    };;     // 12/
-{ .mib;        (p18)   xor     s2=s2,te13              // 13/2:done!
-       (p18)   xor     s3=s3,te10              // 13/3:done!
-       br      .Le_rounds      };;
+       (p0)    xor     t0=t0,te00              // 9/0:
+       (p0)    xor     t1=t1,te23      }       // 9/1:         
+{ .mmi;        (p0)    ld4     te10=[te10]             // 9/3:te1[s0>>16]
+       (p0)    xor     t2=t2,te20              // 9/2:
+       (p0)    xor     t3=t3,te21      };;     // 9/3:
+{ .mmi;        (p0)    xor     t0=t0,te11              // 10/0:done!
+       (p0)    xor     t1=t1,te01              // 10/1:
+       (p0)    xor     t2=t2,te02      }       // 10/2:
+{ .mmi;        (p0)    xor     t3=t3,te03              // 10/3:
+       (p16)   cmp.eq  p0,p17=r0,r0    };;     // 10/clear (p17)
+{ .mmi;        (p0)    xor     t1=t1,te12              // 11/1:done!
+       (p0)    xor     t2=t2,te31              // 11/2:
+       (p0)    xor     t3=t3,te32      }       // 11/3:
+{ .mmi;        (p17)   add     te0=4096,te0            // 11/  
+       (p17)   add     te1=4096,te1    };;     // 11/
+{ .mib;        (p0)    xor     t2=t2,te13              // 12/2:done!
+       (p0)    xor     t3=t3,te10      }       // 12/3:done!
+{ .mib;        (p17)   add     te2=4096,te2            // 12/
+       (p17)   add     te3=4096,te3            // 12/
+       br.ctop.sptk    .Le_top         };;
+.Le_end:                               // reached when br.ctop falls through: hand the four state words back
+{ .mib;        mov     r16=s0          // result word 0 -> r16
+       mov     r20=s1                  }       // result word 1 -> r20
+{ .mib;        mov     r24=s2          // result word 2 -> r24
+       mov     r28=s3                  // result word 3 -> r28
+       br.ret.sptk     b6              };;     // return through b6, the link register named by the wrappers' br.call
+.endp  _ia64_AES_encrypt#
 
+// void AES_encrypt (const void *in,void *out,const AES_KEY *key);
+.global        AES_encrypt#
+.proc  AES_encrypt#
 .align 32
-.Le_rounds_cexit:
-{ .mfi;        xor     te00=te00,s0            // "s0"
-       xor     te11=te11,s0    }
-{ .mfi;        xor     te22=te22,s0
-       xor     te33=te33,s0    }
-{ .mib;        xor     te01=te01,s1            // "s1"
-       xor     te12=te12,s1    }
-{ .mib;        xor     te23=te23,s1
-       xor     te30=te30,s1    }
-{ .mfi;        xor     te02=te02,s2            // "s2"
-       xor     te13=te13,s2    }
-{ .mfi;        xor     te20=te20,s2
-       xor     te31=te31,s2    }
-{ .mib;        xor     te03=te03,s3            // "s3"
-       xor     te10=te10,s3    }
-{ .mib;        xor     te21=te21,s3
-       xor     te32=te32,s3    };;
+.skip  16
+AES_encrypt:                           // public entry point: in0 = in, in1 = out, in2 = key
+       .prologue
+       .fframe 0
+       .save   ar.pfs,r2
+       .save   ar.lc,r3
+{ .mmi;        alloc   r2=ar.pfs,3,0,12,0      // 3 inputs, 0 locals, 12 outputs; previous ar.pfs kept in r2
+       addl    out8=@ltoff(AES_Te#),gp         // out8 = gp-relative linkage-table slot holding &AES_Te
+       mov     r3=ar.lc                }       // keep the caller's ar.lc in r3
+{ .mmi;        and     out0=3,in0              // out0 = low two bits of the input pointer (alignment test)
+       ADDP    in0=0,in0                       // pass the input pointer through the ADDP macro
+       ADDP    out11=KSZ*60,in2        };;     // &AES_KEY->rounds
+
+       .body
+{ .mmi;        ld8     out8=[out8]                     // Te0: base address of the AES_Te table
+       ld4     out11=[out11]                   // AES_KEY->rounds
+       mov     prsave=pr               }       // save the predicates; restored by "mov pr=prsave" before returning
+
+#if defined(_HPUX_SOURCE)      // HPUX is big-endian, cut 15+15 cycles...
+{ .mib; cmp.ne p6,p0=out0,r0           // p6 = input pointer is not 4-byte aligned
+       add     out0=4,in0                      // out0 = in+4, second word-load pointer
+(p6)   br.dpnt.many    .Le_i_unaligned };;     // misaligned input: use the byte-wise loader instead
+
+{ .mmi;        ld4     out1=[in0],8            // s0
+       and     out9=3,in1                      // out9 = low two bits of the output pointer
+       mov     twenty4=24              }       // constant 24, an input of the core routine
+{ .mmi;        ld4     out3=[out0],8           // s1
+       ADDP    rk0=0,in2                       // rk0 = key pointer
+       mov     sixteen=16              };;     // constant 16, an input of the core routine
+{ .mmi;        ld4     out5=[in0]              // s2
+       cmp.ne  p6,p0=out9,r0                   // p6 = output pointer is not 4-byte aligned (tested after the call)
+       mov     maskff=0xff             }       // constant 0xff, an input of the core routine
+{ .mmb;        ld4     out7=[out0]             // s3
+       ADDP    rk1=KSZ,in2                     // rk1 = key pointer + KSZ
+       br.call.sptk.many       b6=_ia64_AES_encrypt    };;     // run the rounds; result comes back in r16/r20/r24/r28
+
-{ .mii;        ADDP    r40=0,r44               // saved 2nd argument, snatch teN
-       extr.u  te22=te22,8,8
-       shr.u   te00=te00,twenty4       }//;;
-{ .mii;        ADDP    r41=1,r44
-       extr.u  te11=te11,16,8
-       shr.u   te01=te01,twenty4       }//;;
-{ .mii;        ADDP    r42=2,r44
-       extr.u  te23=te23,8,8
-       shr.u   te12=te12,sixteen       }//;;
-{ .mii;        ADDP    r43=3,r44
-       extr.u  te20=te20,8,8
-       shr.u   te02=te02,twenty4       };;
-{ .mii;        st1     [r43]=te33,4
-       extr.u  te13=te13,16,8
-       shr.u   te03=te03,twenty4       }//;;
-{ .mii;        st1     [r42]=te22,4
-       extr.u  te21=te21,8,8
-       shr.u   te10=te10,sixteen       }//;;
+{ .mib;        ADDP    in0=4,in1               // in0 is reused as the second store pointer: out+4
+       ADDP    in1=0,in1                       // in1 = out, first store pointer
+(p6)   br.spnt .Le_o_unaligned         };;     // misaligned output: store byte by byte instead
+
-{ .mmi;        st1     [r41]=te11,4
-       st1     [r40]=te00,4            };;
-{ .mmi;        st1     [r43]=te30,4
-       st1     [r42]=te23,4            }//;;
-{ .mmi;        st1     [r41]=te12,4
-       st1     [r40]=te01,4            };;
-{ .mmi;        st1     [r43]=te31,4
-       st1     [r42]=te20,4            }//;;
-{ .mmi;        st1     [r41]=te13,4
-       st1     [r40]=te02,4
+{ .mii;        mov     ar.pfs=r2               // restore the ar.pfs value saved by alloc
+       mov     ar.lc=r3                }       // restore the caller's ar.lc
+{ .mmi;        st4     [in1]=r16,8             // s0
+       st4     [in0]=r20,8             // s1
        mov     pr=prsave,0x1ffff       };;     // restore the predicates saved in prsave
-{ .mmi;        st1     [r43]=te32
-       st1     [r42]=te21
+{ .mmb;        st4     [in1]=r24               // s2
+       st4     [in0]=r28               // s3
+       br.ret.sptk.many        b0      };;     // return to the caller
+#endif
+
+.align 32
+.Le_i_unaligned:                       // byte-wise loader: builds s0-s3 (out1/out3/out5/out7) from 16 single-byte loads
+{ .mmi;        add     out0=1,in0      // out0 = in+1
+       add     out2=2,in0              // out2 = in+2
+       add     out4=3,in0      };;     // out4 = in+3
+{ .mmi;        ld1     r16=[in0],4     // word 0, byte 0; each pointer then steps 4 bytes to the next word
+       ld1     r17=[out0],4    }//;;   // word 0, byte 1
+{ .mmi;        ld1     r18=[out2],4    // word 0, byte 2
+       ld1     out1=[out4],4   };;     // s0
+{ .mmi;        ld1     r20=[in0],4     // word 1, byte 0
+       ld1     r21=[out0],4    }//;;   // word 1, byte 1
+{ .mmi;        ld1     r22=[out2],4    // word 1, byte 2
+       ld1     out3=[out4],4   };;     // s1
+{ .mmi;        ld1     r24=[in0],4     // word 2, byte 0
+       ld1     r25=[out0],4    }//;;   // word 2, byte 1
+{ .mmi;        ld1     r26=[out2],4    // word 2, byte 2
+       ld1     out5=[out4],4   };;     // s2
+{ .mmi;        ld1     r28=[in0]       // word 3, byte 0
+       ld1     r29=[out0]      }//;;   // word 3, byte 1
+{ .mmi;        ld1     r30=[out2]      // word 3, byte 2
+       ld1     out7=[out4]     };;     // s3
+
+{ .mii;                                // each sN so far holds only byte 3 (its low byte); deposit the other three above it
+       dep     out1=r16,out1,24,8      //;; s0: byte 0 -> bits 24-31
+       dep     out3=r20,out3,24,8      }//;; s1: byte 0 -> bits 24-31
+{ .mii;        ADDP    rk0=0,in2       // rk0 = key pointer
+       dep     out5=r24,out5,24,8      //;; s2: byte 0 -> bits 24-31
+       dep     out7=r28,out7,24,8      };;     // s3: byte 0 -> bits 24-31
+{ .mii;        ADDP    rk1=KSZ,in2     // rk1 = key pointer + KSZ
+       dep     out1=r17,out1,16,8      //;; s0: byte 1 -> bits 16-23
+       dep     out3=r21,out3,16,8      }//;; s1: byte 1 -> bits 16-23
+{ .mii;        mov     twenty4=24      // constant 24, an input of the core routine
+       dep     out5=r25,out5,16,8      //;; s2: byte 1 -> bits 16-23
+       dep     out7=r29,out7,16,8      };;     // s3: byte 1 -> bits 16-23
+{ .mii;        mov     sixteen=16      // constant 16, an input of the core routine
+       dep     out1=r18,out1,8,8       //;; s0: byte 2 -> bits 8-15
+       dep     out3=r22,out3,8,8       }//;; s1: byte 2 -> bits 8-15
+{ .mii;        mov     maskff=0xff     // constant 0xff, an input of the core routine
+       dep     out5=r26,out5,8,8       //;; s2: byte 2 -> bits 8-15
+       dep     out7=r30,out7,8,8       };;     // s3: byte 2 -> bits 8-15
+
+{ .mib;        br.call.sptk.many       b6=_ia64_AES_encrypt    };;     // run the rounds; on return execution falls through to the byte-wise store
+
+.Le_o_unaligned:                       // byte-wise store of r16/r20/r24/r28, most significant byte first, then return
+{ .mii;        ADDP    out0=0,in1      // out0 = out   (receives bits 24-31 of each word)
+       extr.u  r17=r16,8,8                     // s0
+       shr.u   r19=r16,twenty4         }//;; r19 = s0 >> 24
+{ .mii;        ADDP    out1=1,in1      // out1 = out+1 (receives bits 16-23)
+       extr.u  r18=r16,16,8            // r18 = bits 16-23 of s0
+       shr.u   r23=r20,twenty4         }//;;   // s1
+{ .mii;        ADDP    out2=2,in1      // out2 = out+2 (receives bits 8-15)
+       extr.u  r21=r20,8,8             // r21 = bits 8-15 of s1
+       shr.u   r22=r20,sixteen }//;; r22 = s1 >> 16; st1 stores only its low byte
+{ .mii;        ADDP    out3=3,in1      // out3 = out+3 (receives bits 0-7)
+       extr.u  r25=r24,8,8                     // s2
+       shr.u   r27=r24,twenty4         };;     // r27 = s2 >> 24
+{ .mii;        st1     [out3]=r16,4    // low byte of s0; every pointer steps 4 bytes to the next word
+       extr.u  r26=r24,16,8            // r26 = bits 16-23 of s2
+       shr.u   r31=r28,twenty4 }//;;   // s3
+{ .mii;        st1     [out2]=r17,4    // bits 8-15 of s0
+       extr.u  r29=r28,8,8             // r29 = bits 8-15 of s3
+       shr.u   r30=r28,sixteen         }//;; r30 = s3 >> 16; st1 stores only its low byte
+
+{ .mmi;        st1     [out1]=r18,4    // bits 16-23 of s0
+       st1     [out0]=r19,4            };;     // bits 24-31 of s0
+{ .mmi;        st1     [out3]=r20,4    // low byte of s1
+       st1     [out2]=r21,4            }//;; bits 8-15 of s1
+{ .mmi;        st1     [out1]=r22,4    // bits 16-23 of s1
+       st1     [out0]=r23,4            };;     // bits 24-31 of s1
+{ .mmi;        st1     [out3]=r24,4    // low byte of s2
+       st1     [out2]=r25,4            // bits 8-15 of s2
+       mov     pr=prsave,0x1ffff       }//;; restore the predicates saved in prsave
+{ .mmi;        st1     [out1]=r26,4    // bits 16-23 of s2
+       st1     [out0]=r27,4            // bits 24-31 of s2
+       mov     ar.pfs=r2               };;     // restore the ar.pfs value saved by alloc
+{ .mmi;        st1     [out3]=r28      // low byte of s3
+       st1     [out2]=r29              // bits 8-15 of s3
        mov     ar.lc=r3                }//;; restore the caller's ar.lc
-{ .mmb;        st1     [r41]=te10
-       st1     [r40]=te03
+{ .mmb;        st1     [out1]=r30      // bits 16-23 of s3
+       st1     [out0]=r31              // bits 24-31 of s3
        br.ret.sptk.many        b0      };;     // return to the caller
 .endp  AES_encrypt#
 
-// AES_decrypt is autogenerated by the following script:
+// *AES_decrypt are autogenerated by the following script:
 #if 0
 #!/usr/bin/env perl
-print "// AES_decrypt is autogenerated by the following script:\n#if 0\n";
+print "// *AES_decrypt are autogenerated by the following script:\n#if 0\n";
 open(PROG,'<'.$0); while(<PROG>) { print; } close(PROG);
 print "#endif\n";
 while(<>) {
-       $process=1      if (/\.global\s+AES_encrypt/);
+       $process=1      if (/\.proc\s+_ia64_AES_encrypt/);
        next            if (!$process);
 
        #s/te00=s0/td00=s0/;    s/te00/td00/g;
@@ -349,119 +350,37 @@ while(<>) {
        exit            if (/\.endp\s+AES_decrypt/);
 }
 #endif
-.global        AES_decrypt#
-.proc  AES_decrypt#
+.proc  _ia64_AES_decrypt#
+// Input:      rk0-rk1
+//             te0
+//             te3     as AES_KEY->rounds!!!
+//             s0-s3
+//             maskff,twenty4,sixteen
+// Output:     r16,r20,r24,r28 as s0-s3
+// Clobber:    r16-r31,rk0-rk1,r32-r43
 .align 32
-#if !defined(_HPUX_SOURCE)
-.skip  16
-#endif
-AES_decrypt:
-       .prologue
-       .fframe 0
-       .save   ar.pfs,r2
-       .save   ar.lc,r3
-{ .mii;        alloc   r2=ar.pfs,3,10,0,8
-       mov     r3=ar.lc        
-       mov     prsave=pr       };;
-
-       .body
-{ .mmi;        and     r40=3,r32
-       ADDP    r32=0,r32
-       mov     pr.rot=7<<16    };;
-#if defined(_HPUX_SOURCE)      // HPUX is big-endian, cut 15 cycles...
-{ .mib; cmp.ne p6,p0=r40,r0
-       add     r41=4,r32               // 1st arg, borrow teN
-(p6)   br.dpnt.many    .Ld_unaligned   };;
-
-{ .mmi;        ld4     r19=[r32],8
-       mov     r44=r33                 // save 2nd arg
-       mov     twenty4=24      }
-{ .mmi;        ld4     r23=[r41],8
-       addl    te0=@ltoff(AES_Td#),gp
-       ADDP    r35=KSZ*60,r34  };;     // &AES_KEY->rounds, borrow s1
-{ .mmi;        ld8     te0=[te0]
-       ld4     r35=[r35]               // AES_KEY->rounds
-       ADDP    rk0=0,r34       }//;;   // 3rd arg
-{ .mmi;        ld4     r27=[r32]
-       ld4     r31=[r41]               
-       ADDP    rk1=KSZ,r34     };;
-
-{ .mfi; LDKEY  t0=[rk0],2*KSZ
-       mov     sixteen=16      }
-{ .mfi;        LDKEY   t1=[rk1],2*KSZ
-       mov     maskff=0xff     };;
-{ .mfi;        LDKEY   t2=[rk0],2*KSZ
-       add     te1=1024,te0    }
+_ia64_AES_decrypt:
+{ .mmi;        alloc   r16=ar.pfs,12,0,0,8
+       LDKEY   t0=[rk0],2*KSZ
+       mov     pr.rot=1<<16    }
+{ .mmi;        LDKEY   t1=[rk1],2*KSZ
+       add     te1=1024,te0
+       add     te3=-3,te3      };;
+{ .mib;        LDKEY   t2=[rk0],2*KSZ
+       mov     ar.ec=3         }
 { .mib;        LDKEY   t3=[rk1],2*KSZ
        add     te2=2048,te0
-       br.many .Ld_common      };;
-#endif
-.Ld_unaligned:
-{ .mfi;        ADDP    r40=0,r32               // 1st arg, borrow teN
-       ADDP    r41=1,r32       }
-{ .mfi;        ADDP    r42=2,r32
-       ADDP    r43=3,r32       };;
-{ .mmi;        ld1     r16=[r40],4
-       ld1     r17=[r41],4
-       mov     r44=r33         }//;;   // save 2nd arg
-{ .mmi;        ld1     r18=[r42],4
-       ld1     r19=[r43],4
-       ADDP    rk0=0,r34       };;     // 3rd arg
-{ .mmi;        ld1     r20=[r40],4
-       ld1     r21=[r41],4
-       ADDP    rk1=KSZ,r34     }//;;
-{ .mmi;        ld1     r22=[r42],4
-       ld1     r23=[r43],4
-       ADDP    r35=KSZ*60,r34  };;     // &AES_KEY->rounds, borrow s1
-{ .mmi;        ld1     r24=[r40],4
-       ld1     r25=[r41],4
-       mov     twenty4=24      }//;;
-{ .mmi;        ld1     r26=[r42],4
-       ld1     r27=[r43],4
-       mov     sixteen=16      };;
-{ .mmi;        ld1     r28=[r40]
-       ld1     r29=[r41]
-       mov     maskff=0xff     }//;;
-{ .mmi;        ld1     r30=[r42]
-       ld1     r31=[r43]
-       addl    te0=@ltoff(AES_Td#),gp  };;     // that was close...
-
-{ .mii;        ld8     te0=[te0]
-       dep     r19=r16,r19,24,8        //;;
-       dep     r23=r20,r23,24,8        }//;;
-{ .mii;        ld4     r35=[r35]               // AES_KEY->rounds
-       dep     r27=r24,r27,24,8        //;;
-       dep     r31=r28,r31,24,8        };;
-{ .mii;        LDKEY   t0=[rk0],2*KSZ
-       dep     r19=r17,r19,16,8        //;;
-       dep     r23=r21,r23,16,8        }//;;
-{ .mii;        LDKEY   t1=[rk1],2*KSZ
-       dep     r27=r25,r27,16,8        //;;
-       dep     r31=r29,r31,16,8        };;
-{ .mii;        LDKEY   t2=[rk0],2*KSZ
-       dep     r19=r18,r19,8,8         //;;
-       dep     r23=r22,r23,8,8         }//;;
-{ .mii;        LDKEY   t3=[rk1],2*KSZ
-       dep     r27=r26,r27,8,8         //;;
-       dep     r31=r30,r31,8,8         };;
-       
-{ .mib;        add     te1=1024,te0
-       add     te2=2048,te0    }
-.Ld_common:
-{ .mib; add    te3=3072,te0
-       add     r35=-3,r35
-       brp.exit.imp    .Ld_rounds_cexit,.Ld_cexit_insn
-                               };;
-{ .mii;        mov     ar.lc=r35               // borrowed s1
-       mov     ar.ec=3         };;
+       brp.loop.imp    .Ld_top,.Ld_end-16      };;
 
-{ .mfi;        xor     s0=r19,t0
-       xor     s1=r23,t1       }
-{ .mfi;        xor     s2=r27,t2
-       xor     s3=r31,t3       };;
+{ .mmi;        xor     s0=s0,t0
+       xor     s1=s1,t1
+       mov     ar.lc=te3       }
+{ .mmi;        xor     s2=s2,t2
+       xor     s3=s3,t3
+       add     te3=3072,te0    };;
 
 .align 32
-.Ld_rounds:
+.Ld_top:
 { .mmi;        (p0)    LDKEY   t0=[rk0],2*KSZ          // 0/0:rk[0]
        (p0)    and     te31=s1,maskff          // 0/0:s3&0xff
        (p0)    extr.u  te22=s2,8,8     }       // 0/0:s2>>8&0xff
@@ -510,92 +429,176 @@ AES_decrypt:
        (p0)    and     te11=te11,maskff}       // 7/2:s3>>16&0xff
 { .mmi;        (p0)    ld4     te03=[te03]             // 7/3:te0[s3>>24]
        (p0)    shladd  te30=te30,2,te3         // 7/3:te3+s2
-       (p16)   cmp.eq  p0,p18=r0,r0    };;     // 7/clear (p18)
+       (p0)    xor     t0=t0,te31      };;     // 7/0:
 { .mmi;        (p0)    ld4     te33=[te33]             // 8/2:te3[s1]
        (p0)    shladd  te11=te11,2,te1         // 8/2:te1+s3>>16
-       (p17)   xor     t0=t0,te31      }       // 8/0:
+       (p0)    xor     t0=t0,te22      }       // 8/0:
 { .mmi;        (p0)    ld4     te30=[te30]             // 8/3:te3[s2]
        (p0)    shladd  te12=te12,2,te1         // 8/3:te1+s0>>16
-       (p17)   xor     t1=t1,te32      };;     // 8/1:
+       (p0)    xor     t1=t1,te32      };;     // 8/1:
 { .mmi;        (p0)    ld4     te11=[te11]             // 9/2:te1[s3>>16]
-       (p17)   xor     t0=t0,te22              // 9/0:
-       (p18)   add     te0=4096,te0    }       // 9/
-.Ld_cexit_insn:
-{ .mmb;        (p0)    ld4     te12=[te12]             // 9/3:te1[s0>>16]
-       (p17)   xor     t1=t1,te23              // 9/1:
-       br.cexit.spnt.few       .Ld_rounds_cexit
-                                       };;
-{ .mmi;        (p18)   xor     s2=s2,te20              // 10/2:
-       (p18)   xor     s0=s0,te00              // 10/0:
-       (p19)   add     te1=3072,te1    }       // 10/
-{ .mmi;        (p18)   xor     s3=s3,te21              // 10/3:
-       (p18)   xor     s1=s1,te01              // 10/1:
-       (p19)   add     te2=2048,te2    };;     // 10/
-{ .mfi;        (p18)   xor     s0=s0,te13              // 11/0:done!
-       (p18)   xor     s2=s2,te02      }       // 11/2:
-{ .mfi;        (p18)   xor     s1=s1,te10              // 11/1:done!
-       (p18)   xor     s3=s3,te03      };;     // 11/3:
-{ .mmi;        (p18)   xor     s2=s2,te33              // 12/2:
-       (p18)   xor     s3=s3,te30              // 12/3:
-       (p19)   add     te3=1024,te3    };;     // 12/
-{ .mib;        (p18)   xor     s2=s2,te11              // 13/2:done!
-       (p18)   xor     s3=s3,te12              // 13/3:done!
-       br      .Ld_rounds      };;
+       (p0)    xor     t0=t0,te00              // 9/0:
+       (p0)    xor     t1=t1,te23      }       // 9/1:         
+{ .mmi;        (p0)    ld4     te12=[te12]             // 9/3:te1[s0>>16]
+       (p0)    xor     t2=t2,te20              // 9/2:
+       (p0)    xor     t3=t3,te21      };;     // 9/3:
+{ .mmi;        (p0)    xor     t0=t0,te13              // 10/0:done!
+       (p0)    xor     t1=t1,te01              // 10/1:
+       (p0)    xor     t2=t2,te02      }       // 10/2:
+{ .mmi;        (p0)    xor     t3=t3,te03              // 10/3:
+       (p16)   cmp.eq  p0,p17=r0,r0    };;     // 10/clear (p17)
+{ .mmi;        (p0)    xor     t1=t1,te10              // 11/1:done!
+       (p0)    xor     t2=t2,te33              // 11/2:
+       (p0)    xor     t3=t3,te30      }       // 11/3:
+{ .mmi;        (p17)   add     te0=4096,te0            // 11/  
+       (p17)   add     te1=4096,te1    };;     // 11/
+{ .mib;        (p0)    xor     t2=t2,te11              // 12/2:done!
+       (p0)    xor     t3=t3,te12      }       // 12/3:done!
+{ .mib;        (p17)   add     te2=4096,te2            // 12/
+       (p17)   add     te3=4096,te3            // 12/
+       br.ctop.sptk    .Ld_top         };;
+.Ld_end:                               // reached when br.ctop falls through: hand the four state words back
+{ .mib;        mov     r16=s0          // result word 0 -> r16
+       mov     r20=s1                  }       // result word 1 -> r20
+{ .mib;        mov     r24=s2          // result word 2 -> r24
+       mov     r28=s3                  // result word 3 -> r28
+       br.ret.sptk     b6              };;     // return through b6, the link register named by the wrappers' br.call
+.endp  _ia64_AES_decrypt#
 
+// void AES_decrypt (const void *in,void *out,const AES_KEY *key);
+.global        AES_decrypt#
+.proc  AES_decrypt#
 .align 32
-.Ld_rounds_cexit:
-{ .mfi;        xor     te00=te00,s0            // "s0"
-       xor     te13=te13,s0    }
-{ .mfi;        xor     te22=te22,s0
-       xor     te31=te31,s0    }
-{ .mib;        xor     te01=te01,s1            // "s1"
-       xor     te10=te10,s1    }
-{ .mib;        xor     te23=te23,s1
-       xor     te32=te32,s1    }
-{ .mfi;        xor     te02=te02,s2            // "s2"
-       xor     te11=te11,s2    }
-{ .mfi;        xor     te20=te20,s2
-       xor     te33=te33,s2    }
-{ .mib;        xor     te03=te03,s3            // "s3"
-       xor     te12=te12,s3    }
-{ .mib;        xor     te21=te21,s3
-       xor     te30=te30,s3    };;
+.skip  16
+AES_decrypt:                           // public entry point: in0 = in, in1 = out, in2 = key
+       .prologue
+       .fframe 0
+       .save   ar.pfs,r2
+       .save   ar.lc,r3
+{ .mmi;        alloc   r2=ar.pfs,3,0,12,0      // 3 inputs, 0 locals, 12 outputs; previous ar.pfs kept in r2
+       addl    out8=@ltoff(AES_Td#),gp         // out8 = gp-relative linkage-table slot holding &AES_Td
+       mov     r3=ar.lc                }       // keep the caller's ar.lc in r3
+{ .mmi;        and     out0=3,in0              // out0 = low two bits of the input pointer (alignment test)
+       ADDP    in0=0,in0                       // pass the input pointer through the ADDP macro
+       ADDP    out11=KSZ*60,in2        };;     // &AES_KEY->rounds
+
+       .body
+{ .mmi;        ld8     out8=[out8]                     // Td0: base address of the AES_Td table
+       ld4     out11=[out11]                   // AES_KEY->rounds
+       mov     prsave=pr               }       // save the predicates; restored by "mov pr=prsave" before returning
+
+#if defined(_HPUX_SOURCE)      // HPUX is big-endian, cut 15+15 cycles...
+{ .mib; cmp.ne p6,p0=out0,r0           // p6 = input pointer is not 4-byte aligned
+       add     out0=4,in0                      // out0 = in+4, second word-load pointer
+(p6)   br.dpnt.many    .Ld_i_unaligned };;     // misaligned input: use the byte-wise loader instead
 
-{ .mii;        ADDP    r40=0,r44               // saved 2nd argument, snatch teN
-       extr.u  te22=te22,8,8
-       shr.u   te00=te00,twenty4       }//;;
-{ .mii;        ADDP    r41=1,r44
-       extr.u  te13=te13,16,8
-       shr.u   te01=te01,twenty4       }//;;
-{ .mii;        ADDP    r42=2,r44
-       extr.u  te23=te23,8,8
-       shr.u   te10=te10,sixteen       }//;;
-{ .mii;        ADDP    r43=3,r44
-       extr.u  te20=te20,8,8
-       shr.u   te02=te02,twenty4       };;
-{ .mii;        st1     [r43]=te31,4
-       extr.u  te11=te11,16,8
-       shr.u   te03=te03,twenty4       }//;;
-{ .mii;        st1     [r42]=te22,4
-       extr.u  te21=te21,8,8
-       shr.u   te12=te12,sixteen       }//;;
+{ .mmi;        ld4     out1=[in0],8            // s0
+       and     out9=3,in1                      // out9 = low two bits of the output pointer
+       mov     twenty4=24              }       // constant 24, an input of the core routine
+{ .mmi;        ld4     out3=[out0],8           // s1
+       ADDP    rk0=0,in2                       // rk0 = key pointer
+       mov     sixteen=16              };;     // constant 16, an input of the core routine
+{ .mmi;        ld4     out5=[in0]              // s2
+       cmp.ne  p6,p0=out9,r0                   // p6 = output pointer is not 4-byte aligned (tested after the call)
+       mov     maskff=0xff             }       // constant 0xff, an input of the core routine
+{ .mmb;        ld4     out7=[out0]             // s3
+       ADDP    rk1=KSZ,in2                     // rk1 = key pointer + KSZ
+       br.call.sptk.many       b6=_ia64_AES_decrypt    };;     // run the rounds; result comes back in r16/r20/r24/r28
 
-{ .mmi;        st1     [r41]=te13,4
-       st1     [r40]=te00,4            };;
-{ .mmi;        st1     [r43]=te32,4
-       st1     [r42]=te23,4            }//;;
-{ .mmi;        st1     [r41]=te10,4
-       st1     [r40]=te01,4            };;
-{ .mmi;        st1     [r43]=te33,4
-       st1     [r42]=te20,4            }//;;
-{ .mmi;        st1     [r41]=te11,4
-       st1     [r40]=te02,4
+{ .mib;        ADDP    in0=4,in1               // in0 is reused as the second store pointer: out+4
+       ADDP    in1=0,in1                       // in1 = out, first store pointer
+(p6)   br.spnt .Ld_o_unaligned         };;     // misaligned output: store byte by byte instead
+
+{ .mii;        mov     ar.pfs=r2               // restore the ar.pfs value saved by alloc
+       mov     ar.lc=r3                }       // restore the caller's ar.lc
+{ .mmi;        st4     [in1]=r16,8             // s0
+       st4     [in0]=r20,8             // s1
        mov     pr=prsave,0x1ffff       };;     // restore the predicates saved in prsave
-{ .mmi;        st1     [r43]=te30
-       st1     [r42]=te21
+{ .mmb;        st4     [in1]=r24               // s2
+       st4     [in0]=r28               // s3
+       br.ret.sptk.many        b0      };;     // return to the caller
+#endif
+
+.align 32
+.Ld_i_unaligned:                       // byte-wise loader: builds s0-s3 (out1/out3/out5/out7) from 16 single-byte loads
+{ .mmi;        add     out0=1,in0      // out0 = in+1
+       add     out2=2,in0              // out2 = in+2
+       add     out4=3,in0      };;     // out4 = in+3
+{ .mmi;        ld1     r16=[in0],4     // word 0, byte 0; each pointer then steps 4 bytes to the next word
+       ld1     r17=[out0],4    }//;;   // word 0, byte 1
+{ .mmi;        ld1     r18=[out2],4    // word 0, byte 2
+       ld1     out1=[out4],4   };;     // s0
+{ .mmi;        ld1     r20=[in0],4     // word 1, byte 0
+       ld1     r21=[out0],4    }//;;   // word 1, byte 1
+{ .mmi;        ld1     r22=[out2],4    // word 1, byte 2
+       ld1     out3=[out4],4   };;     // s1
+{ .mmi;        ld1     r24=[in0],4     // word 2, byte 0
+       ld1     r25=[out0],4    }//;;   // word 2, byte 1
+{ .mmi;        ld1     r26=[out2],4    // word 2, byte 2
+       ld1     out5=[out4],4   };;     // s2
+{ .mmi;        ld1     r28=[in0]       // word 3, byte 0
+       ld1     r29=[out0]      }//;;   // word 3, byte 1
+{ .mmi;        ld1     r30=[out2]      // word 3, byte 2
+       ld1     out7=[out4]     };;     // s3
+
+{ .mii;                                // each sN so far holds only byte 3 (its low byte); deposit the other three above it
+       dep     out1=r16,out1,24,8      //;; s0: byte 0 -> bits 24-31
+       dep     out3=r20,out3,24,8      }//;; s1: byte 0 -> bits 24-31
+{ .mii;        ADDP    rk0=0,in2       // rk0 = key pointer
+       dep     out5=r24,out5,24,8      //;; s2: byte 0 -> bits 24-31
+       dep     out7=r28,out7,24,8      };;     // s3: byte 0 -> bits 24-31
+{ .mii;        ADDP    rk1=KSZ,in2     // rk1 = key pointer + KSZ
+       dep     out1=r17,out1,16,8      //;; s0: byte 1 -> bits 16-23
+       dep     out3=r21,out3,16,8      }//;; s1: byte 1 -> bits 16-23
+{ .mii;        mov     twenty4=24      // constant 24, an input of the core routine
+       dep     out5=r25,out5,16,8      //;; s2: byte 1 -> bits 16-23
+       dep     out7=r29,out7,16,8      };;     // s3: byte 1 -> bits 16-23
+{ .mii;        mov     sixteen=16      // constant 16, an input of the core routine
+       dep     out1=r18,out1,8,8       //;; s0: byte 2 -> bits 8-15
+       dep     out3=r22,out3,8,8       }//;; s1: byte 2 -> bits 8-15
+{ .mii;        mov     maskff=0xff     // constant 0xff, an input of the core routine
+       dep     out5=r26,out5,8,8       //;; s2: byte 2 -> bits 8-15
+       dep     out7=r30,out7,8,8       };;     // s3: byte 2 -> bits 8-15
+
+{ .mib;        br.call.sptk.many       b6=_ia64_AES_decrypt    };;     // run the rounds; on return execution falls through to the byte-wise store
+
+.Ld_o_unaligned:                       // byte-wise store of r16/r20/r24/r28, most significant byte first, then return
+{ .mii;        ADDP    out0=0,in1      // out0 = out   (receives bits 24-31 of each word)
+       extr.u  r17=r16,8,8                     // s0
+       shr.u   r19=r16,twenty4         }//;; r19 = s0 >> 24
+{ .mii;        ADDP    out1=1,in1      // out1 = out+1 (receives bits 16-23)
+       extr.u  r18=r16,16,8            // r18 = bits 16-23 of s0
+       shr.u   r23=r20,twenty4         }//;;   // s1
+{ .mii;        ADDP    out2=2,in1      // out2 = out+2 (receives bits 8-15)
+       extr.u  r21=r20,8,8             // r21 = bits 8-15 of s1
+       shr.u   r22=r20,sixteen }//;; r22 = s1 >> 16; st1 stores only its low byte
+{ .mii;        ADDP    out3=3,in1      // out3 = out+3 (receives bits 0-7)
+       extr.u  r25=r24,8,8                     // s2
+       shr.u   r27=r24,twenty4         };;     // r27 = s2 >> 24
+{ .mii;        st1     [out3]=r16,4    // low byte of s0; every pointer steps 4 bytes to the next word
+       extr.u  r26=r24,16,8            // r26 = bits 16-23 of s2
+       shr.u   r31=r28,twenty4 }//;;   // s3
+{ .mii;        st1     [out2]=r17,4    // bits 8-15 of s0
+       extr.u  r29=r28,8,8             // r29 = bits 8-15 of s3
+       shr.u   r30=r28,sixteen         }//;; r30 = s3 >> 16; st1 stores only its low byte
+
+{ .mmi;        st1     [out1]=r18,4    // bits 16-23 of s0
+       st1     [out0]=r19,4            };;     // bits 24-31 of s0
+{ .mmi;        st1     [out3]=r20,4    // low byte of s1
+       st1     [out2]=r21,4            }//;; bits 8-15 of s1
+{ .mmi;        st1     [out1]=r22,4    // bits 16-23 of s1
+       st1     [out0]=r23,4            };;     // bits 24-31 of s1
+{ .mmi;        st1     [out3]=r24,4    // low byte of s2
+       st1     [out2]=r25,4            // bits 8-15 of s2
+       mov     pr=prsave,0x1ffff       }//;; restore the predicates saved in prsave
+{ .mmi;        st1     [out1]=r26,4    // bits 16-23 of s2
+       st1     [out0]=r27,4            // bits 24-31 of s2
+       mov     ar.pfs=r2               };;     // restore the ar.pfs value saved by alloc
+{ .mmi;        st1     [out3]=r28      // low byte of s3
+       st1     [out2]=r29              // bits 8-15 of s3
        mov     ar.lc=r3                }//;; restore the caller's ar.lc
-{ .mmb;        st1     [r41]=te12
-       st1     [r40]=te03
+{ .mmb;        st1     [out1]=r30      // bits 16-23 of s3
+       st1     [out0]=r31              // bits 24-31 of s3
        br.ret.sptk.many        b0      };;     // return to the caller
 .endp  AES_decrypt#
 
@@ -863,71 +866,266 @@ AES_Te:  data4   0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d
        data4   0x4141c382, 0x9999b029, 0x2d2d775a, 0x0f0f111e
        data4   0xb0b0cb7b, 0x5454fca8, 0xbbbbd66d, 0x16163a2c
 // Te4:
-       data4   0x63636363, 0x7c7c7c7c, 0x77777777, 0x7b7b7b7b
-       data4   0xf2f2f2f2, 0x6b6b6b6b, 0x6f6f6f6f, 0xc5c5c5c5
-       data4   0x30303030, 0x01010101, 0x67676767, 0x2b2b2b2b
-       data4   0xfefefefe, 0xd7d7d7d7, 0xabababab, 0x76767676
-       data4   0xcacacaca, 0x82828282, 0xc9c9c9c9, 0x7d7d7d7d
-       data4   0xfafafafa, 0x59595959, 0x47474747, 0xf0f0f0f0
-       data4   0xadadadad, 0xd4d4d4d4, 0xa2a2a2a2, 0xafafafaf
-       data4   0x9c9c9c9c, 0xa4a4a4a4, 0x72727272, 0xc0c0c0c0
-       data4   0xb7b7b7b7, 0xfdfdfdfd, 0x93939393, 0x26262626
-       data4   0x36363636, 0x3f3f3f3f, 0xf7f7f7f7, 0xcccccccc
-       data4   0x34343434, 0xa5a5a5a5, 0xe5e5e5e5, 0xf1f1f1f1
-       data4   0x71717171, 0xd8d8d8d8, 0x31313131, 0x15151515
-       data4   0x04040404, 0xc7c7c7c7, 0x23232323, 0xc3c3c3c3
-       data4   0x18181818, 0x96969696, 0x05050505, 0x9a9a9a9a
-       data4   0x07070707, 0x12121212, 0x80808080, 0xe2e2e2e2
-       data4   0xebebebeb, 0x27272727, 0xb2b2b2b2, 0x75757575
-       data4   0x09090909, 0x83838383, 0x2c2c2c2c, 0x1a1a1a1a
-       data4   0x1b1b1b1b, 0x6e6e6e6e, 0x5a5a5a5a, 0xa0a0a0a0
-       data4   0x52525252, 0x3b3b3b3b, 0xd6d6d6d6, 0xb3b3b3b3
-       data4   0x29292929, 0xe3e3e3e3, 0x2f2f2f2f, 0x84848484
-       data4   0x53535353, 0xd1d1d1d1, 0x00000000, 0xedededed
-       data4   0x20202020, 0xfcfcfcfc, 0xb1b1b1b1, 0x5b5b5b5b
-       data4   0x6a6a6a6a, 0xcbcbcbcb, 0xbebebebe, 0x39393939
-       data4   0x4a4a4a4a, 0x4c4c4c4c, 0x58585858, 0xcfcfcfcf
-       data4   0xd0d0d0d0, 0xefefefef, 0xaaaaaaaa, 0xfbfbfbfb
-       data4   0x43434343, 0x4d4d4d4d, 0x33333333, 0x85858585
-       data4   0x45454545, 0xf9f9f9f9, 0x02020202, 0x7f7f7f7f
-       data4   0x50505050, 0x3c3c3c3c, 0x9f9f9f9f, 0xa8a8a8a8
-       data4   0x51515151, 0xa3a3a3a3, 0x40404040, 0x8f8f8f8f
-       data4   0x92929292, 0x9d9d9d9d, 0x38383838, 0xf5f5f5f5
-       data4   0xbcbcbcbc, 0xb6b6b6b6, 0xdadadada, 0x21212121
-       data4   0x10101010, 0xffffffff, 0xf3f3f3f3, 0xd2d2d2d2
-       data4   0xcdcdcdcd, 0x0c0c0c0c, 0x13131313, 0xecececec
-       data4   0x5f5f5f5f, 0x97979797, 0x44444444, 0x17171717
-       data4   0xc4c4c4c4, 0xa7a7a7a7, 0x7e7e7e7e, 0x3d3d3d3d
-       data4   0x64646464, 0x5d5d5d5d, 0x19191919, 0x73737373
-       data4   0x60606060, 0x81818181, 0x4f4f4f4f, 0xdcdcdcdc
-       data4   0x22222222, 0x2a2a2a2a, 0x90909090, 0x88888888
-       data4   0x46464646, 0xeeeeeeee, 0xb8b8b8b8, 0x14141414
-       data4   0xdededede, 0x5e5e5e5e, 0x0b0b0b0b, 0xdbdbdbdb
-       data4   0xe0e0e0e0, 0x32323232, 0x3a3a3a3a, 0x0a0a0a0a
-       data4   0x49494949, 0x06060606, 0x24242424, 0x5c5c5c5c
-       data4   0xc2c2c2c2, 0xd3d3d3d3, 0xacacacac, 0x62626262
-       data4   0x91919191, 0x95959595, 0xe4e4e4e4, 0x79797979
-       data4   0xe7e7e7e7, 0xc8c8c8c8, 0x37373737, 0x6d6d6d6d
-       data4   0x8d8d8d8d, 0xd5d5d5d5, 0x4e4e4e4e, 0xa9a9a9a9
-       data4   0x6c6c6c6c, 0x56565656, 0xf4f4f4f4, 0xeaeaeaea
-       data4   0x65656565, 0x7a7a7a7a, 0xaeaeaeae, 0x08080808
-       data4   0xbabababa, 0x78787878, 0x25252525, 0x2e2e2e2e
-       data4   0x1c1c1c1c, 0xa6a6a6a6, 0xb4b4b4b4, 0xc6c6c6c6
-       data4   0xe8e8e8e8, 0xdddddddd, 0x74747474, 0x1f1f1f1f
-       data4   0x4b4b4b4b, 0xbdbdbdbd, 0x8b8b8b8b, 0x8a8a8a8a
-       data4   0x70707070, 0x3e3e3e3e, 0xb5b5b5b5, 0x66666666
-       data4   0x48484848, 0x03030303, 0xf6f6f6f6, 0x0e0e0e0e
-       data4   0x61616161, 0x35353535, 0x57575757, 0xb9b9b9b9
-       data4   0x86868686, 0xc1c1c1c1, 0x1d1d1d1d, 0x9e9e9e9e
-       data4   0xe1e1e1e1, 0xf8f8f8f8, 0x98989898, 0x11111111
-       data4   0x69696969, 0xd9d9d9d9, 0x8e8e8e8e, 0x94949494
-       data4   0x9b9b9b9b, 0x1e1e1e1e, 0x87878787, 0xe9e9e9e9
-       data4   0xcececece, 0x55555555, 0x28282828, 0xdfdfdfdf
-       data4   0x8c8c8c8c, 0xa1a1a1a1, 0x89898989, 0x0d0d0d0d
-       data4   0xbfbfbfbf, 0xe6e6e6e6, 0x42424242, 0x68686868
-       data4   0x41414141, 0x99999999, 0x2d2d2d2d, 0x0f0f0f0f
-       data4   0xb0b0b0b0, 0x54545454, 0xbbbbbbbb, 0x16161616
-.size  AES_Te#,5*256*4 // HP-UX assembler fails to ".-AES_Te#"
+       data4   0x63000000, 0x7c000000, 0x77000000, 0x7b000000
+       data4   0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000
+       data4   0x30000000, 0x01000000, 0x67000000, 0x2b000000
+       data4   0xfe000000, 0xd7000000, 0xab000000, 0x76000000
+       data4   0xca000000, 0x82000000, 0xc9000000, 0x7d000000
+       data4   0xfa000000, 0x59000000, 0x47000000, 0xf0000000
+       data4   0xad000000, 0xd4000000, 0xa2000000, 0xaf000000
+       data4   0x9c000000, 0xa4000000, 0x72000000, 0xc0000000
+       data4   0xb7000000, 0xfd000000, 0x93000000, 0x26000000
+       data4   0x36000000, 0x3f000000, 0xf7000000, 0xcc000000
+       data4   0x34000000, 0xa5000000, 0xe5000000, 0xf1000000
+       data4   0x71000000, 0xd8000000, 0x31000000, 0x15000000
+       data4   0x04000000, 0xc7000000, 0x23000000, 0xc3000000
+       data4   0x18000000, 0x96000000, 0x05000000, 0x9a000000
+       data4   0x07000000, 0x12000000, 0x80000000, 0xe2000000
+       data4   0xeb000000, 0x27000000, 0xb2000000, 0x75000000
+       data4   0x09000000, 0x83000000, 0x2c000000, 0x1a000000
+       data4   0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000
+       data4   0x52000000, 0x3b000000, 0xd6000000, 0xb3000000
+       data4   0x29000000, 0xe3000000, 0x2f000000, 0x84000000
+       data4   0x53000000, 0xd1000000, 0x00000000, 0xed000000
+       data4   0x20000000, 0xfc000000, 0xb1000000, 0x5b000000
+       data4   0x6a000000, 0xcb000000, 0xbe000000, 0x39000000
+       data4   0x4a000000, 0x4c000000, 0x58000000, 0xcf000000
+       data4   0xd0000000, 0xef000000, 0xaa000000, 0xfb000000
+       data4   0x43000000, 0x4d000000, 0x33000000, 0x85000000
+       data4   0x45000000, 0xf9000000, 0x02000000, 0x7f000000
+       data4   0x50000000, 0x3c000000, 0x9f000000, 0xa8000000
+       data4   0x51000000, 0xa3000000, 0x40000000, 0x8f000000
+       data4   0x92000000, 0x9d000000, 0x38000000, 0xf5000000
+       data4   0xbc000000, 0xb6000000, 0xda000000, 0x21000000
+       data4   0x10000000, 0xff000000, 0xf3000000, 0xd2000000
+       data4   0xcd000000, 0x0c000000, 0x13000000, 0xec000000
+       data4   0x5f000000, 0x97000000, 0x44000000, 0x17000000
+       data4   0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000
+       data4   0x64000000, 0x5d000000, 0x19000000, 0x73000000
+       data4   0x60000000, 0x81000000, 0x4f000000, 0xdc000000
+       data4   0x22000000, 0x2a000000, 0x90000000, 0x88000000
+       data4   0x46000000, 0xee000000, 0xb8000000, 0x14000000
+       data4   0xde000000, 0x5e000000, 0x0b000000, 0xdb000000
+       data4   0xe0000000, 0x32000000, 0x3a000000, 0x0a000000
+       data4   0x49000000, 0x06000000, 0x24000000, 0x5c000000
+       data4   0xc2000000, 0xd3000000, 0xac000000, 0x62000000
+       data4   0x91000000, 0x95000000, 0xe4000000, 0x79000000
+       data4   0xe7000000, 0xc8000000, 0x37000000, 0x6d000000
+       data4   0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000
+       data4   0x6c000000, 0x56000000, 0xf4000000, 0xea000000
+       data4   0x65000000, 0x7a000000, 0xae000000, 0x08000000
+       data4   0xba000000, 0x78000000, 0x25000000, 0x2e000000
+       data4   0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000
+       data4   0xe8000000, 0xdd000000, 0x74000000, 0x1f000000
+       data4   0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000
+       data4   0x70000000, 0x3e000000, 0xb5000000, 0x66000000
+       data4   0x48000000, 0x03000000, 0xf6000000, 0x0e000000
+       data4   0x61000000, 0x35000000, 0x57000000, 0xb9000000
+       data4   0x86000000, 0xc1000000, 0x1d000000, 0x9e000000
+       data4   0xe1000000, 0xf8000000, 0x98000000, 0x11000000
+       data4   0x69000000, 0xd9000000, 0x8e000000, 0x94000000
+       data4   0x9b000000, 0x1e000000, 0x87000000, 0xe9000000
+       data4   0xce000000, 0x55000000, 0x28000000, 0xdf000000
+       data4   0x8c000000, 0xa1000000, 0x89000000, 0x0d000000
+       data4   0xbf000000, 0xe6000000, 0x42000000, 0x68000000
+       data4   0x41000000, 0x99000000, 0x2d000000, 0x0f000000
+       data4   0xb0000000, 0x54000000, 0xbb000000, 0x16000000
+// Te5:
+       data4   0x00630000, 0x007c0000, 0x00770000, 0x007b0000
+       data4   0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000
+       data4   0x00300000, 0x00010000, 0x00670000, 0x002b0000
+       data4   0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000
+       data4   0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000
+       data4   0x00fa0000, 0x00590000, 0x00470000, 0x00f00000
+       data4   0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000
+       data4   0x009c0000, 0x00a40000, 0x00720000, 0x00c00000
+       data4   0x00b70000, 0x00fd0000, 0x00930000, 0x00260000
+       data4   0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000
+       data4   0x00340000, 0x00a50000, 0x00e50000, 0x00f10000
+       data4   0x00710000, 0x00d80000, 0x00310000, 0x00150000
+       data4   0x00040000, 0x00c70000, 0x00230000, 0x00c30000
+       data4   0x00180000, 0x00960000, 0x00050000, 0x009a0000
+       data4   0x00070000, 0x00120000, 0x00800000, 0x00e20000
+       data4   0x00eb0000, 0x00270000, 0x00b20000, 0x00750000
+       data4   0x00090000, 0x00830000, 0x002c0000, 0x001a0000
+       data4   0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000
+       data4   0x00520000, 0x003b0000, 0x00d60000, 0x00b30000
+       data4   0x00290000, 0x00e30000, 0x002f0000, 0x00840000
+       data4   0x00530000, 0x00d10000, 0x00000000, 0x00ed0000
+       data4   0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000
+       data4   0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000
+       data4   0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000
+       data4   0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000
+       data4   0x00430000, 0x004d0000, 0x00330000, 0x00850000
+       data4   0x00450000, 0x00f90000, 0x00020000, 0x007f0000
+       data4   0x00500000, 0x003c0000, 0x009f0000, 0x00a80000
+       data4   0x00510000, 0x00a30000, 0x00400000, 0x008f0000
+       data4   0x00920000, 0x009d0000, 0x00380000, 0x00f50000
+       data4   0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000
+       data4   0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000
+       data4   0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000
+       data4   0x005f0000, 0x00970000, 0x00440000, 0x00170000
+       data4   0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000
+       data4   0x00640000, 0x005d0000, 0x00190000, 0x00730000
+       data4   0x00600000, 0x00810000, 0x004f0000, 0x00dc0000
+       data4   0x00220000, 0x002a0000, 0x00900000, 0x00880000
+       data4   0x00460000, 0x00ee0000, 0x00b80000, 0x00140000
+       data4   0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000
+       data4   0x00e00000, 0x00320000, 0x003a0000, 0x000a0000
+       data4   0x00490000, 0x00060000, 0x00240000, 0x005c0000
+       data4   0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000
+       data4   0x00910000, 0x00950000, 0x00e40000, 0x00790000
+       data4   0x00e70000, 0x00c80000, 0x00370000, 0x006d0000
+       data4   0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000
+       data4   0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000
+       data4   0x00650000, 0x007a0000, 0x00ae0000, 0x00080000
+       data4   0x00ba0000, 0x00780000, 0x00250000, 0x002e0000
+       data4   0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000
+       data4   0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000
+       data4   0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000
+       data4   0x00700000, 0x003e0000, 0x00b50000, 0x00660000
+       data4   0x00480000, 0x00030000, 0x00f60000, 0x000e0000
+       data4   0x00610000, 0x00350000, 0x00570000, 0x00b90000
+       data4   0x00860000, 0x00c10000, 0x001d0000, 0x009e0000
+       data4   0x00e10000, 0x00f80000, 0x00980000, 0x00110000
+       data4   0x00690000, 0x00d90000, 0x008e0000, 0x00940000
+       data4   0x009b0000, 0x001e0000, 0x00870000, 0x00e90000
+       data4   0x00ce0000, 0x00550000, 0x00280000, 0x00df0000
+       data4   0x008c0000, 0x00a10000, 0x00890000, 0x000d0000
+       data4   0x00bf0000, 0x00e60000, 0x00420000, 0x00680000
+       data4   0x00410000, 0x00990000, 0x002d0000, 0x000f0000
+       data4   0x00b00000, 0x00540000, 0x00bb0000, 0x00160000
+// Te6:
+       data4   0x00006300, 0x00007c00, 0x00007700, 0x00007b00
+       data4   0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500
+       data4   0x00003000, 0x00000100, 0x00006700, 0x00002b00
+       data4   0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600
+       data4   0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00
+       data4   0x0000fa00, 0x00005900, 0x00004700, 0x0000f000
+       data4   0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00
+       data4   0x00009c00, 0x0000a400, 0x00007200, 0x0000c000
+       data4   0x0000b700, 0x0000fd00, 0x00009300, 0x00002600
+       data4   0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00
+       data4   0x00003400, 0x0000a500, 0x0000e500, 0x0000f100
+       data4   0x00007100, 0x0000d800, 0x00003100, 0x00001500
+       data4   0x00000400, 0x0000c700, 0x00002300, 0x0000c300
+       data4   0x00001800, 0x00009600, 0x00000500, 0x00009a00
+       data4   0x00000700, 0x00001200, 0x00008000, 0x0000e200
+       data4   0x0000eb00, 0x00002700, 0x0000b200, 0x00007500
+       data4   0x00000900, 0x00008300, 0x00002c00, 0x00001a00
+       data4   0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000
+       data4   0x00005200, 0x00003b00, 0x0000d600, 0x0000b300
+       data4   0x00002900, 0x0000e300, 0x00002f00, 0x00008400
+       data4   0x00005300, 0x0000d100, 0x00000000, 0x0000ed00
+       data4   0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00
+       data4   0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900
+       data4   0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00
+       data4   0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00
+       data4   0x00004300, 0x00004d00, 0x00003300, 0x00008500
+       data4   0x00004500, 0x0000f900, 0x00000200, 0x00007f00
+       data4   0x00005000, 0x00003c00, 0x00009f00, 0x0000a800
+       data4   0x00005100, 0x0000a300, 0x00004000, 0x00008f00
+       data4   0x00009200, 0x00009d00, 0x00003800, 0x0000f500
+       data4   0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100
+       data4   0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200
+       data4   0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00
+       data4   0x00005f00, 0x00009700, 0x00004400, 0x00001700
+       data4   0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00
+       data4   0x00006400, 0x00005d00, 0x00001900, 0x00007300
+       data4   0x00006000, 0x00008100, 0x00004f00, 0x0000dc00
+       data4   0x00002200, 0x00002a00, 0x00009000, 0x00008800
+       data4   0x00004600, 0x0000ee00, 0x0000b800, 0x00001400
+       data4   0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00
+       data4   0x0000e000, 0x00003200, 0x00003a00, 0x00000a00
+       data4   0x00004900, 0x00000600, 0x00002400, 0x00005c00
+       data4   0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200
+       data4   0x00009100, 0x00009500, 0x0000e400, 0x00007900
+       data4   0x0000e700, 0x0000c800, 0x00003700, 0x00006d00
+       data4   0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900
+       data4   0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00
+       data4   0x00006500, 0x00007a00, 0x0000ae00, 0x00000800
+       data4   0x0000ba00, 0x00007800, 0x00002500, 0x00002e00
+       data4   0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600
+       data4   0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00
+       data4   0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00
+       data4   0x00007000, 0x00003e00, 0x0000b500, 0x00006600
+       data4   0x00004800, 0x00000300, 0x0000f600, 0x00000e00
+       data4   0x00006100, 0x00003500, 0x00005700, 0x0000b900
+       data4   0x00008600, 0x0000c100, 0x00001d00, 0x00009e00
+       data4   0x0000e100, 0x0000f800, 0x00009800, 0x00001100
+       data4   0x00006900, 0x0000d900, 0x00008e00, 0x00009400
+       data4   0x00009b00, 0x00001e00, 0x00008700, 0x0000e900
+       data4   0x0000ce00, 0x00005500, 0x00002800, 0x0000df00
+       data4   0x00008c00, 0x0000a100, 0x00008900, 0x00000d00
+       data4   0x0000bf00, 0x0000e600, 0x00004200, 0x00006800
+       data4   0x00004100, 0x00009900, 0x00002d00, 0x00000f00
+       data4   0x0000b000, 0x00005400, 0x0000bb00, 0x00001600
+// Te7:
+       data4   0x00000063, 0x0000007c, 0x00000077, 0x0000007b
+       data4   0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5
+       data4   0x00000030, 0x00000001, 0x00000067, 0x0000002b
+       data4   0x000000fe, 0x000000d7, 0x000000ab, 0x00000076
+       data4   0x000000ca, 0x00000082, 0x000000c9, 0x0000007d
+       data4   0x000000fa, 0x00000059, 0x00000047, 0x000000f0
+       data4   0x000000ad, 0x000000d4, 0x000000a2, 0x000000af
+       data4   0x0000009c, 0x000000a4, 0x00000072, 0x000000c0
+       data4   0x000000b7, 0x000000fd, 0x00000093, 0x00000026
+       data4   0x00000036, 0x0000003f, 0x000000f7, 0x000000cc
+       data4   0x00000034, 0x000000a5, 0x000000e5, 0x000000f1
+       data4   0x00000071, 0x000000d8, 0x00000031, 0x00000015
+       data4   0x00000004, 0x000000c7, 0x00000023, 0x000000c3
+       data4   0x00000018, 0x00000096, 0x00000005, 0x0000009a
+       data4   0x00000007, 0x00000012, 0x00000080, 0x000000e2
+       data4   0x000000eb, 0x00000027, 0x000000b2, 0x00000075
+       data4   0x00000009, 0x00000083, 0x0000002c, 0x0000001a
+       data4   0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0
+       data4   0x00000052, 0x0000003b, 0x000000d6, 0x000000b3
+       data4   0x00000029, 0x000000e3, 0x0000002f, 0x00000084
+       data4   0x00000053, 0x000000d1, 0x00000000, 0x000000ed
+       data4   0x00000020, 0x000000fc, 0x000000b1, 0x0000005b
+       data4   0x0000006a, 0x000000cb, 0x000000be, 0x00000039
+       data4   0x0000004a, 0x0000004c, 0x00000058, 0x000000cf
+       data4   0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb
+       data4   0x00000043, 0x0000004d, 0x00000033, 0x00000085
+       data4   0x00000045, 0x000000f9, 0x00000002, 0x0000007f
+       data4   0x00000050, 0x0000003c, 0x0000009f, 0x000000a8
+       data4   0x00000051, 0x000000a3, 0x00000040, 0x0000008f
+       data4   0x00000092, 0x0000009d, 0x00000038, 0x000000f5
+       data4   0x000000bc, 0x000000b6, 0x000000da, 0x00000021
+       data4   0x00000010, 0x000000ff, 0x000000f3, 0x000000d2
+       data4   0x000000cd, 0x0000000c, 0x00000013, 0x000000ec
+       data4   0x0000005f, 0x00000097, 0x00000044, 0x00000017
+       data4   0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d
+       data4   0x00000064, 0x0000005d, 0x00000019, 0x00000073
+       data4   0x00000060, 0x00000081, 0x0000004f, 0x000000dc
+       data4   0x00000022, 0x0000002a, 0x00000090, 0x00000088
+       data4   0x00000046, 0x000000ee, 0x000000b8, 0x00000014
+       data4   0x000000de, 0x0000005e, 0x0000000b, 0x000000db
+       data4   0x000000e0, 0x00000032, 0x0000003a, 0x0000000a
+       data4   0x00000049, 0x00000006, 0x00000024, 0x0000005c
+       data4   0x000000c2, 0x000000d3, 0x000000ac, 0x00000062
+       data4   0x00000091, 0x00000095, 0x000000e4, 0x00000079
+       data4   0x000000e7, 0x000000c8, 0x00000037, 0x0000006d
+       data4   0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9
+       data4   0x0000006c, 0x00000056, 0x000000f4, 0x000000ea
+       data4   0x00000065, 0x0000007a, 0x000000ae, 0x00000008
+       data4   0x000000ba, 0x00000078, 0x00000025, 0x0000002e
+       data4   0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6
+       data4   0x000000e8, 0x000000dd, 0x00000074, 0x0000001f
+       data4   0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a
+       data4   0x00000070, 0x0000003e, 0x000000b5, 0x00000066
+       data4   0x00000048, 0x00000003, 0x000000f6, 0x0000000e
+       data4   0x00000061, 0x00000035, 0x00000057, 0x000000b9
+       data4   0x00000086, 0x000000c1, 0x0000001d, 0x0000009e
+       data4   0x000000e1, 0x000000f8, 0x00000098, 0x00000011
+       data4   0x00000069, 0x000000d9, 0x0000008e, 0x00000094
+       data4   0x0000009b, 0x0000001e, 0x00000087, 0x000000e9
+       data4   0x000000ce, 0x00000055, 0x00000028, 0x000000df
+       data4   0x0000008c, 0x000000a1, 0x00000089, 0x0000000d
+       data4   0x000000bf, 0x000000e6, 0x00000042, 0x00000068
+       data4   0x00000041, 0x00000099, 0x0000002d, 0x0000000f
+       data4   0x000000b0, 0x00000054, 0x000000bb, 0x00000016
+.size  AES_Te#,8*256*4 // HP-UX assembler fails to ".-AES_Te#"
 
 .align 64
 .global        AES_Td#
@@ -1192,68 +1390,263 @@ AES_Td:        data4   0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96
        data4   0xa8017139, 0x0cb3de08, 0xb4e49cd8, 0x56c19064
        data4   0xcb84617b, 0x32b670d5, 0x6c5c7448, 0xb85742d0
 // Td4:
-       data4   0x52525252, 0x09090909, 0x6a6a6a6a, 0xd5d5d5d5
-       data4   0x30303030, 0x36363636, 0xa5a5a5a5, 0x38383838
-       data4   0xbfbfbfbf, 0x40404040, 0xa3a3a3a3, 0x9e9e9e9e
-       data4   0x81818181, 0xf3f3f3f3, 0xd7d7d7d7, 0xfbfbfbfb
-       data4   0x7c7c7c7c, 0xe3e3e3e3, 0x39393939, 0x82828282
-       data4   0x9b9b9b9b, 0x2f2f2f2f, 0xffffffff, 0x87878787
-       data4   0x34343434, 0x8e8e8e8e, 0x43434343, 0x44444444
-       data4   0xc4c4c4c4, 0xdededede, 0xe9e9e9e9, 0xcbcbcbcb
-       data4   0x54545454, 0x7b7b7b7b, 0x94949494, 0x32323232
-       data4   0xa6a6a6a6, 0xc2c2c2c2, 0x23232323, 0x3d3d3d3d
-       data4   0xeeeeeeee, 0x4c4c4c4c, 0x95959595, 0x0b0b0b0b
-       data4   0x42424242, 0xfafafafa, 0xc3c3c3c3, 0x4e4e4e4e
-       data4   0x08080808, 0x2e2e2e2e, 0xa1a1a1a1, 0x66666666
-       data4   0x28282828, 0xd9d9d9d9, 0x24242424, 0xb2b2b2b2
-       data4   0x76767676, 0x5b5b5b5b, 0xa2a2a2a2, 0x49494949
-       data4   0x6d6d6d6d, 0x8b8b8b8b, 0xd1d1d1d1, 0x25252525
-       data4   0x72727272, 0xf8f8f8f8, 0xf6f6f6f6, 0x64646464
-       data4   0x86868686, 0x68686868, 0x98989898, 0x16161616
-       data4   0xd4d4d4d4, 0xa4a4a4a4, 0x5c5c5c5c, 0xcccccccc
-       data4   0x5d5d5d5d, 0x65656565, 0xb6b6b6b6, 0x92929292
-       data4   0x6c6c6c6c, 0x70707070, 0x48484848, 0x50505050
-       data4   0xfdfdfdfd, 0xedededed, 0xb9b9b9b9, 0xdadadada
-       data4   0x5e5e5e5e, 0x15151515, 0x46464646, 0x57575757
-       data4   0xa7a7a7a7, 0x8d8d8d8d, 0x9d9d9d9d, 0x84848484
-       data4   0x90909090, 0xd8d8d8d8, 0xabababab, 0x00000000
-       data4   0x8c8c8c8c, 0xbcbcbcbc, 0xd3d3d3d3, 0x0a0a0a0a
-       data4   0xf7f7f7f7, 0xe4e4e4e4, 0x58585858, 0x05050505
-       data4   0xb8b8b8b8, 0xb3b3b3b3, 0x45454545, 0x06060606
-       data4   0xd0d0d0d0, 0x2c2c2c2c, 0x1e1e1e1e, 0x8f8f8f8f
-       data4   0xcacacaca, 0x3f3f3f3f, 0x0f0f0f0f, 0x02020202
-       data4   0xc1c1c1c1, 0xafafafaf, 0xbdbdbdbd, 0x03030303
-       data4   0x01010101, 0x13131313, 0x8a8a8a8a, 0x6b6b6b6b
-       data4   0x3a3a3a3a, 0x91919191, 0x11111111, 0x41414141
-       data4   0x4f4f4f4f, 0x67676767, 0xdcdcdcdc, 0xeaeaeaea
-       data4   0x97979797, 0xf2f2f2f2, 0xcfcfcfcf, 0xcececece
-       data4   0xf0f0f0f0, 0xb4b4b4b4, 0xe6e6e6e6, 0x73737373
-       data4   0x96969696, 0xacacacac, 0x74747474, 0x22222222
-       data4   0xe7e7e7e7, 0xadadadad, 0x35353535, 0x85858585
-       data4   0xe2e2e2e2, 0xf9f9f9f9, 0x37373737, 0xe8e8e8e8
-       data4   0x1c1c1c1c, 0x75757575, 0xdfdfdfdf, 0x6e6e6e6e
-       data4   0x47474747, 0xf1f1f1f1, 0x1a1a1a1a, 0x71717171
-       data4   0x1d1d1d1d, 0x29292929, 0xc5c5c5c5, 0x89898989
-       data4   0x6f6f6f6f, 0xb7b7b7b7, 0x62626262, 0x0e0e0e0e
-       data4   0xaaaaaaaa, 0x18181818, 0xbebebebe, 0x1b1b1b1b
-       data4   0xfcfcfcfc, 0x56565656, 0x3e3e3e3e, 0x4b4b4b4b
-       data4   0xc6c6c6c6, 0xd2d2d2d2, 0x79797979, 0x20202020
-       data4   0x9a9a9a9a, 0xdbdbdbdb, 0xc0c0c0c0, 0xfefefefe
-       data4   0x78787878, 0xcdcdcdcd, 0x5a5a5a5a, 0xf4f4f4f4
-       data4   0x1f1f1f1f, 0xdddddddd, 0xa8a8a8a8, 0x33333333
-       data4   0x88888888, 0x07070707, 0xc7c7c7c7, 0x31313131
-       data4   0xb1b1b1b1, 0x12121212, 0x10101010, 0x59595959
-       data4   0x27272727, 0x80808080, 0xecececec, 0x5f5f5f5f
-       data4   0x60606060, 0x51515151, 0x7f7f7f7f, 0xa9a9a9a9
-       data4   0x19191919, 0xb5b5b5b5, 0x4a4a4a4a, 0x0d0d0d0d
-       data4   0x2d2d2d2d, 0xe5e5e5e5, 0x7a7a7a7a, 0x9f9f9f9f
-       data4   0x93939393, 0xc9c9c9c9, 0x9c9c9c9c, 0xefefefef
-       data4   0xa0a0a0a0, 0xe0e0e0e0, 0x3b3b3b3b, 0x4d4d4d4d
-       data4   0xaeaeaeae, 0x2a2a2a2a, 0xf5f5f5f5, 0xb0b0b0b0
-       data4   0xc8c8c8c8, 0xebebebeb, 0xbbbbbbbb, 0x3c3c3c3c
-       data4   0x83838383, 0x53535353, 0x99999999, 0x61616161
-       data4   0x17171717, 0x2b2b2b2b, 0x04040404, 0x7e7e7e7e
-       data4   0xbabababa, 0x77777777, 0xd6d6d6d6, 0x26262626
-       data4   0xe1e1e1e1, 0x69696969, 0x14141414, 0x63636363
-       data4   0x55555555, 0x21212121, 0x0c0c0c0c, 0x7d7d7d7d
-.size  AES_Td#,5*256*4 // HP-UX assembler fails to ".-AES_Td#"
+       data4   0x52000000, 0x09000000, 0x6a000000, 0xd5000000
+       data4   0x30000000, 0x36000000, 0xa5000000, 0x38000000
+       data4   0xbf000000, 0x40000000, 0xa3000000, 0x9e000000
+       data4   0x81000000, 0xf3000000, 0xd7000000, 0xfb000000
+       data4   0x7c000000, 0xe3000000, 0x39000000, 0x82000000
+       data4   0x9b000000, 0x2f000000, 0xff000000, 0x87000000
+       data4   0x34000000, 0x8e000000, 0x43000000, 0x44000000
+       data4   0xc4000000, 0xde000000, 0xe9000000, 0xcb000000
+       data4   0x54000000, 0x7b000000, 0x94000000, 0x32000000
+       data4   0xa6000000, 0xc2000000, 0x23000000, 0x3d000000
+       data4   0xee000000, 0x4c000000, 0x95000000, 0x0b000000
+       data4   0x42000000, 0xfa000000, 0xc3000000, 0x4e000000
+       data4   0x08000000, 0x2e000000, 0xa1000000, 0x66000000
+       data4   0x28000000, 0xd9000000, 0x24000000, 0xb2000000
+       data4   0x76000000, 0x5b000000, 0xa2000000, 0x49000000
+       data4   0x6d000000, 0x8b000000, 0xd1000000, 0x25000000
+       data4   0x72000000, 0xf8000000, 0xf6000000, 0x64000000
+       data4   0x86000000, 0x68000000, 0x98000000, 0x16000000
+       data4   0xd4000000, 0xa4000000, 0x5c000000, 0xcc000000
+       data4   0x5d000000, 0x65000000, 0xb6000000, 0x92000000
+       data4   0x6c000000, 0x70000000, 0x48000000, 0x50000000
+       data4   0xfd000000, 0xed000000, 0xb9000000, 0xda000000
+       data4   0x5e000000, 0x15000000, 0x46000000, 0x57000000
+       data4   0xa7000000, 0x8d000000, 0x9d000000, 0x84000000
+       data4   0x90000000, 0xd8000000, 0xab000000, 0x00000000
+       data4   0x8c000000, 0xbc000000, 0xd3000000, 0x0a000000
+       data4   0xf7000000, 0xe4000000, 0x58000000, 0x05000000
+       data4   0xb8000000, 0xb3000000, 0x45000000, 0x06000000
+       data4   0xd0000000, 0x2c000000, 0x1e000000, 0x8f000000
+       data4   0xca000000, 0x3f000000, 0x0f000000, 0x02000000
+       data4   0xc1000000, 0xaf000000, 0xbd000000, 0x03000000
+       data4   0x01000000, 0x13000000, 0x8a000000, 0x6b000000
+       data4   0x3a000000, 0x91000000, 0x11000000, 0x41000000
+       data4   0x4f000000, 0x67000000, 0xdc000000, 0xea000000
+       data4   0x97000000, 0xf2000000, 0xcf000000, 0xce000000
+       data4   0xf0000000, 0xb4000000, 0xe6000000, 0x73000000
+       data4   0x96000000, 0xac000000, 0x74000000, 0x22000000
+       data4   0xe7000000, 0xad000000, 0x35000000, 0x85000000
+       data4   0xe2000000, 0xf9000000, 0x37000000, 0xe8000000
+       data4   0x1c000000, 0x75000000, 0xdf000000, 0x6e000000
+       data4   0x47000000, 0xf1000000, 0x1a000000, 0x71000000
+       data4   0x1d000000, 0x29000000, 0xc5000000, 0x89000000
+       data4   0x6f000000, 0xb7000000, 0x62000000, 0x0e000000
+       data4   0xaa000000, 0x18000000, 0xbe000000, 0x1b000000
+       data4   0xfc000000, 0x56000000, 0x3e000000, 0x4b000000
+       data4   0xc6000000, 0xd2000000, 0x79000000, 0x20000000
+       data4   0x9a000000, 0xdb000000, 0xc0000000, 0xfe000000
+       data4   0x78000000, 0xcd000000, 0x5a000000, 0xf4000000
+       data4   0x1f000000, 0xdd000000, 0xa8000000, 0x33000000
+       data4   0x88000000, 0x07000000, 0xc7000000, 0x31000000
+       data4   0xb1000000, 0x12000000, 0x10000000, 0x59000000
+       data4   0x27000000, 0x80000000, 0xec000000, 0x5f000000
+       data4   0x60000000, 0x51000000, 0x7f000000, 0xa9000000
+       data4   0x19000000, 0xb5000000, 0x4a000000, 0x0d000000
+       data4   0x2d000000, 0xe5000000, 0x7a000000, 0x9f000000
+       data4   0x93000000, 0xc9000000, 0x9c000000, 0xef000000
+       data4   0xa0000000, 0xe0000000, 0x3b000000, 0x4d000000
+       data4   0xae000000, 0x2a000000, 0xf5000000, 0xb0000000
+       data4   0xc8000000, 0xeb000000, 0xbb000000, 0x3c000000
+       data4   0x83000000, 0x53000000, 0x99000000, 0x61000000
+       data4   0x17000000, 0x2b000000, 0x04000000, 0x7e000000
+       data4   0xba000000, 0x77000000, 0xd6000000, 0x26000000
+       data4   0xe1000000, 0x69000000, 0x14000000, 0x63000000
+       data4   0x55000000, 0x21000000, 0x0c000000, 0x7d000000
+// Td5:
+       data4   0x00520000, 0x00090000, 0x006a0000, 0x00d50000
+       data4   0x00300000, 0x00360000, 0x00a50000, 0x00380000
+       data4   0x00bf0000, 0x00400000, 0x00a30000, 0x009e0000
+       data4   0x00810000, 0x00f30000, 0x00d70000, 0x00fb0000
+       data4   0x007c0000, 0x00e30000, 0x00390000, 0x00820000
+       data4   0x009b0000, 0x002f0000, 0x00ff0000, 0x00870000
+       data4   0x00340000, 0x008e0000, 0x00430000, 0x00440000
+       data4   0x00c40000, 0x00de0000, 0x00e90000, 0x00cb0000
+       data4   0x00540000, 0x007b0000, 0x00940000, 0x00320000
+       data4   0x00a60000, 0x00c20000, 0x00230000, 0x003d0000
+       data4   0x00ee0000, 0x004c0000, 0x00950000, 0x000b0000
+       data4   0x00420000, 0x00fa0000, 0x00c30000, 0x004e0000
+       data4   0x00080000, 0x002e0000, 0x00a10000, 0x00660000
+       data4   0x00280000, 0x00d90000, 0x00240000, 0x00b20000
+       data4   0x00760000, 0x005b0000, 0x00a20000, 0x00490000
+       data4   0x006d0000, 0x008b0000, 0x00d10000, 0x00250000
+       data4   0x00720000, 0x00f80000, 0x00f60000, 0x00640000
+       data4   0x00860000, 0x00680000, 0x00980000, 0x00160000
+       data4   0x00d40000, 0x00a40000, 0x005c0000, 0x00cc0000
+       data4   0x005d0000, 0x00650000, 0x00b60000, 0x00920000
+       data4   0x006c0000, 0x00700000, 0x00480000, 0x00500000
+       data4   0x00fd0000, 0x00ed0000, 0x00b90000, 0x00da0000
+       data4   0x005e0000, 0x00150000, 0x00460000, 0x00570000
+       data4   0x00a70000, 0x008d0000, 0x009d0000, 0x00840000
+       data4   0x00900000, 0x00d80000, 0x00ab0000, 0x00000000
+       data4   0x008c0000, 0x00bc0000, 0x00d30000, 0x000a0000
+       data4   0x00f70000, 0x00e40000, 0x00580000, 0x00050000
+       data4   0x00b80000, 0x00b30000, 0x00450000, 0x00060000
+       data4   0x00d00000, 0x002c0000, 0x001e0000, 0x008f0000
+       data4   0x00ca0000, 0x003f0000, 0x000f0000, 0x00020000
+       data4   0x00c10000, 0x00af0000, 0x00bd0000, 0x00030000
+       data4   0x00010000, 0x00130000, 0x008a0000, 0x006b0000
+       data4   0x003a0000, 0x00910000, 0x00110000, 0x00410000
+       data4   0x004f0000, 0x00670000, 0x00dc0000, 0x00ea0000
+       data4   0x00970000, 0x00f20000, 0x00cf0000, 0x00ce0000
+       data4   0x00f00000, 0x00b40000, 0x00e60000, 0x00730000
+       data4   0x00960000, 0x00ac0000, 0x00740000, 0x00220000
+       data4   0x00e70000, 0x00ad0000, 0x00350000, 0x00850000
+       data4   0x00e20000, 0x00f90000, 0x00370000, 0x00e80000
+       data4   0x001c0000, 0x00750000, 0x00df0000, 0x006e0000
+       data4   0x00470000, 0x00f10000, 0x001a0000, 0x00710000
+       data4   0x001d0000, 0x00290000, 0x00c50000, 0x00890000
+       data4   0x006f0000, 0x00b70000, 0x00620000, 0x000e0000
+       data4   0x00aa0000, 0x00180000, 0x00be0000, 0x001b0000
+       data4   0x00fc0000, 0x00560000, 0x003e0000, 0x004b0000
+       data4   0x00c60000, 0x00d20000, 0x00790000, 0x00200000
+       data4   0x009a0000, 0x00db0000, 0x00c00000, 0x00fe0000
+       data4   0x00780000, 0x00cd0000, 0x005a0000, 0x00f40000
+       data4   0x001f0000, 0x00dd0000, 0x00a80000, 0x00330000
+       data4   0x00880000, 0x00070000, 0x00c70000, 0x00310000
+       data4   0x00b10000, 0x00120000, 0x00100000, 0x00590000
+       data4   0x00270000, 0x00800000, 0x00ec0000, 0x005f0000
+       data4   0x00600000, 0x00510000, 0x007f0000, 0x00a90000
+       data4   0x00190000, 0x00b50000, 0x004a0000, 0x000d0000
+       data4   0x002d0000, 0x00e50000, 0x007a0000, 0x009f0000
+       data4   0x00930000, 0x00c90000, 0x009c0000, 0x00ef0000
+       data4   0x00a00000, 0x00e00000, 0x003b0000, 0x004d0000
+       data4   0x00ae0000, 0x002a0000, 0x00f50000, 0x00b00000
+       data4   0x00c80000, 0x00eb0000, 0x00bb0000, 0x003c0000
+       data4   0x00830000, 0x00530000, 0x00990000, 0x00610000
+       data4   0x00170000, 0x002b0000, 0x00040000, 0x007e0000
+       data4   0x00ba0000, 0x00770000, 0x00d60000, 0x00260000
+       data4   0x00e10000, 0x00690000, 0x00140000, 0x00630000
+       data4   0x00550000, 0x00210000, 0x000c0000, 0x007d0000
+// Td6:
+       data4   0x00005200, 0x00000900, 0x00006a00, 0x0000d500
+       data4   0x00003000, 0x00003600, 0x0000a500, 0x00003800
+       data4   0x0000bf00, 0x00004000, 0x0000a300, 0x00009e00
+       data4   0x00008100, 0x0000f300, 0x0000d700, 0x0000fb00
+       data4   0x00007c00, 0x0000e300, 0x00003900, 0x00008200
+       data4   0x00009b00, 0x00002f00, 0x0000ff00, 0x00008700
+       data4   0x00003400, 0x00008e00, 0x00004300, 0x00004400
+       data4   0x0000c400, 0x0000de00, 0x0000e900, 0x0000cb00
+       data4   0x00005400, 0x00007b00, 0x00009400, 0x00003200
+       data4   0x0000a600, 0x0000c200, 0x00002300, 0x00003d00
+       data4   0x0000ee00, 0x00004c00, 0x00009500, 0x00000b00
+       data4   0x00004200, 0x0000fa00, 0x0000c300, 0x00004e00
+       data4   0x00000800, 0x00002e00, 0x0000a100, 0x00006600
+       data4   0x00002800, 0x0000d900, 0x00002400, 0x0000b200
+       data4   0x00007600, 0x00005b00, 0x0000a200, 0x00004900
+       data4   0x00006d00, 0x00008b00, 0x0000d100, 0x00002500
+       data4   0x00007200, 0x0000f800, 0x0000f600, 0x00006400
+       data4   0x00008600, 0x00006800, 0x00009800, 0x00001600
+       data4   0x0000d400, 0x0000a400, 0x00005c00, 0x0000cc00
+       data4   0x00005d00, 0x00006500, 0x0000b600, 0x00009200
+       data4   0x00006c00, 0x00007000, 0x00004800, 0x00005000
+       data4   0x0000fd00, 0x0000ed00, 0x0000b900, 0x0000da00
+       data4   0x00005e00, 0x00001500, 0x00004600, 0x00005700
+       data4   0x0000a700, 0x00008d00, 0x00009d00, 0x00008400
+       data4   0x00009000, 0x0000d800, 0x0000ab00, 0x00000000
+       data4   0x00008c00, 0x0000bc00, 0x0000d300, 0x00000a00
+       data4   0x0000f700, 0x0000e400, 0x00005800, 0x00000500
+       data4   0x0000b800, 0x0000b300, 0x00004500, 0x00000600
+       data4   0x0000d000, 0x00002c00, 0x00001e00, 0x00008f00
+       data4   0x0000ca00, 0x00003f00, 0x00000f00, 0x00000200
+       data4   0x0000c100, 0x0000af00, 0x0000bd00, 0x00000300
+       data4   0x00000100, 0x00001300, 0x00008a00, 0x00006b00
+       data4   0x00003a00, 0x00009100, 0x00001100, 0x00004100
+       data4   0x00004f00, 0x00006700, 0x0000dc00, 0x0000ea00
+       data4   0x00009700, 0x0000f200, 0x0000cf00, 0x0000ce00
+       data4   0x0000f000, 0x0000b400, 0x0000e600, 0x00007300
+       data4   0x00009600, 0x0000ac00, 0x00007400, 0x00002200
+       data4   0x0000e700, 0x0000ad00, 0x00003500, 0x00008500
+       data4   0x0000e200, 0x0000f900, 0x00003700, 0x0000e800
+       data4   0x00001c00, 0x00007500, 0x0000df00, 0x00006e00
+       data4   0x00004700, 0x0000f100, 0x00001a00, 0x00007100
+       data4   0x00001d00, 0x00002900, 0x0000c500, 0x00008900
+       data4   0x00006f00, 0x0000b700, 0x00006200, 0x00000e00
+       data4   0x0000aa00, 0x00001800, 0x0000be00, 0x00001b00
+       data4   0x0000fc00, 0x00005600, 0x00003e00, 0x00004b00
+       data4   0x0000c600, 0x0000d200, 0x00007900, 0x00002000
+       data4   0x00009a00, 0x0000db00, 0x0000c000, 0x0000fe00
+       data4   0x00007800, 0x0000cd00, 0x00005a00, 0x0000f400
+       data4   0x00001f00, 0x0000dd00, 0x0000a800, 0x00003300
+       data4   0x00008800, 0x00000700, 0x0000c700, 0x00003100
+       data4   0x0000b100, 0x00001200, 0x00001000, 0x00005900
+       data4   0x00002700, 0x00008000, 0x0000ec00, 0x00005f00
+       data4   0x00006000, 0x00005100, 0x00007f00, 0x0000a900
+       data4   0x00001900, 0x0000b500, 0x00004a00, 0x00000d00
+       data4   0x00002d00, 0x0000e500, 0x00007a00, 0x00009f00
+       data4   0x00009300, 0x0000c900, 0x00009c00, 0x0000ef00
+       data4   0x0000a000, 0x0000e000, 0x00003b00, 0x00004d00
+       data4   0x0000ae00, 0x00002a00, 0x0000f500, 0x0000b000
+       data4   0x0000c800, 0x0000eb00, 0x0000bb00, 0x00003c00
+       data4   0x00008300, 0x00005300, 0x00009900, 0x00006100
+       data4   0x00001700, 0x00002b00, 0x00000400, 0x00007e00
+       data4   0x0000ba00, 0x00007700, 0x0000d600, 0x00002600
+       data4   0x0000e100, 0x00006900, 0x00001400, 0x00006300
+       data4   0x00005500, 0x00002100, 0x00000c00, 0x00007d00
+// Td7:
+       data4   0x00000052, 0x00000009, 0x0000006a, 0x000000d5
+       data4   0x00000030, 0x00000036, 0x000000a5, 0x00000038
+       data4   0x000000bf, 0x00000040, 0x000000a3, 0x0000009e
+       data4   0x00000081, 0x000000f3, 0x000000d7, 0x000000fb
+       data4   0x0000007c, 0x000000e3, 0x00000039, 0x00000082
+       data4   0x0000009b, 0x0000002f, 0x000000ff, 0x00000087
+       data4   0x00000034, 0x0000008e, 0x00000043, 0x00000044
+       data4   0x000000c4, 0x000000de, 0x000000e9, 0x000000cb
+       data4   0x00000054, 0x0000007b, 0x00000094, 0x00000032
+       data4   0x000000a6, 0x000000c2, 0x00000023, 0x0000003d
+       data4   0x000000ee, 0x0000004c, 0x00000095, 0x0000000b
+       data4   0x00000042, 0x000000fa, 0x000000c3, 0x0000004e
+       data4   0x00000008, 0x0000002e, 0x000000a1, 0x00000066
+       data4   0x00000028, 0x000000d9, 0x00000024, 0x000000b2
+       data4   0x00000076, 0x0000005b, 0x000000a2, 0x00000049
+       data4   0x0000006d, 0x0000008b, 0x000000d1, 0x00000025
+       data4   0x00000072, 0x000000f8, 0x000000f6, 0x00000064
+       data4   0x00000086, 0x00000068, 0x00000098, 0x00000016
+       data4   0x000000d4, 0x000000a4, 0x0000005c, 0x000000cc
+       data4   0x0000005d, 0x00000065, 0x000000b6, 0x00000092
+       data4   0x0000006c, 0x00000070, 0x00000048, 0x00000050
+       data4   0x000000fd, 0x000000ed, 0x000000b9, 0x000000da
+       data4   0x0000005e, 0x00000015, 0x00000046, 0x00000057
+       data4   0x000000a7, 0x0000008d, 0x0000009d, 0x00000084
+       data4   0x00000090, 0x000000d8, 0x000000ab, 0x00000000
+       data4   0x0000008c, 0x000000bc, 0x000000d3, 0x0000000a
+       data4   0x000000f7, 0x000000e4, 0x00000058, 0x00000005
+       data4   0x000000b8, 0x000000b3, 0x00000045, 0x00000006
+       data4   0x000000d0, 0x0000002c, 0x0000001e, 0x0000008f
+       data4   0x000000ca, 0x0000003f, 0x0000000f, 0x00000002
+       data4   0x000000c1, 0x000000af, 0x000000bd, 0x00000003
+       data4   0x00000001, 0x00000013, 0x0000008a, 0x0000006b
+       data4   0x0000003a, 0x00000091, 0x00000011, 0x00000041
+       data4   0x0000004f, 0x00000067, 0x000000dc, 0x000000ea
+       data4   0x00000097, 0x000000f2, 0x000000cf, 0x000000ce
+       data4   0x000000f0, 0x000000b4, 0x000000e6, 0x00000073
+       data4   0x00000096, 0x000000ac, 0x00000074, 0x00000022
+       data4   0x000000e7, 0x000000ad, 0x00000035, 0x00000085
+       data4   0x000000e2, 0x000000f9, 0x00000037, 0x000000e8
+       data4   0x0000001c, 0x00000075, 0x000000df, 0x0000006e
+       data4   0x00000047, 0x000000f1, 0x0000001a, 0x00000071
+       data4   0x0000001d, 0x00000029, 0x000000c5, 0x00000089
+       data4   0x0000006f, 0x000000b7, 0x00000062, 0x0000000e
+       data4   0x000000aa, 0x00000018, 0x000000be, 0x0000001b
+       data4   0x000000fc, 0x00000056, 0x0000003e, 0x0000004b
+       data4   0x000000c6, 0x000000d2, 0x00000079, 0x00000020
+       data4   0x0000009a, 0x000000db, 0x000000c0, 0x000000fe
+       data4   0x00000078, 0x000000cd, 0x0000005a, 0x000000f4
+       data4   0x0000001f, 0x000000dd, 0x000000a8, 0x00000033
+       data4   0x00000088, 0x00000007, 0x000000c7, 0x00000031
+       data4   0x000000b1, 0x00000012, 0x00000010, 0x00000059
+       data4   0x00000027, 0x00000080, 0x000000ec, 0x0000005f
+       data4   0x00000060, 0x00000051, 0x0000007f, 0x000000a9
+       data4   0x00000019, 0x000000b5, 0x0000004a, 0x0000000d
+       data4   0x0000002d, 0x000000e5, 0x0000007a, 0x0000009f
+       data4   0x00000093, 0x000000c9, 0x0000009c, 0x000000ef
+       data4   0x000000a0, 0x000000e0, 0x0000003b, 0x0000004d
+       data4   0x000000ae, 0x0000002a, 0x000000f5, 0x000000b0
+       data4   0x000000c8, 0x000000eb, 0x000000bb, 0x0000003c
+       data4   0x00000083, 0x00000053, 0x00000099, 0x00000061
+       data4   0x00000017, 0x0000002b, 0x00000004, 0x0000007e
+       data4   0x000000ba, 0x00000077, 0x000000d6, 0x00000026
+       data4   0x000000e1, 0x00000069, 0x00000014, 0x00000063
+       data4   0x00000055, 0x00000021, 0x0000000c, 0x0000007d
+.size  AES_Td#,8*256*4 // HP-UX assembler fails to ".-AES_Td#"