Following the license change, modify the boilerplates in crypto/aes/
[openssl.git] / crypto / aes / asm / aes-ppc.pl
index 9c8a8d32b82acbf5adbe9d1c5021737d81d5b93f..8a3110a9a69e08c48a220053fa17d402fccd0df3 100644 (file)
@@ -1,42 +1,65 @@
-#!/usr/bin/env perl
+#! /usr/bin/env perl
+# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the Apache License 2.0 (the "License").  You may not use
+# this file except in compliance with the License.  You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
 
 # ====================================================================
-# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
+# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
 # project. The module is, however, dual licensed under OpenSSL and
 # CRYPTOGAMS licenses depending on where you obtain it. For further
 # details see http://www.openssl.org/~appro/cryptogams/.
 # ====================================================================
 
-# Needs more work: key setup, page boundaries, CBC routine...
+# Needs more work: key setup, CBC routine...
 #
 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
 # 4.0. But these are not the ones currently used! Their "compact"
-# counterparts are, for security reason. ppc_AES_crypt_compact runs at
-# 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3
-# of ppc_AES_decrypt.
+# counterparts are, for security reasons. ppc_AES_encrypt_compact runs
+# at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact runs
+# at 1/3 of ppc_AES_decrypt speed.
+
+# February 2010
+#
+# Rescheduling instructions to favour the Power6 pipeline gave a 10%
+# performance improvement on that platform (and a marginal improvement
+# even on others). It should be noted that Power6 fails to process a
+# byte in 18 cycles, taking 23 instead, because it cannot issue 4 load
+# instructions in two cycles, only in three. As a result the non-compact
+# block subroutines are 25% slower than one would expect. The compact
+# functions scale better, because their purely computational part scales
+# perfectly with clock frequency. To be specific, ppc_AES_encrypt_compact
+# operates at 42 cycles per byte, while ppc_AES_decrypt_compact runs at
+# 55 (in the 64-bit build).
 
-$output = shift;
+$flavour = shift;
 
-if ($output =~ /64\.s/) {
+if ($flavour =~ /64/) {
        $SIZE_T =8;
+       $LRSAVE =2*$SIZE_T;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
-} elsif ($output =~ /32\.s/) {
+} elsif ($flavour =~ /32/) {
        $SIZE_T =4;
+       $LRSAVE =$SIZE_T;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
+
+$LITTLE_ENDIAN = ($flavour=~/le$/) ? $SIZE_T : 0;
 
 $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 ( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 ( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 die "can't locate ppc-xlate.pl";
 
-( defined shift || open STDOUT,"| $^X $xlate $output" ) ||
-       die "can't call $xlate: $!";
+open STDOUT,"| $^X $xlate $flavour ".shift || die "can't call $xlate: $!";
 
 $FRAME=32*$SIZE_T;
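
For reference, the script is now driven by a $flavour argument (encoding 32- vs 64-bit and big- vs little-endian targets) instead of inferring the target from the output file name; the output file is the second argument, which is passed on to ppc-xlate.pl. A minimal standalone sketch of that selection logic, with a hypothetical default flavour string:

    #!/usr/bin/env perl
    # illustrative sketch only, not part of the patch; the default
    # flavour string below is an assumption
    my $flavour = shift || "linux64le";
    my ($SIZE_T, $LRSAVE);
    if    ($flavour =~ /64/) { $SIZE_T = 8; $LRSAVE = 2*$SIZE_T; }
    elsif ($flavour =~ /32/) { $SIZE_T = 4; $LRSAVE = $SIZE_T;   }
    else  { die "nonsense $flavour"; }
    my $LITTLE_ENDIAN = ($flavour =~ /le$/) ? $SIZE_T : 0;
    printf "SIZE_T=%d LRSAVE=%d LITTLE_ENDIAN=%d\n",
           $SIZE_T, $LRSAVE, $LITTLE_ENDIAN;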
 
@@ -54,7 +77,7 @@ $key="r5";
 $Tbl0="r3";
 $Tbl1="r6";
 $Tbl2="r7";
-$Tbl3="r2";
+$Tbl3=$out;    # stay away from "r2" (TOC pointer); $out is offloaded to stack
 
 $s0="r8";
 $s1="r9";
@@ -62,7 +85,7 @@ $s2="r10";
 $s3="r11";
 
 $t0="r12";
-$t1="r13";
+$t1="r0";      # stay away from "r13";
 $t2="r14";
 $t3="r15";
 
@@ -86,13 +109,11 @@ $acc13="r29";
 $acc14="r30";
 $acc15="r31";
 
-# stay away from TLS pointer
-if ($SIZE_T==8)        { $t1="r0";             }
-else           { $Tbl3=$t0; $t="r0";   }
 $mask80=$Tbl2;
 $mask1b=$Tbl3;
 
 $code.=<<___;
+.machine       "any"
 .text
 
 .align 7
@@ -103,15 +124,19 @@ LAES_Te:
        addi    $Tbl0,$Tbl0,`128-8`
        mtlr    r0
        blr
-       .space  `32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `64-9*4`
 LAES_Td:
        mflr    r0
        bcl     20,31,\$+4
        mflr    $Tbl0   ;    vvvvvvvv "distance" between . and 1st data entry
-       addi    $Tbl0,$Tbl0,`128-8-32+2048+256`
+       addi    $Tbl0,$Tbl0,`128-64-8+2048+256`
        mtlr    r0
        blr
-       .space  `128-32-24`
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+       .space  `128-64-9*4`
 ___
 &_data_word(
        0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
@@ -315,12 +340,10 @@ $code.=<<___;
 .globl .AES_encrypt
 .align 7
 .AES_encrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
-       $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
-       $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $out,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
@@ -339,21 +362,144 @@ $code.=<<___;
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
+
+       andi.   $t0,$inp,3
+       andi.   $t1,$out,3
+       or.     $t0,$t0,$t1
+       bne     Lenc_unaligned
 
+Lenc_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
        lwz     $s2,8($inp)
        lwz     $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       lwz     $t0,0($inp)
+       lwz     $t1,4($inp)
+       lwz     $t2,8($inp)
+       lwz     $t3,12($inp)
+       rotlwi  $s0,$t0,8
+       rotlwi  $s1,$t1,8
+       rotlwi  $s2,$t2,8
+       rotlwi  $s3,$t3,8
+       rlwimi  $s0,$t0,24,0,7
+       rlwimi  $s1,$t1,24,0,7
+       rlwimi  $s2,$t2,24,0,7
+       rlwimi  $s3,$t3,24,0,7
+       rlwimi  $s0,$t0,24,16,23
+       rlwimi  $s1,$t1,24,16,23
+       rlwimi  $s2,$t2,24,16,23
+       rlwimi  $s3,$t3,24,16,23
+___
+$code.=<<___;
        bl      LAES_Te
        bl      Lppc_AES_encrypt_compact
+       $POP    $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       rotlwi  $t0,$s0,8
+       rotlwi  $t1,$s1,8
+       rotlwi  $t2,$s2,8
+       rotlwi  $t3,$s3,8
+       rlwimi  $t0,$s0,24,0,7
+       rlwimi  $t1,$s1,24,0,7
+       rlwimi  $t2,$s2,24,0,7
+       rlwimi  $t3,$s3,24,0,7
+       rlwimi  $t0,$s0,24,16,23
+       rlwimi  $t1,$s1,24,16,23
+       rlwimi  $t2,$s2,24,16,23
+       rlwimi  $t3,$s3,24,16,23
+       stw     $t0,0($out)
+       stw     $t1,4($out)
+       stw     $t2,8($out)
+       stw     $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
        stw     $s0,0($out)
        stw     $s1,4($out)
        stw     $s2,8($out)
        stw     $s3,12($out)
+___
+$code.=<<___;
+       b       Lenc_done
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
-       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
-       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
+Lenc_unaligned:
+       subfic  $t0,$inp,4096
+       subfic  $t1,$out,4096
+       andi.   $t0,$t0,4096-16
+       beq     Lenc_xpage
+       andi.   $t1,$t1,4096-16
+       bne     Lenc_unaligned_ok
+
+Lenc_xpage:
+       lbz     $acc00,0($inp)
+       lbz     $acc01,1($inp)
+       lbz     $acc02,2($inp)
+       lbz     $s0,3($inp)
+       lbz     $acc04,4($inp)
+       lbz     $acc05,5($inp)
+       lbz     $acc06,6($inp)
+       lbz     $s1,7($inp)
+       lbz     $acc08,8($inp)
+       lbz     $acc09,9($inp)
+       lbz     $acc10,10($inp)
+       insrwi  $s0,$acc00,8,0
+       lbz     $s2,11($inp)
+       insrwi  $s1,$acc04,8,0
+       lbz     $acc12,12($inp)
+       insrwi  $s0,$acc01,8,8
+       lbz     $acc13,13($inp)
+       insrwi  $s1,$acc05,8,8
+       lbz     $acc14,14($inp)
+       insrwi  $s0,$acc02,8,16
+       lbz     $s3,15($inp)
+       insrwi  $s1,$acc06,8,16
+       insrwi  $s2,$acc08,8,0
+       insrwi  $s3,$acc12,8,0
+       insrwi  $s2,$acc09,8,8
+       insrwi  $s3,$acc13,8,8
+       insrwi  $s2,$acc10,8,16
+       insrwi  $s3,$acc14,8,16
+
+       bl      LAES_Te
+       bl      Lppc_AES_encrypt_compact
+       $POP    $out,`$FRAME-$SIZE_T*19`($sp)
+
+       extrwi  $acc00,$s0,8,0
+       extrwi  $acc01,$s0,8,8
+       stb     $acc00,0($out)
+       extrwi  $acc02,$s0,8,16
+       stb     $acc01,1($out)
+       stb     $acc02,2($out)
+       extrwi  $acc04,$s1,8,0
+       stb     $s0,3($out)
+       extrwi  $acc05,$s1,8,8
+       stb     $acc04,4($out)
+       extrwi  $acc06,$s1,8,16
+       stb     $acc05,5($out)
+       stb     $acc06,6($out)
+       extrwi  $acc08,$s2,8,0
+       stb     $s1,7($out)
+       extrwi  $acc09,$s2,8,8
+       stb     $acc08,8($out)
+       extrwi  $acc10,$s2,8,16
+       stb     $acc09,9($out)
+       stb     $acc10,10($out)
+       extrwi  $acc12,$s3,8,0
+       stb     $s2,11($out)
+       extrwi  $acc13,$s3,8,8
+       stb     $acc12,12($out)
+       extrwi  $acc14,$s3,8,16
+       stb     $acc13,13($out)
+       stb     $acc14,14($out)
+       stb     $s3,15($out)
+
+Lenc_done:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
        $POP    r14,`$FRAME-$SIZE_T*18`($sp)
        $POP    r15,`$FRAME-$SIZE_T*17`($sp)
        $POP    r16,`$FRAME-$SIZE_T*16`($sp)
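
The rotlwi/rlwimi triples in the little-endian load and store paths above are simply a 32-bit byte swap, so on little-endian targets the state words are still processed in the big-endian byte order the tables expect. A sketch of the equivalent operation (illustrative only, not part of the patch):

    sub bswap32 {
        my $t = shift;
        return (($t >> 24) & 0x000000ff) |
               (($t >>  8) & 0x0000ff00) |
               (($t <<  8) & 0x00ff0000) |
               (($t << 24) & 0xff000000);
    }
    printf "%08x\n", bswap32(0x01020304);   # prints 04030201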
@@ -375,18 +521,21 @@ $code.=<<___;
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
-.align 4
+.align 5
 Lppc_AES_encrypt:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,3
+       lwz     $t0,0($key)
        addi    $Tbl2,$Tbl0,2
+       lwz     $t1,4($key)
        addi    $Tbl3,$Tbl0,1
+       lwz     $t2,8($key)
        addi    $acc00,$acc00,-1
+       lwz     $t3,12($key)
        addi    $key,$key,16
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
@@ -400,44 +549,44 @@ Lenc_loop:
        rlwinm  $acc02,$s2,`32-24+3`,21,28
        rlwinm  $acc03,$s3,`32-24+3`,21,28
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc04,$s1,`32-16+3`,21,28
+       lwz     $t1,4($key)
        rlwinm  $acc05,$s2,`32-16+3`,21,28
+       lwz     $t2,8($key)
        rlwinm  $acc06,$s3,`32-16+3`,21,28
+       lwz     $t3,12($key)
        rlwinm  $acc07,$s0,`32-16+3`,21,28
        lwzx    $acc00,$Tbl0,$acc00
-       lwzx    $acc01,$Tbl0,$acc01
-       lwzx    $acc02,$Tbl0,$acc02
-       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc08,$s2,`32-8+3`,21,28
+       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc09,$s3,`32-8+3`,21,28
+       lwzx    $acc02,$Tbl0,$acc02
        rlwinm  $acc10,$s0,`32-8+3`,21,28
+       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc11,$s1,`32-8+3`,21,28
        lwzx    $acc04,$Tbl1,$acc04
-       lwzx    $acc05,$Tbl1,$acc05
-       lwzx    $acc06,$Tbl1,$acc06
-       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc12,$s3,`0+3`,21,28
+       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s0,`0+3`,21,28
+       lwzx    $acc06,$Tbl1,$acc06
        rlwinm  $acc14,$s1,`0+3`,21,28
+       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s2,`0+3`,21,28
        lwzx    $acc08,$Tbl2,$acc08
-       lwzx    $acc09,$Tbl2,$acc09
-       lwzx    $acc10,$Tbl2,$acc10
-       lwzx    $acc11,$Tbl2,$acc11
        xor     $t0,$t0,$acc00
+       lwzx    $acc09,$Tbl2,$acc09
        xor     $t1,$t1,$acc01
+       lwzx    $acc10,$Tbl2,$acc10
        xor     $t2,$t2,$acc02
+       lwzx    $acc11,$Tbl2,$acc11
        xor     $t3,$t3,$acc03
        lwzx    $acc12,$Tbl3,$acc12
-       lwzx    $acc13,$Tbl3,$acc13
-       lwzx    $acc14,$Tbl3,$acc14
-       lwzx    $acc15,$Tbl3,$acc15
        xor     $t0,$t0,$acc04
+       lwzx    $acc13,$Tbl3,$acc13
        xor     $t1,$t1,$acc05
+       lwzx    $acc14,$Tbl3,$acc14
        xor     $t2,$t2,$acc06
+       lwzx    $acc15,$Tbl3,$acc15
        xor     $t3,$t3,$acc07
        xor     $t0,$t0,$acc08
        xor     $t1,$t1,$acc09
@@ -448,65 +597,65 @@ Lenc_loop:
        xor     $s2,$t2,$acc14
        xor     $s3,$t3,$acc15
        addi    $key,$key,16
-       bdnz-   Lenc_loop
+       bdnz    Lenc_loop
 
        addi    $Tbl2,$Tbl0,2048
        nop
-       lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Te4
-       lwz     $acc09,`2048+32`($Tbl0)
-       lwz     $acc10,`2048+64`($Tbl0)
-       lwz     $acc11,`2048+96`($Tbl0)
-       lwz     $acc08,`2048+128`($Tbl0)
-       lwz     $acc09,`2048+160`($Tbl0)
-       lwz     $acc10,`2048+192`($Tbl0)
-       lwz     $acc11,`2048+224`($Tbl0)
-       rlwinm  $acc00,$s0,`32-24`,24,31
-       rlwinm  $acc01,$s1,`32-24`,24,31
-       rlwinm  $acc02,$s2,`32-24`,24,31
-       rlwinm  $acc03,$s3,`32-24`,24,31
        lwz     $t0,0($key)
+       rlwinm  $acc00,$s0,`32-24`,24,31
        lwz     $t1,4($key)
+       rlwinm  $acc01,$s1,`32-24`,24,31
        lwz     $t2,8($key)
+       rlwinm  $acc02,$s2,`32-24`,24,31
        lwz     $t3,12($key)
+       rlwinm  $acc03,$s3,`32-24`,24,31
+       lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Te4
        rlwinm  $acc04,$s1,`32-16`,24,31
+       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc05,$s2,`32-16`,24,31
+       lwz     $acc10,`2048+64`($Tbl0)
        rlwinm  $acc06,$s3,`32-16`,24,31
+       lwz     $acc11,`2048+96`($Tbl0)
        rlwinm  $acc07,$s0,`32-16`,24,31
-       lbzx    $acc00,$Tbl2,$acc00
-       lbzx    $acc01,$Tbl2,$acc01
-       lbzx    $acc02,$Tbl2,$acc02
-       lbzx    $acc03,$Tbl2,$acc03
+       lwz     $acc12,`2048+128`($Tbl0)
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lwz     $acc13,`2048+160`($Tbl0)
        rlwinm  $acc09,$s3,`32-8`,24,31
+       lwz     $acc14,`2048+192`($Tbl0)
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc11,$s1,`32-8`,24,31
-       lbzx    $acc04,$Tbl2,$acc04
-       lbzx    $acc05,$Tbl2,$acc05
-       lbzx    $acc06,$Tbl2,$acc06
-       lbzx    $acc07,$Tbl2,$acc07
+       lbzx    $acc00,$Tbl2,$acc00
        rlwinm  $acc12,$s3,`0`,24,31
+       lbzx    $acc01,$Tbl2,$acc01
        rlwinm  $acc13,$s0,`0`,24,31
+       lbzx    $acc02,$Tbl2,$acc02
        rlwinm  $acc14,$s1,`0`,24,31
+       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc15,$s2,`0`,24,31
-       lbzx    $acc08,$Tbl2,$acc08
-       lbzx    $acc09,$Tbl2,$acc09
-       lbzx    $acc10,$Tbl2,$acc10
-       lbzx    $acc11,$Tbl2,$acc11
+       lbzx    $acc04,$Tbl2,$acc04
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $s1,$acc01,24,0,7
+       lbzx    $acc06,$Tbl2,$acc06
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $s3,$acc03,24,0,7
-       lbzx    $acc12,$Tbl2,$acc12
-       lbzx    $acc13,$Tbl2,$acc13
-       lbzx    $acc14,$Tbl2,$acc14
-       lbzx    $acc15,$Tbl2,$acc15
+       lbzx    $acc08,$Tbl2,$acc08
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc09,$Tbl2,$acc09
        rlwimi  $s1,$acc05,16,8,15
+       lbzx    $acc10,$Tbl2,$acc10
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc11,$Tbl2,$acc11
        rlwimi  $s3,$acc07,16,8,15
+       lbzx    $acc12,$Tbl2,$acc12
        rlwimi  $s0,$acc08,8,16,23
+       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s1,$acc09,8,16,23
+       lbzx    $acc14,$Tbl2,$acc14
        rlwimi  $s2,$acc10,8,16,23
+       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s3,$acc11,8,16,23
        or      $s0,$s0,$acc12
        or      $s1,$s1,$acc13
@@ -517,92 +666,96 @@ Lenc_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_encrypt_compact:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,2048
+       lwz     $t0,0($key)
        lis     $mask80,0x8080
+       lwz     $t1,4($key)
        lis     $mask1b,0x1b1b
-       addi    $acc00,$acc00,-1
-       addi    $key,$key,16
+       lwz     $t2,8($key)
        ori     $mask80,$mask80,0x8080
+       lwz     $t3,12($key)
        ori     $mask1b,$mask1b,0x1b1b
-       xor     $s0,$s0,$t0
-       xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
+       addi    $key,$key,16
        mtctr   $acc00
 .align 4
 Lenc_compact_loop:
+       xor     $s0,$s0,$t0
+       xor     $s1,$s1,$t1
        rlwinm  $acc00,$s0,`32-24`,24,31
+       xor     $s2,$s2,$t2
        rlwinm  $acc01,$s1,`32-24`,24,31
+       xor     $s3,$s3,$t3
        rlwinm  $acc02,$s2,`32-24`,24,31
        rlwinm  $acc03,$s3,`32-24`,24,31
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc04,$s1,`32-16`,24,31
        rlwinm  $acc05,$s2,`32-16`,24,31
        rlwinm  $acc06,$s3,`32-16`,24,31
        rlwinm  $acc07,$s0,`32-16`,24,31
        lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
-       lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc09,$s3,`32-8`,24,31
+       lbzx    $acc02,$Tbl1,$acc02
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
-       lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc12,$s3,`0`,24,31
+       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s0,`0`,24,31
+       lbzx    $acc06,$Tbl1,$acc06
        rlwinm  $acc14,$s1,`0`,24,31
+       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s2,`0`,24,31
        lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
-       lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s1,$acc01,24,0,7
+       lbzx    $acc10,$Tbl1,$acc10
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
-       lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s1,$acc05,16,8,15
+       lbzx    $acc14,$Tbl1,$acc14
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
        rlwimi  $s2,$acc10,8,16,23
        rlwimi  $s3,$acc11,8,16,23
+       lwz     $t0,0($key)
        or      $s0,$s0,$acc12
+       lwz     $t1,4($key)
        or      $s1,$s1,$acc13
+       lwz     $t2,8($key)
        or      $s2,$s2,$acc14
+       lwz     $t3,12($key)
        or      $s3,$s3,$acc15
 
+       addi    $key,$key,16
+       bdz     Lenc_compact_done
+
        and     $acc00,$s0,$mask80      # r1=r0&0x80808080
        and     $acc01,$s1,$mask80
        and     $acc02,$s2,$mask80
        and     $acc03,$s3,$mask80
        srwi    $acc04,$acc00,7         # r1>>7
-       srwi    $acc05,$acc01,7
-       srwi    $acc06,$acc02,7
-       srwi    $acc07,$acc03,7
        andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
+       srwi    $acc05,$acc01,7
        andc    $acc09,$s1,$mask80
+       srwi    $acc06,$acc02,7
        andc    $acc10,$s2,$mask80
+       srwi    $acc07,$acc03,7
        andc    $acc11,$s3,$mask80
        sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
        sub     $acc01,$acc01,$acc05
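
The mask80/mask1b arithmetic in Lenc_compact_loop above is a packed GF(2^8) doubling, i.e. the AES "xtime" operation applied to four state bytes at once (the decrypt loop later chains it to produce r2, r4 and r8). A sketch of what one such step computes (illustrative only, not part of the patch):

    sub xtime4 {
        my $r0 = shift;                  # four state bytes in one 32-bit word
        my $r1 = $r0 & 0x80808080;       # high bit of every byte
        my $lo = $r0 & 0x7f7f7f7f;       # low seven bits of every byte
        # (r1 - (r1>>7)) & 0x1b1b1b1b is 0x1b exactly in the bytes whose high
        # bit was set, i.e. the conditional reduction modulo the AES polynomial
        return (($lo << 1) ^ (($r1 - ($r1 >> 7)) & 0x1b1b1b1b)) & 0xffffffff;
    }
    printf "%08x\n", xtime4(0x01809010);  # prints 021b3b20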
@@ -618,113 +771,57 @@ Lenc_compact_loop:
        and     $acc03,$acc03,$mask1b
        xor     $acc00,$acc00,$acc08    # r2
        xor     $acc01,$acc01,$acc09
+        rotlwi $acc12,$s0,16           # ROTATE(r0,16)
        xor     $acc02,$acc02,$acc10
+        rotlwi $acc13,$s1,16
        xor     $acc03,$acc03,$acc11
+        rotlwi $acc14,$s2,16
 
-       rotlwi  $acc12,$s0,16           # ROTATE(r0,16)
-       rotlwi  $acc13,$s1,16
-       rotlwi  $acc14,$s2,16
-       rotlwi  $acc15,$s3,16
        xor     $s0,$s0,$acc00          # r0^r2
+       rotlwi  $acc15,$s3,16
        xor     $s1,$s1,$acc01
-       xor     $s2,$s2,$acc02
-       xor     $s3,$s3,$acc03
        rotrwi  $s0,$s0,24              # ROTATE(r2^r0,24)
+       xor     $s2,$s2,$acc02
        rotrwi  $s1,$s1,24
+       xor     $s3,$s3,$acc03
        rotrwi  $s2,$s2,24
-       rotrwi  $s3,$s3,24
        xor     $s0,$s0,$acc00          # ROTATE(r2^r0,24)^r2
+       rotrwi  $s3,$s3,24
        xor     $s1,$s1,$acc01
        xor     $s2,$s2,$acc02
        xor     $s3,$s3,$acc03
        rotlwi  $acc08,$acc12,8         # ROTATE(r0,24)
-       rotlwi  $acc09,$acc13,8
-       rotlwi  $acc10,$acc14,8
-       rotlwi  $acc11,$acc15,8
        xor     $s0,$s0,$acc12          #
+       rotlwi  $acc09,$acc13,8
        xor     $s1,$s1,$acc13
+       rotlwi  $acc10,$acc14,8
        xor     $s2,$s2,$acc14
+       rotlwi  $acc11,$acc15,8
        xor     $s3,$s3,$acc15
        xor     $s0,$s0,$acc08          #
        xor     $s1,$s1,$acc09
        xor     $s2,$s2,$acc10
        xor     $s3,$s3,$acc11
 
-       xor     $s0,$s0,$t0
-       xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
-
-       addi    $key,$key,16
-       bdnz-   Lenc_compact_loop
-
-       rlwinm  $acc00,$s0,`32-24`,24,31
-       rlwinm  $acc01,$s1,`32-24`,24,31
-       rlwinm  $acc02,$s2,`32-24`,24,31
-       rlwinm  $acc03,$s3,`32-24`,24,31
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
-       rlwinm  $acc04,$s1,`32-16`,24,31
-       rlwinm  $acc05,$s2,`32-16`,24,31
-       rlwinm  $acc06,$s3,`32-16`,24,31
-       rlwinm  $acc07,$s0,`32-16`,24,31
-       lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
-       lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
-       rlwinm  $acc08,$s2,`32-8`,24,31
-       rlwinm  $acc09,$s3,`32-8`,24,31
-       rlwinm  $acc10,$s0,`32-8`,24,31
-       rlwinm  $acc11,$s1,`32-8`,24,31
-       lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
-       lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
-       rlwinm  $acc12,$s3,`0`,24,31
-       rlwinm  $acc13,$s0,`0`,24,31
-       rlwinm  $acc14,$s1,`0`,24,31
-       rlwinm  $acc15,$s2,`0`,24,31
-       lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
-       lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
-       rlwinm  $s0,$acc00,24,0,7
-       rlwinm  $s1,$acc01,24,0,7
-       rlwinm  $s2,$acc02,24,0,7
-       rlwinm  $s3,$acc03,24,0,7
-       lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
-       lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
-       rlwimi  $s0,$acc04,16,8,15
-       rlwimi  $s1,$acc05,16,8,15
-       rlwimi  $s2,$acc06,16,8,15
-       rlwimi  $s3,$acc07,16,8,15
-       rlwimi  $s0,$acc08,8,16,23
-       rlwimi  $s1,$acc09,8,16,23
-       rlwimi  $s2,$acc10,8,16,23
-       rlwimi  $s3,$acc11,8,16,23
-       or      $s0,$s0,$acc12
-       or      $s1,$s1,$acc13
-       or      $s2,$s2,$acc14
-       or      $s3,$s3,$acc15
+       b       Lenc_compact_loop
+.align 4
+Lenc_compact_done:
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .AES_encrypt,.-.AES_encrypt
 
 .globl .AES_decrypt
 .align 7
 .AES_decrypt:
-       mflr    r0
        $STU    $sp,-$FRAME($sp)
+       mflr    r0
 
-       $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
-       $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
-       $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
+       $PUSH   $out,`$FRAME-$SIZE_T*19`($sp)
        $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
        $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
        $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
@@ -743,24 +840,147 @@ Lenc_compact_loop:
        $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
        $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
        $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
+       $PUSH   r0,`$FRAME+$LRSAVE`($sp)
 
+       andi.   $t0,$inp,3
+       andi.   $t1,$out,3
+       or.     $t0,$t0,$t1
+       bne     Ldec_unaligned
+
+Ldec_unaligned_ok:
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
        lwz     $s0,0($inp)
        lwz     $s1,4($inp)
        lwz     $s2,8($inp)
        lwz     $s3,12($inp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       lwz     $t0,0($inp)
+       lwz     $t1,4($inp)
+       lwz     $t2,8($inp)
+       lwz     $t3,12($inp)
+       rotlwi  $s0,$t0,8
+       rotlwi  $s1,$t1,8
+       rotlwi  $s2,$t2,8
+       rotlwi  $s3,$t3,8
+       rlwimi  $s0,$t0,24,0,7
+       rlwimi  $s1,$t1,24,0,7
+       rlwimi  $s2,$t2,24,0,7
+       rlwimi  $s3,$t3,24,0,7
+       rlwimi  $s0,$t0,24,16,23
+       rlwimi  $s1,$t1,24,16,23
+       rlwimi  $s2,$t2,24,16,23
+       rlwimi  $s3,$t3,24,16,23
+___
+$code.=<<___;
        bl      LAES_Td
        bl      Lppc_AES_decrypt_compact
+       $POP    $out,`$FRAME-$SIZE_T*19`($sp)
+___
+$code.=<<___ if ($LITTLE_ENDIAN);
+       rotlwi  $t0,$s0,8
+       rotlwi  $t1,$s1,8
+       rotlwi  $t2,$s2,8
+       rotlwi  $t3,$s3,8
+       rlwimi  $t0,$s0,24,0,7
+       rlwimi  $t1,$s1,24,0,7
+       rlwimi  $t2,$s2,24,0,7
+       rlwimi  $t3,$s3,24,0,7
+       rlwimi  $t0,$s0,24,16,23
+       rlwimi  $t1,$s1,24,16,23
+       rlwimi  $t2,$s2,24,16,23
+       rlwimi  $t3,$s3,24,16,23
+       stw     $t0,0($out)
+       stw     $t1,4($out)
+       stw     $t2,8($out)
+       stw     $t3,12($out)
+___
+$code.=<<___ if (!$LITTLE_ENDIAN);
        stw     $s0,0($out)
        stw     $s1,4($out)
        stw     $s2,8($out)
        stw     $s3,12($out)
+___
+$code.=<<___;
+       b       Ldec_done
 
-       $POP    r0,`$FRAME-$SIZE_T*21`($sp)
-       $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
-       $POP    r13,`$FRAME-$SIZE_T*19`($sp)
-       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
-       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
-       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
+Ldec_unaligned:
+       subfic  $t0,$inp,4096
+       subfic  $t1,$out,4096
+       andi.   $t0,$t0,4096-16
+       beq     Ldec_xpage
+       andi.   $t1,$t1,4096-16
+       bne     Ldec_unaligned_ok
+
+Ldec_xpage:
+       lbz     $acc00,0($inp)
+       lbz     $acc01,1($inp)
+       lbz     $acc02,2($inp)
+       lbz     $s0,3($inp)
+       lbz     $acc04,4($inp)
+       lbz     $acc05,5($inp)
+       lbz     $acc06,6($inp)
+       lbz     $s1,7($inp)
+       lbz     $acc08,8($inp)
+       lbz     $acc09,9($inp)
+       lbz     $acc10,10($inp)
+       insrwi  $s0,$acc00,8,0
+       lbz     $s2,11($inp)
+       insrwi  $s1,$acc04,8,0
+       lbz     $acc12,12($inp)
+       insrwi  $s0,$acc01,8,8
+       lbz     $acc13,13($inp)
+       insrwi  $s1,$acc05,8,8
+       lbz     $acc14,14($inp)
+       insrwi  $s0,$acc02,8,16
+       lbz     $s3,15($inp)
+       insrwi  $s1,$acc06,8,16
+       insrwi  $s2,$acc08,8,0
+       insrwi  $s3,$acc12,8,0
+       insrwi  $s2,$acc09,8,8
+       insrwi  $s3,$acc13,8,8
+       insrwi  $s2,$acc10,8,16
+       insrwi  $s3,$acc14,8,16
+
+       bl      LAES_Td
+       bl      Lppc_AES_decrypt_compact
+       $POP    $out,`$FRAME-$SIZE_T*19`($sp)
+
+       extrwi  $acc00,$s0,8,0
+       extrwi  $acc01,$s0,8,8
+       stb     $acc00,0($out)
+       extrwi  $acc02,$s0,8,16
+       stb     $acc01,1($out)
+       stb     $acc02,2($out)
+       extrwi  $acc04,$s1,8,0
+       stb     $s0,3($out)
+       extrwi  $acc05,$s1,8,8
+       stb     $acc04,4($out)
+       extrwi  $acc06,$s1,8,16
+       stb     $acc05,5($out)
+       stb     $acc06,6($out)
+       extrwi  $acc08,$s2,8,0
+       stb     $s1,7($out)
+       extrwi  $acc09,$s2,8,8
+       stb     $acc08,8($out)
+       extrwi  $acc10,$s2,8,16
+       stb     $acc09,9($out)
+       stb     $acc10,10($out)
+       extrwi  $acc12,$s3,8,0
+       stb     $s2,11($out)
+       extrwi  $acc13,$s3,8,8
+       stb     $acc12,12($out)
+       extrwi  $acc14,$s3,8,16
+       stb     $acc13,13($out)
+       stb     $acc14,14($out)
+       stb     $s3,15($out)
+
+Ldec_done:
+       $POP    r0,`$FRAME+$LRSAVE`($sp)
+       $POP    r14,`$FRAME-$SIZE_T*18`($sp)
+       $POP    r15,`$FRAME-$SIZE_T*17`($sp)
+       $POP    r16,`$FRAME-$SIZE_T*16`($sp)
        $POP    r17,`$FRAME-$SIZE_T*15`($sp)
        $POP    r18,`$FRAME-$SIZE_T*14`($sp)
        $POP    r19,`$FRAME-$SIZE_T*13`($sp)
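
The Lenc_unaligned/Ldec_unaligned paths above fall back to byte-wise loads and stores only when a misaligned 16-byte block would actually cross a 4KB page boundary; otherwise the word accesses are issued as-is. A sketch of the test performed by the subfic/andi. pair (illustrative only, with the 4096-byte page size the assembly assumes):

    sub crosses_page {
        my $addr = shift;
        # mirrors "subfic t,addr,4096; andi. t,t,4096-16; beq ..._xpage":
        # true when fewer than 16 bytes remain before the next page boundary
        # (page-aligned addresses are conservatively treated the same way)
        return ((4096 - ($addr & 0xfff)) & (4096 - 16)) == 0;
    }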
@@ -779,18 +999,21 @@ Lenc_compact_loop:
        mtlr    r0
        addi    $sp,$sp,$FRAME
        blr
+       .long   0
+       .byte   0,12,4,1,0x80,18,3,0
+       .long   0
 
-.align 4
+.align 5
 Lppc_AES_decrypt:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,3
+       lwz     $t0,0($key)
        addi    $Tbl2,$Tbl0,2
+       lwz     $t1,4($key)
        addi    $Tbl3,$Tbl0,1
+       lwz     $t2,8($key)
        addi    $acc00,$acc00,-1
+       lwz     $t3,12($key)
        addi    $key,$key,16
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
@@ -804,44 +1027,44 @@ Ldec_loop:
        rlwinm  $acc02,$s2,`32-24+3`,21,28
        rlwinm  $acc03,$s3,`32-24+3`,21,28
        lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc04,$s3,`32-16+3`,21,28
+       lwz     $t1,4($key)
        rlwinm  $acc05,$s0,`32-16+3`,21,28
+       lwz     $t2,8($key)
        rlwinm  $acc06,$s1,`32-16+3`,21,28
+       lwz     $t3,12($key)
        rlwinm  $acc07,$s2,`32-16+3`,21,28
        lwzx    $acc00,$Tbl0,$acc00
-       lwzx    $acc01,$Tbl0,$acc01
-       lwzx    $acc02,$Tbl0,$acc02
-       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc08,$s2,`32-8+3`,21,28
+       lwzx    $acc01,$Tbl0,$acc01
        rlwinm  $acc09,$s3,`32-8+3`,21,28
+       lwzx    $acc02,$Tbl0,$acc02
        rlwinm  $acc10,$s0,`32-8+3`,21,28
+       lwzx    $acc03,$Tbl0,$acc03
        rlwinm  $acc11,$s1,`32-8+3`,21,28
        lwzx    $acc04,$Tbl1,$acc04
-       lwzx    $acc05,$Tbl1,$acc05
-       lwzx    $acc06,$Tbl1,$acc06
-       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc12,$s1,`0+3`,21,28
+       lwzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s2,`0+3`,21,28
+       lwzx    $acc06,$Tbl1,$acc06
        rlwinm  $acc14,$s3,`0+3`,21,28
+       lwzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s0,`0+3`,21,28
        lwzx    $acc08,$Tbl2,$acc08
-       lwzx    $acc09,$Tbl2,$acc09
-       lwzx    $acc10,$Tbl2,$acc10
-       lwzx    $acc11,$Tbl2,$acc11
        xor     $t0,$t0,$acc00
+       lwzx    $acc09,$Tbl2,$acc09
        xor     $t1,$t1,$acc01
+       lwzx    $acc10,$Tbl2,$acc10
        xor     $t2,$t2,$acc02
+       lwzx    $acc11,$Tbl2,$acc11
        xor     $t3,$t3,$acc03
        lwzx    $acc12,$Tbl3,$acc12
-       lwzx    $acc13,$Tbl3,$acc13
-       lwzx    $acc14,$Tbl3,$acc14
-       lwzx    $acc15,$Tbl3,$acc15
        xor     $t0,$t0,$acc04
+       lwzx    $acc13,$Tbl3,$acc13
        xor     $t1,$t1,$acc05
+       lwzx    $acc14,$Tbl3,$acc14
        xor     $t2,$t2,$acc06
+       lwzx    $acc15,$Tbl3,$acc15
        xor     $t3,$t3,$acc07
        xor     $t0,$t0,$acc08
        xor     $t1,$t1,$acc09
@@ -852,61 +1075,61 @@ Ldec_loop:
        xor     $s2,$t2,$acc14
        xor     $s3,$t3,$acc15
        addi    $key,$key,16
-       bdnz-   Ldec_loop
+       bdnz    Ldec_loop
 
        addi    $Tbl2,$Tbl0,2048
        nop
-       lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Td4
-       lwz     $acc09,`2048+32`($Tbl0)
-       lwz     $acc10,`2048+64`($Tbl0)
-       lwz     $acc11,`2048+96`($Tbl0)
-       lwz     $acc08,`2048+128`($Tbl0)
-       lwz     $acc09,`2048+160`($Tbl0)
-       lwz     $acc10,`2048+192`($Tbl0)
-       lwz     $acc11,`2048+224`($Tbl0)
-       rlwinm  $acc00,$s0,`32-24`,24,31
-       rlwinm  $acc01,$s1,`32-24`,24,31
-       rlwinm  $acc02,$s2,`32-24`,24,31
-       rlwinm  $acc03,$s3,`32-24`,24,31
        lwz     $t0,0($key)
+       rlwinm  $acc00,$s0,`32-24`,24,31
        lwz     $t1,4($key)
+       rlwinm  $acc01,$s1,`32-24`,24,31
        lwz     $t2,8($key)
+       rlwinm  $acc02,$s2,`32-24`,24,31
        lwz     $t3,12($key)
+       rlwinm  $acc03,$s3,`32-24`,24,31
+       lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Td4
        rlwinm  $acc04,$s3,`32-16`,24,31
+       lwz     $acc09,`2048+32`($Tbl0)
        rlwinm  $acc05,$s0,`32-16`,24,31
-       rlwinm  $acc06,$s1,`32-16`,24,31
-       rlwinm  $acc07,$s2,`32-16`,24,31
+       lwz     $acc10,`2048+64`($Tbl0)
        lbzx    $acc00,$Tbl2,$acc00
+       lwz     $acc11,`2048+96`($Tbl0)
        lbzx    $acc01,$Tbl2,$acc01
-       lbzx    $acc02,$Tbl2,$acc02
-       lbzx    $acc03,$Tbl2,$acc03
+       lwz     $acc12,`2048+128`($Tbl0)
+       rlwinm  $acc06,$s1,`32-16`,24,31
+       lwz     $acc13,`2048+160`($Tbl0)
+       rlwinm  $acc07,$s2,`32-16`,24,31
+       lwz     $acc14,`2048+192`($Tbl0)
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lwz     $acc15,`2048+224`($Tbl0)
        rlwinm  $acc09,$s3,`32-8`,24,31
+       lbzx    $acc02,$Tbl2,$acc02
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl2,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl2,$acc04
-       lbzx    $acc05,$Tbl2,$acc05
-       lbzx    $acc06,$Tbl2,$acc06
-       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $acc12,$s1,`0`,24,31
+       lbzx    $acc05,$Tbl2,$acc05
        rlwinm  $acc13,$s2,`0`,24,31
+       lbzx    $acc06,$Tbl2,$acc06
        rlwinm  $acc14,$s3,`0`,24,31
+       lbzx    $acc07,$Tbl2,$acc07
        rlwinm  $acc15,$s0,`0`,24,31
        lbzx    $acc08,$Tbl2,$acc08
-       lbzx    $acc09,$Tbl2,$acc09
-       lbzx    $acc10,$Tbl2,$acc10
-       lbzx    $acc11,$Tbl2,$acc11
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl2,$acc09
        rlwinm  $s1,$acc01,24,0,7
+       lbzx    $acc10,$Tbl2,$acc10
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl2,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl2,$acc12
-       lbzx    $acc13,$Tbl2,$acc13
-       lbzx    $acc14,$Tbl2,$acc14
-       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl2,$acc13
        rlwimi  $s1,$acc05,16,8,15
+       lbzx    $acc14,$Tbl2,$acc14
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl2,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
@@ -921,92 +1144,165 @@ Ldec_loop:
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
 
 .align 4
 Lppc_AES_decrypt_compact:
        lwz     $acc00,240($key)
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        addi    $Tbl1,$Tbl0,2048
+       lwz     $t0,0($key)
        lis     $mask80,0x8080
+       lwz     $t1,4($key)
        lis     $mask1b,0x1b1b
-       addi    $acc00,$acc00,-1
-       addi    $key,$key,16
+       lwz     $t2,8($key)
        ori     $mask80,$mask80,0x8080
+       lwz     $t3,12($key)
        ori     $mask1b,$mask1b,0x1b1b
-       xor     $s0,$s0,$t0
-       xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
+       addi    $key,$key,16
+___
+$code.=<<___ if ($SIZE_T==8);
+       insrdi  $mask80,$mask80,32,0
+       insrdi  $mask1b,$mask1b,32,0
+___
+$code.=<<___;
        mtctr   $acc00
 .align 4
 Ldec_compact_loop:
+       xor     $s0,$s0,$t0
+       xor     $s1,$s1,$t1
        rlwinm  $acc00,$s0,`32-24`,24,31
+       xor     $s2,$s2,$t2
        rlwinm  $acc01,$s1,`32-24`,24,31
+       xor     $s3,$s3,$t3
        rlwinm  $acc02,$s2,`32-24`,24,31
        rlwinm  $acc03,$s3,`32-24`,24,31
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
        rlwinm  $acc04,$s3,`32-16`,24,31
        rlwinm  $acc05,$s0,`32-16`,24,31
        rlwinm  $acc06,$s1,`32-16`,24,31
        rlwinm  $acc07,$s2,`32-16`,24,31
        lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
-       lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc08,$s2,`32-8`,24,31
+       lbzx    $acc01,$Tbl1,$acc01
        rlwinm  $acc09,$s3,`32-8`,24,31
+       lbzx    $acc02,$Tbl1,$acc02
        rlwinm  $acc10,$s0,`32-8`,24,31
+       lbzx    $acc03,$Tbl1,$acc03
        rlwinm  $acc11,$s1,`32-8`,24,31
        lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
-       lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc12,$s1,`0`,24,31
+       lbzx    $acc05,$Tbl1,$acc05
        rlwinm  $acc13,$s2,`0`,24,31
+       lbzx    $acc06,$Tbl1,$acc06
        rlwinm  $acc14,$s3,`0`,24,31
+       lbzx    $acc07,$Tbl1,$acc07
        rlwinm  $acc15,$s0,`0`,24,31
        lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
-       lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s0,$acc00,24,0,7
+       lbzx    $acc09,$Tbl1,$acc09
        rlwinm  $s1,$acc01,24,0,7
+       lbzx    $acc10,$Tbl1,$acc10
        rlwinm  $s2,$acc02,24,0,7
+       lbzx    $acc11,$Tbl1,$acc11
        rlwinm  $s3,$acc03,24,0,7
        lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
-       lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s0,$acc04,16,8,15
+       lbzx    $acc13,$Tbl1,$acc13
        rlwimi  $s1,$acc05,16,8,15
+       lbzx    $acc14,$Tbl1,$acc14
        rlwimi  $s2,$acc06,16,8,15
+       lbzx    $acc15,$Tbl1,$acc15
        rlwimi  $s3,$acc07,16,8,15
        rlwimi  $s0,$acc08,8,16,23
        rlwimi  $s1,$acc09,8,16,23
        rlwimi  $s2,$acc10,8,16,23
        rlwimi  $s3,$acc11,8,16,23
+       lwz     $t0,0($key)
        or      $s0,$s0,$acc12
+       lwz     $t1,4($key)
        or      $s1,$s1,$acc13
+       lwz     $t2,8($key)
        or      $s2,$s2,$acc14
+       lwz     $t3,12($key)
        or      $s3,$s3,$acc15
 
+       addi    $key,$key,16
+       bdz     Ldec_compact_done
+___
+$code.=<<___ if ($SIZE_T==8);
+       # vectorized permutation improves decrypt performance by 10%
+       insrdi  $s0,$s1,32,0
+       insrdi  $s2,$s3,32,0
+
+       and     $acc00,$s0,$mask80      # r1=r0&0x80808080
+       and     $acc02,$s2,$mask80
+       srdi    $acc04,$acc00,7         # r1>>7
+       srdi    $acc06,$acc02,7
+       andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
+       andc    $acc10,$s2,$mask80
+       sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
+       sub     $acc02,$acc02,$acc06
+       add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
+       add     $acc10,$acc10,$acc10
+       and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
+       and     $acc02,$acc02,$mask1b
+       xor     $acc00,$acc00,$acc08    # r2
+       xor     $acc02,$acc02,$acc10
+
+       and     $acc04,$acc00,$mask80   # r1=r2&0x80808080
+       and     $acc06,$acc02,$mask80
+       srdi    $acc08,$acc04,7         # r1>>7
+       srdi    $acc10,$acc06,7
+       andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
+       andc    $acc14,$acc02,$mask80
+       sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
+       sub     $acc06,$acc06,$acc10
+       add     $acc12,$acc12,$acc12    # (r2&0x7f7f7f7f)<<1
+       add     $acc14,$acc14,$acc14
+       and     $acc04,$acc04,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
+       and     $acc06,$acc06,$mask1b
+       xor     $acc04,$acc04,$acc12    # r4
+       xor     $acc06,$acc06,$acc14
+
+       and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
+       and     $acc10,$acc06,$mask80
+       srdi    $acc12,$acc08,7         # r1>>7
+       srdi    $acc14,$acc10,7
+       sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
+       sub     $acc10,$acc10,$acc14
+       andc    $acc12,$acc04,$mask80   # r4&0x7f7f7f7f
+       andc    $acc14,$acc06,$mask80
+       add     $acc12,$acc12,$acc12    # (r4&0x7f7f7f7f)<<1
+       add     $acc14,$acc14,$acc14
+       and     $acc08,$acc08,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
+       and     $acc10,$acc10,$mask1b
+       xor     $acc08,$acc08,$acc12    # r8
+       xor     $acc10,$acc10,$acc14
+
+       xor     $acc00,$acc00,$s0       # r2^r0
+       xor     $acc02,$acc02,$s2
+       xor     $acc04,$acc04,$s0       # r4^r0
+       xor     $acc06,$acc06,$s2
+
+       extrdi  $acc01,$acc00,32,0
+       extrdi  $acc03,$acc02,32,0
+       extrdi  $acc05,$acc04,32,0
+       extrdi  $acc07,$acc06,32,0
+       extrdi  $acc09,$acc08,32,0
+       extrdi  $acc11,$acc10,32,0
+___
+$code.=<<___ if ($SIZE_T==4);
        and     $acc00,$s0,$mask80      # r1=r0&0x80808080
        and     $acc01,$s1,$mask80
        and     $acc02,$s2,$mask80
        and     $acc03,$s3,$mask80
        srwi    $acc04,$acc00,7         # r1>>7
-       srwi    $acc05,$acc01,7
-       srwi    $acc06,$acc02,7
-       srwi    $acc07,$acc03,7
        andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
+       srwi    $acc05,$acc01,7
        andc    $acc09,$s1,$mask80
+       srwi    $acc06,$acc02,7
        andc    $acc10,$s2,$mask80
+       srwi    $acc07,$acc03,7
        andc    $acc11,$s3,$mask80
        sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
        sub     $acc01,$acc01,$acc05
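
In the 64-bit build the decrypt compact loop first replicates the two 32-bit masks into the upper register halves (insrdi $mask80,$mask80,32,0) and packs two state words per register (insrdi $s0,$s1,32,0), so the same xtime chain covers eight bytes per operation; this appears to be what the "vectorized permutation" comment above refers to. A sketch of the mask replication (illustrative only, assuming 64-bit Perl integers):

    my $mask80 = 0x80808080;
    my $mask1b = 0x1b1b1b1b;
    $mask80 |= $mask80 << 32;   # insrdi mask80,mask80,32,0 -> 0x8080808080808080
    $mask1b |= $mask1b << 32;   # insrdi mask1b,mask1b,32,0 -> 0x1b1b1b1b1b1b1b1b
    printf "%016x %016x\n", $mask80, $mask1b;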
@@ -1030,12 +1326,12 @@ Ldec_compact_loop:
        and     $acc06,$acc02,$mask80
        and     $acc07,$acc03,$mask80
        srwi    $acc08,$acc04,7         # r1>>7
-       srwi    $acc09,$acc05,7
-       srwi    $acc10,$acc06,7
-       srwi    $acc11,$acc07,7
        andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
+       srwi    $acc09,$acc05,7
        andc    $acc13,$acc01,$mask80
+       srwi    $acc10,$acc06,7
        andc    $acc14,$acc02,$mask80
+       srwi    $acc11,$acc07,7
        andc    $acc15,$acc03,$mask80
        sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
        sub     $acc05,$acc05,$acc09
@@ -1056,13 +1352,13 @@ Ldec_compact_loop:
 
        and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
        and     $acc09,$acc05,$mask80
-       and     $acc10,$acc06,$mask80
-       and     $acc11,$acc07,$mask80
        srwi    $acc12,$acc08,7         # r1>>7
+       and     $acc10,$acc06,$mask80
        srwi    $acc13,$acc09,7
+       and     $acc11,$acc07,$mask80
        srwi    $acc14,$acc10,7
-       srwi    $acc15,$acc11,7
        sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
+       srwi    $acc15,$acc11,7
        sub     $acc09,$acc09,$acc13
        sub     $acc10,$acc10,$acc14
        sub     $acc11,$acc11,$acc15
@@ -1091,12 +1387,14 @@ Ldec_compact_loop:
        xor     $acc05,$acc05,$s1
        xor     $acc06,$acc06,$s2
        xor     $acc07,$acc07,$s3
+___
+$code.=<<___;
        rotrwi  $s0,$s0,8               # = ROTATE(r0,8)
        rotrwi  $s1,$s1,8
-       rotrwi  $s2,$s2,8
-       rotrwi  $s3,$s3,8
        xor     $s0,$s0,$acc00          # ^= r2^r0
+       rotrwi  $s2,$s2,8
        xor     $s1,$s1,$acc01
+       rotrwi  $s3,$s3,8
        xor     $s2,$s2,$acc02
        xor     $s3,$s3,$acc03
        xor     $acc00,$acc00,$acc08
@@ -1104,108 +1402,54 @@ Ldec_compact_loop:
        xor     $acc02,$acc02,$acc10
        xor     $acc03,$acc03,$acc11
        xor     $s0,$s0,$acc04          # ^= r4^r0
-       xor     $s1,$s1,$acc05
-       xor     $s2,$s2,$acc06
-       xor     $s3,$s3,$acc07
        rotrwi  $acc00,$acc00,24
+       xor     $s1,$s1,$acc05
        rotrwi  $acc01,$acc01,24
+       xor     $s2,$s2,$acc06
        rotrwi  $acc02,$acc02,24
+       xor     $s3,$s3,$acc07
        rotrwi  $acc03,$acc03,24
        xor     $acc04,$acc04,$acc08
        xor     $acc05,$acc05,$acc09
        xor     $acc06,$acc06,$acc10
        xor     $acc07,$acc07,$acc11
        xor     $s0,$s0,$acc08          # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
-       xor     $s1,$s1,$acc09
-       xor     $s2,$s2,$acc10
-       xor     $s3,$s3,$acc11
        rotrwi  $acc04,$acc04,16
+       xor     $s1,$s1,$acc09
        rotrwi  $acc05,$acc05,16
+       xor     $s2,$s2,$acc10
        rotrwi  $acc06,$acc06,16
+       xor     $s3,$s3,$acc11
        rotrwi  $acc07,$acc07,16
        xor     $s0,$s0,$acc00          # ^= ROTATE(r8^r2^r0,24)
-       xor     $s1,$s1,$acc01
-       xor     $s2,$s2,$acc02
-       xor     $s3,$s3,$acc03
        rotrwi  $acc08,$acc08,8
+       xor     $s1,$s1,$acc01
        rotrwi  $acc09,$acc09,8
+       xor     $s2,$s2,$acc02
        rotrwi  $acc10,$acc10,8
+       xor     $s3,$s3,$acc03
        rotrwi  $acc11,$acc11,8
        xor     $s0,$s0,$acc04          # ^= ROTATE(r8^r4^r0,16)
        xor     $s1,$s1,$acc05
        xor     $s2,$s2,$acc06
        xor     $s3,$s3,$acc07
-       xor     $s0,$s0,$acc08          # ^= ROTATE(r8,8)       
-       xor     $s1,$s1,$acc09  
-       xor     $s2,$s2,$acc10  
-       xor     $s3,$s3,$acc11  
-
-       xor     $s0,$s0,$t0
-       xor     $s1,$s1,$t1
-       xor     $s2,$s2,$t2
-       xor     $s3,$s3,$t3
-
-       addi    $key,$key,16
-       bdnz-   Ldec_compact_loop
+       xor     $s0,$s0,$acc08          # ^= ROTATE(r8,8)
+       xor     $s1,$s1,$acc09
+       xor     $s2,$s2,$acc10
+       xor     $s3,$s3,$acc11
 
-       rlwinm  $acc00,$s0,`32-24`,24,31
-       rlwinm  $acc01,$s1,`32-24`,24,31
-       rlwinm  $acc02,$s2,`32-24`,24,31
-       rlwinm  $acc03,$s3,`32-24`,24,31
-       lwz     $t0,0($key)
-       lwz     $t1,4($key)
-       lwz     $t2,8($key)
-       lwz     $t3,12($key)
-       rlwinm  $acc04,$s3,`32-16`,24,31
-       rlwinm  $acc05,$s0,`32-16`,24,31
-       rlwinm  $acc06,$s1,`32-16`,24,31
-       rlwinm  $acc07,$s2,`32-16`,24,31
-       lbzx    $acc00,$Tbl1,$acc00
-       lbzx    $acc01,$Tbl1,$acc01
-       lbzx    $acc02,$Tbl1,$acc02
-       lbzx    $acc03,$Tbl1,$acc03
-       rlwinm  $acc08,$s2,`32-8`,24,31
-       rlwinm  $acc09,$s3,`32-8`,24,31
-       rlwinm  $acc10,$s0,`32-8`,24,31
-       rlwinm  $acc11,$s1,`32-8`,24,31
-       lbzx    $acc04,$Tbl1,$acc04
-       lbzx    $acc05,$Tbl1,$acc05
-       lbzx    $acc06,$Tbl1,$acc06
-       lbzx    $acc07,$Tbl1,$acc07
-       rlwinm  $acc12,$s1,`0`,24,31
-       rlwinm  $acc13,$s2,`0`,24,31
-       rlwinm  $acc14,$s3,`0`,24,31
-       rlwinm  $acc15,$s0,`0`,24,31
-       lbzx    $acc08,$Tbl1,$acc08
-       lbzx    $acc09,$Tbl1,$acc09
-       lbzx    $acc10,$Tbl1,$acc10
-       lbzx    $acc11,$Tbl1,$acc11
-       rlwinm  $s0,$acc00,24,0,7
-       rlwinm  $s1,$acc01,24,0,7
-       rlwinm  $s2,$acc02,24,0,7
-       rlwinm  $s3,$acc03,24,0,7
-       lbzx    $acc12,$Tbl1,$acc12
-       lbzx    $acc13,$Tbl1,$acc13
-       lbzx    $acc14,$Tbl1,$acc14
-       lbzx    $acc15,$Tbl1,$acc15
-       rlwimi  $s0,$acc04,16,8,15
-       rlwimi  $s1,$acc05,16,8,15
-       rlwimi  $s2,$acc06,16,8,15
-       rlwimi  $s3,$acc07,16,8,15
-       rlwimi  $s0,$acc08,8,16,23
-       rlwimi  $s1,$acc09,8,16,23
-       rlwimi  $s2,$acc10,8,16,23
-       rlwimi  $s3,$acc11,8,16,23
-       or      $s0,$s0,$acc12
-       or      $s1,$s1,$acc13
-       or      $s2,$s2,$acc14
-       or      $s3,$s3,$acc15
+       b       Ldec_compact_loop
+.align 4
+Ldec_compact_done:
        xor     $s0,$s0,$t0
        xor     $s1,$s1,$t1
        xor     $s2,$s2,$t2
        xor     $s3,$s3,$t3
        blr
-.long  0
+       .long   0
+       .byte   0,12,0x14,0,0,0,0,0
+.size  .AES_decrypt,.-.AES_decrypt
+
 .asciz "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
 .align 7
 ___