# ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
# 128-bit key, which is ~40% better than 64-bit code generated by gcc
# 4.0. But these are not the ones currently used! Their "compact"
-# counterparts are, for security reason. ppc_AES_crypt_compact runs at
-# 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3
-# of ppc_AES_decrypt.
+# counterparts are, for security reasons: the compact code indexes a
+# single 256-byte S-box instead of the 4KB tables, which limits
+# cache-timing leakage. ppc_AES_encrypt_compact runs at 1/2 of
+# ppc_AES_encrypt speed, while ppc_AES_decrypt_compact - at 1/3 of
+# ppc_AES_decrypt.
-$output = shift;
+# February 2010
+#
+# Rescheduling instructions to favour the Power6 pipeline gives 10%
+# performance improvement on the platform in question (and marginal
+# improvement even on others). It should be noted that Power6 fails
+# to process a byte in 18 cycles, taking 23 instead, because it cannot
+# issue 4 load instructions in two cycles, only in 3. As a result the
+# non-compact block subroutines are about 25% slower (23 vs. 18 cycles)
+# than one would expect. Compact functions scale better, because they
+# have a purely computational part, which scales perfectly with clock
+# frequency. To be specific, ppc_AES_encrypt_compact operates at 42
+# cycles per byte, while ppc_AES_decrypt_compact - at 55 (in a 64-bit
+# build).
+
+$flavour = shift;
-if ($output =~ /64\.s/) {
+if ($flavour =~ /64/) {
$SIZE_T =8;
$STU ="stdu";
$POP ="ld";
$PUSH ="std";
-} elsif ($output =~ /32\.s/) {
+} elsif ($flavour =~ /32/) {
$SIZE_T =4;
$STU ="stwu";
$POP ="lwz";
$PUSH ="stw";
-} else { die "nonsense $output"; }
+} else { die "nonsense $flavour"; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";
-( defined shift || open STDOUT,"| $^X $xlate $output" ) ||
- die "can't call $xlate: $!";
+open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";
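+# The flavour is the first command-line argument: anything matching /64/
+# selects the 64-bit ABI, anything matching /32/ the 32-bit one; the
+# flavour and the output file name are then handed to ppc-xlate.pl.
+# Typical invocations (the script name here is illustrative):
+#
+#	perl aes-ppc.pl linux64 aes-ppc.s
+#	perl aes-ppc.pl osx32   aes-ppc.s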
$FRAME=32*$SIZE_T;
$mask1b=$Tbl3;
$code.=<<___;
+.machine "any"
.text
.align 7
addi $sp,$sp,$FRAME
blr
-.align 4
+.align 5
Lppc_AES_encrypt:
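+	# Fully unrolled T-table round: each rlwinm turns a state byte into
+	# a scaled table index, lwzx fetches the corresponding 32-bit table
+	# word, and the four lookups per column are xor-ed into the round
+	# key words $t0..$t3.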
lwz $acc00,240($key)
lwz $t0,0($key)
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc04,$s1,`32-16+3`,21,28
rlwinm $acc05,$s2,`32-16+3`,21,28
+ lwz $t2,8($key)
+ lwz $t3,12($key)
rlwinm $acc06,$s3,`32-16+3`,21,28
rlwinm $acc07,$s0,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
- lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc08,$s2,`32-8+3`,21,28
rlwinm $acc09,$s3,`32-8+3`,21,28
+ lwzx $acc02,$Tbl0,$acc02
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
- lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc12,$s3,`0+3`,21,28
rlwinm $acc13,$s0,`0+3`,21,28
+ lwzx $acc06,$Tbl1,$acc06
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0+3`,21,28
rlwinm $acc15,$s2,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
- lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t0,$t0,$acc00
xor $t1,$t1,$acc01
+ lwzx $acc10,$Tbl2,$acc10
+ lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
- lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t0,$t0,$acc04
xor $t1,$t1,$acc05
+ lwzx $acc14,$Tbl3,$acc14
+ lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
addi $Tbl2,$Tbl0,2048
nop
- lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
- lwz $acc09,`2048+32`($Tbl0)
- lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
- lwz $acc08,`2048+128`($Tbl0)
- lwz $acc09,`2048+160`($Tbl0)
- lwz $acc10,`2048+192`($Tbl0)
- lwz $acc11,`2048+224`($Tbl0)
- rlwinm $acc00,$s0,`32-24`,24,31
- rlwinm $acc01,$s1,`32-24`,24,31
- rlwinm $acc02,$s2,`32-24`,24,31
- rlwinm $acc03,$s3,`32-24`,24,31
lwz $t0,0($key)
lwz $t1,4($key)
+ rlwinm $acc00,$s0,`32-24`,24,31
+ rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
+ rlwinm $acc02,$s2,`32-24`,24,31
+ rlwinm $acc03,$s3,`32-24`,24,31
+ lwz $acc08,`2048+0`($Tbl0) ! prefetch Te4
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s1,`32-16`,24,31
rlwinm $acc05,$s2,`32-16`,24,31
+ lwz $acc10,`2048+64`($Tbl0)
+ lwz $acc11,`2048+96`($Tbl0)
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
- lbzx $acc00,$Tbl2,$acc00
- lbzx $acc01,$Tbl2,$acc01
- lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
+ lwz $acc12,`2048+128`($Tbl0)
+ lwz $acc13,`2048+160`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
+ lwz $acc14,`2048+192`($Tbl0)
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
- lbzx $acc04,$Tbl2,$acc04
- lbzx $acc05,$Tbl2,$acc05
- lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
+ lbzx $acc00,$Tbl2,$acc00
+ lbzx $acc01,$Tbl2,$acc01
rlwinm $acc12,$s3,`0`,24,31
rlwinm $acc13,$s0,`0`,24,31
+ lbzx $acc02,$Tbl2,$acc02
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc14,$s1,`0`,24,31
rlwinm $acc15,$s2,`0`,24,31
- lbzx $acc08,$Tbl2,$acc08
- lbzx $acc09,$Tbl2,$acc09
- lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
+ lbzx $acc04,$Tbl2,$acc04
+ lbzx $acc05,$Tbl2,$acc05
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
+ lbzx $acc06,$Tbl2,$acc06
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
- lbzx $acc12,$Tbl2,$acc12
- lbzx $acc13,$Tbl2,$acc13
- lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
+ lbzx $acc08,$Tbl2,$acc08
+ lbzx $acc09,$Tbl2,$acc09
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
+ lbzx $acc10,$Tbl2,$acc10
+ lbzx $acc11,$Tbl2,$acc11
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
+ lbzx $acc12,$Tbl2,$acc12
+ lbzx $acc13,$Tbl2,$acc13
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
+ lbzx $acc14,$Tbl2,$acc14
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
or $s0,$s0,$acc12
addi $Tbl1,$Tbl0,2048
lis $mask80,0x8080
lis $mask1b,0x1b1b
- addi $acc00,$acc00,-1
addi $key,$key,16
ori $mask80,$mask80,0x8080
ori $mask1b,$mask1b,0x1b1b
+ mtctr $acc00
+.align 4
+Lenc_compact_loop:
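+	# Compact round: rlwinm isolates one state byte at a time, lbzx
+	# substitutes it through the 256-byte S-box at $Tbl1, and rlwimi/or
+	# reassemble the bytes with the ShiftRows permutation built into
+	# the source-word selection.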
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
- mtctr $acc00
-.align 4
-Lenc_compact_loop:
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc04,$s1,`32-16`,24,31
rlwinm $acc05,$s2,`32-16`,24,31
rlwinm $acc06,$s3,`32-16`,24,31
rlwinm $acc07,$s0,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
- lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl1,$acc02
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
- lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc12,$s3,`0`,24,31
rlwinm $acc13,$s0,`0`,24,31
+ lbzx $acc06,$Tbl1,$acc06
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s1,`0`,24,31
rlwinm $acc15,$s2,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
- lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl1,$acc10
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
- lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl1,$acc14
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
+ lwz $t0,0($key)
+ lwz $t1,4($key)
or $s0,$s0,$acc12
or $s1,$s1,$acc13
+ lwz $t2,8($key)
+ lwz $t3,12($key)
or $s2,$s2,$acc14
or $s3,$s3,$acc15
+ addi $key,$key,16
+ bdz Lenc_compact_done
+
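+	# The last round has no MixColumns, hence the bdz above skips the
+	# xtime chain that follows; Lenc_compact_done only adds the final
+	# round key.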
and $acc00,$s0,$mask80 # r1=r0&0x80808080
and $acc01,$s1,$mask80
and $acc02,$s2,$mask80
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
- xor $s0,$s0,$t0
- xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
-
- addi $key,$key,16
- bdnz- Lenc_compact_loop
-
- rlwinm $acc00,$s0,`32-24`,24,31
- rlwinm $acc01,$s1,`32-24`,24,31
- rlwinm $acc02,$s2,`32-24`,24,31
- rlwinm $acc03,$s3,`32-24`,24,31
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
- rlwinm $acc04,$s1,`32-16`,24,31
- rlwinm $acc05,$s2,`32-16`,24,31
- rlwinm $acc06,$s3,`32-16`,24,31
- rlwinm $acc07,$s0,`32-16`,24,31
- lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
- lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
- rlwinm $acc08,$s2,`32-8`,24,31
- rlwinm $acc09,$s3,`32-8`,24,31
- rlwinm $acc10,$s0,`32-8`,24,31
- rlwinm $acc11,$s1,`32-8`,24,31
- lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
- lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
- rlwinm $acc12,$s3,`0`,24,31
- rlwinm $acc13,$s0,`0`,24,31
- rlwinm $acc14,$s1,`0`,24,31
- rlwinm $acc15,$s2,`0`,24,31
- lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
- lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
- rlwinm $s0,$acc00,24,0,7
- rlwinm $s1,$acc01,24,0,7
- rlwinm $s2,$acc02,24,0,7
- rlwinm $s3,$acc03,24,0,7
- lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
- lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
- rlwimi $s0,$acc04,16,8,15
- rlwimi $s1,$acc05,16,8,15
- rlwimi $s2,$acc06,16,8,15
- rlwimi $s3,$acc07,16,8,15
- rlwimi $s0,$acc08,8,16,23
- rlwimi $s1,$acc09,8,16,23
- rlwimi $s2,$acc10,8,16,23
- rlwimi $s3,$acc11,8,16,23
- or $s0,$s0,$acc12
- or $s1,$s1,$acc13
- or $s2,$s2,$acc14
- or $s3,$s3,$acc15
+ b Lenc_compact_loop
+.align 4
+Lenc_compact_done:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
addi $sp,$sp,$FRAME
blr
-.align 4
+.align 5
Lppc_AES_decrypt:
lwz $acc00,240($key)
lwz $t0,0($key)
rlwinm $acc03,$s3,`32-24+3`,21,28
lwz $t0,0($key)
lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc04,$s3,`32-16+3`,21,28
rlwinm $acc05,$s0,`32-16+3`,21,28
+ lwz $t2,8($key)
+ lwz $t3,12($key)
rlwinm $acc06,$s1,`32-16+3`,21,28
rlwinm $acc07,$s2,`32-16+3`,21,28
lwzx $acc00,$Tbl0,$acc00
lwzx $acc01,$Tbl0,$acc01
- lwzx $acc02,$Tbl0,$acc02
- lwzx $acc03,$Tbl0,$acc03
rlwinm $acc08,$s2,`32-8+3`,21,28
rlwinm $acc09,$s3,`32-8+3`,21,28
+ lwzx $acc02,$Tbl0,$acc02
+ lwzx $acc03,$Tbl0,$acc03
rlwinm $acc10,$s0,`32-8+3`,21,28
rlwinm $acc11,$s1,`32-8+3`,21,28
lwzx $acc04,$Tbl1,$acc04
lwzx $acc05,$Tbl1,$acc05
- lwzx $acc06,$Tbl1,$acc06
- lwzx $acc07,$Tbl1,$acc07
rlwinm $acc12,$s1,`0+3`,21,28
rlwinm $acc13,$s2,`0+3`,21,28
+ lwzx $acc06,$Tbl1,$acc06
+ lwzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0+3`,21,28
rlwinm $acc15,$s0,`0+3`,21,28
lwzx $acc08,$Tbl2,$acc08
lwzx $acc09,$Tbl2,$acc09
- lwzx $acc10,$Tbl2,$acc10
- lwzx $acc11,$Tbl2,$acc11
xor $t0,$t0,$acc00
xor $t1,$t1,$acc01
+ lwzx $acc10,$Tbl2,$acc10
+ lwzx $acc11,$Tbl2,$acc11
xor $t2,$t2,$acc02
xor $t3,$t3,$acc03
lwzx $acc12,$Tbl3,$acc12
lwzx $acc13,$Tbl3,$acc13
- lwzx $acc14,$Tbl3,$acc14
- lwzx $acc15,$Tbl3,$acc15
xor $t0,$t0,$acc04
xor $t1,$t1,$acc05
+ lwzx $acc14,$Tbl3,$acc14
+ lwzx $acc15,$Tbl3,$acc15
xor $t2,$t2,$acc06
xor $t3,$t3,$acc07
xor $t0,$t0,$acc08
addi $Tbl2,$Tbl0,2048
nop
- lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
- lwz $acc09,`2048+32`($Tbl0)
- lwz $acc10,`2048+64`($Tbl0)
- lwz $acc11,`2048+96`($Tbl0)
- lwz $acc08,`2048+128`($Tbl0)
- lwz $acc09,`2048+160`($Tbl0)
- lwz $acc10,`2048+192`($Tbl0)
- lwz $acc11,`2048+224`($Tbl0)
- rlwinm $acc00,$s0,`32-24`,24,31
- rlwinm $acc01,$s1,`32-24`,24,31
- rlwinm $acc02,$s2,`32-24`,24,31
- rlwinm $acc03,$s3,`32-24`,24,31
lwz $t0,0($key)
lwz $t1,4($key)
+ rlwinm $acc00,$s0,`32-24`,24,31
+ rlwinm $acc01,$s1,`32-24`,24,31
lwz $t2,8($key)
lwz $t3,12($key)
+ rlwinm $acc02,$s2,`32-24`,24,31
+ rlwinm $acc03,$s3,`32-24`,24,31
+ lwz $acc08,`2048+0`($Tbl0) ! prefetch Td4
+ lwz $acc09,`2048+32`($Tbl0)
rlwinm $acc04,$s3,`32-16`,24,31
rlwinm $acc05,$s0,`32-16`,24,31
- rlwinm $acc06,$s1,`32-16`,24,31
- rlwinm $acc07,$s2,`32-16`,24,31
+ lwz $acc10,`2048+64`($Tbl0)
+ lwz $acc11,`2048+96`($Tbl0)
lbzx $acc00,$Tbl2,$acc00
lbzx $acc01,$Tbl2,$acc01
- lbzx $acc02,$Tbl2,$acc02
- lbzx $acc03,$Tbl2,$acc03
+ lwz $acc12,`2048+128`($Tbl0)
+ lwz $acc13,`2048+160`($Tbl0)
+ rlwinm $acc06,$s1,`32-16`,24,31
+ rlwinm $acc07,$s2,`32-16`,24,31
+ lwz $acc14,`2048+192`($Tbl0)
+ lwz $acc15,`2048+224`($Tbl0)
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl2,$acc02
+ lbzx $acc03,$Tbl2,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl2,$acc04
lbzx $acc05,$Tbl2,$acc05
- lbzx $acc06,$Tbl2,$acc06
- lbzx $acc07,$Tbl2,$acc07
rlwinm $acc12,$s1,`0`,24,31
rlwinm $acc13,$s2,`0`,24,31
+ lbzx $acc06,$Tbl2,$acc06
+ lbzx $acc07,$Tbl2,$acc07
rlwinm $acc14,$s3,`0`,24,31
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl2,$acc08
lbzx $acc09,$Tbl2,$acc09
- lbzx $acc10,$Tbl2,$acc10
- lbzx $acc11,$Tbl2,$acc11
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl2,$acc10
+ lbzx $acc11,$Tbl2,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl2,$acc12
lbzx $acc13,$Tbl2,$acc13
- lbzx $acc14,$Tbl2,$acc14
- lbzx $acc15,$Tbl2,$acc15
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl2,$acc14
+ lbzx $acc15,$Tbl2,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
addi $Tbl1,$Tbl0,2048
lis $mask80,0x8080
lis $mask1b,0x1b1b
- addi $acc00,$acc00,-1
addi $key,$key,16
ori $mask80,$mask80,0x8080
ori $mask1b,$mask1b,0x1b1b
+___
+$code.=<<___ if ($SIZE_T==8);
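+	# replicate the 32-bit masks into the upper word halves, so that
+	# the 64-bit InvMixColumns code below can process two state words
+	# per operation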
+ insrdi $mask80,$mask80,32,0
+ insrdi $mask1b,$mask1b,32,0
+___
+$code.=<<___;
+ mtctr $acc00
+.align 4
+Ldec_compact_loop:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2
xor $s3,$s3,$t3
- mtctr $acc00
-.align 4
-Ldec_compact_loop:
rlwinm $acc00,$s0,`32-24`,24,31
rlwinm $acc01,$s1,`32-24`,24,31
rlwinm $acc02,$s2,`32-24`,24,31
rlwinm $acc03,$s3,`32-24`,24,31
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
rlwinm $acc04,$s3,`32-16`,24,31
rlwinm $acc05,$s0,`32-16`,24,31
rlwinm $acc06,$s1,`32-16`,24,31
rlwinm $acc07,$s2,`32-16`,24,31
lbzx $acc00,$Tbl1,$acc00
lbzx $acc01,$Tbl1,$acc01
- lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
rlwinm $acc08,$s2,`32-8`,24,31
rlwinm $acc09,$s3,`32-8`,24,31
+ lbzx $acc02,$Tbl1,$acc02
+ lbzx $acc03,$Tbl1,$acc03
rlwinm $acc10,$s0,`32-8`,24,31
rlwinm $acc11,$s1,`32-8`,24,31
lbzx $acc04,$Tbl1,$acc04
lbzx $acc05,$Tbl1,$acc05
- lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
rlwinm $acc12,$s1,`0`,24,31
rlwinm $acc13,$s2,`0`,24,31
+ lbzx $acc06,$Tbl1,$acc06
+ lbzx $acc07,$Tbl1,$acc07
rlwinm $acc14,$s3,`0`,24,31
rlwinm $acc15,$s0,`0`,24,31
lbzx $acc08,$Tbl1,$acc08
lbzx $acc09,$Tbl1,$acc09
- lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
rlwinm $s0,$acc00,24,0,7
rlwinm $s1,$acc01,24,0,7
+ lbzx $acc10,$Tbl1,$acc10
+ lbzx $acc11,$Tbl1,$acc11
rlwinm $s2,$acc02,24,0,7
rlwinm $s3,$acc03,24,0,7
lbzx $acc12,$Tbl1,$acc12
lbzx $acc13,$Tbl1,$acc13
- lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
rlwimi $s0,$acc04,16,8,15
rlwimi $s1,$acc05,16,8,15
+ lbzx $acc14,$Tbl1,$acc14
+ lbzx $acc15,$Tbl1,$acc15
rlwimi $s2,$acc06,16,8,15
rlwimi $s3,$acc07,16,8,15
rlwimi $s0,$acc08,8,16,23
rlwimi $s1,$acc09,8,16,23
rlwimi $s2,$acc10,8,16,23
rlwimi $s3,$acc11,8,16,23
+ lwz $t0,0($key)
+ lwz $t1,4($key)
or $s0,$s0,$acc12
or $s1,$s1,$acc13
+ lwz $t2,8($key)
+ lwz $t3,12($key)
or $s2,$s2,$acc14
or $s3,$s3,$acc15
+ addi $key,$key,16
+ bdz Ldec_compact_done
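+	# as in the encrypt path, the final round skips (Inv)MixColumns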
+___
+$code.=<<___ if ($SIZE_T==8);
+ # vectorized permutation improves decrypt performance by 10%
+ insrdi $s0,$s1,32,0
+ insrdi $s2,$s3,32,0
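+	# two state words per 64-bit register: eight state bytes per op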
+
+ and $acc00,$s0,$mask80 # r1=r0&0x80808080
+ and $acc02,$s2,$mask80
+ srdi $acc04,$acc00,7 # r1>>7
+ srdi $acc06,$acc02,7
+ andc $acc08,$s0,$mask80 # r0&0x7f7f7f7f
+ andc $acc10,$s2,$mask80
+ sub $acc00,$acc00,$acc04 # r1-(r1>>7)
+ sub $acc02,$acc02,$acc06
+ add $acc08,$acc08,$acc08 # (r0&0x7f7f7f7f)<<1
+ add $acc10,$acc10,$acc10
+ and $acc00,$acc00,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc02,$acc02,$mask1b
+ xor $acc00,$acc00,$acc08 # r2
+ xor $acc02,$acc02,$acc10
+
+ and $acc04,$acc00,$mask80 # r1=r2&0x80808080
+ and $acc06,$acc02,$mask80
+ srdi $acc08,$acc04,7 # r1>>7
+ srdi $acc10,$acc06,7
+ andc $acc12,$acc00,$mask80 # r2&0x7f7f7f7f
+ andc $acc14,$acc02,$mask80
+ sub $acc04,$acc04,$acc08 # r1-(r1>>7)
+ sub $acc06,$acc06,$acc10
+ add $acc12,$acc12,$acc12 # (r2&0x7f7f7f7f)<<1
+ add $acc14,$acc14,$acc14
+ and $acc04,$acc04,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc06,$acc06,$mask1b
+ xor $acc04,$acc04,$acc12 # r4
+ xor $acc06,$acc06,$acc14
+
+ and $acc08,$acc04,$mask80 # r1=r4&0x80808080
+ and $acc10,$acc06,$mask80
+ srdi $acc12,$acc08,7 # r1>>7
+ srdi $acc14,$acc10,7
+ sub $acc08,$acc08,$acc12 # r1-(r1>>7)
+ sub $acc10,$acc10,$acc14
+ andc $acc12,$acc04,$mask80 # r4&0x7f7f7f7f
+ andc $acc14,$acc06,$mask80
+ add $acc12,$acc12,$acc12 # (r4&0x7f7f7f7f)<<1
+ add $acc14,$acc14,$acc14
+ and $acc08,$acc08,$mask1b # (r1-(r1>>7))&0x1b1b1b1b
+ and $acc10,$acc10,$mask1b
+ xor $acc08,$acc08,$acc12 # r8
+ xor $acc10,$acc10,$acc14
+
+ xor $acc00,$acc00,$s0 # r2^r0
+ xor $acc02,$acc02,$s2
+ xor $acc04,$acc04,$s0 # r4^r0
+ xor $acc06,$acc06,$s2
+
+ extrdi $acc01,$acc00,32,0
+ extrdi $acc03,$acc02,32,0
+ extrdi $acc05,$acc04,32,0
+ extrdi $acc07,$acc06,32,0
+ extrdi $acc09,$acc08,32,0
+ extrdi $acc11,$acc10,32,0
+___
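+
+# The three mask80/mask1b chains above are branchless GF(2^8) doublings:
+# starting from the state r0 they produce r2=2*r0, r4=4*r0 and r8=8*r0,
+# from which the InvMixColumns coefficients 9, 11, 13 and 14 are then
+# assembled (9=8+1, 11=8+2+1, 13=8+4+1, 14=8+4+2). A minimal Perl sketch
+# of one doubling step on four packed bytes (illustrative only; the real
+# code above does two such words per 64-bit register):
+#
+#	sub xtime_packed {
+#		my ($r0) = @_;			# four AES bytes in one 32-bit word
+#		my $r1 = $r0 & 0x80808080;	# high bit of every byte
+#		my $lo = ($r0 & 0x7f7f7f7f) << 1;	# per-byte left shift
+#		# $r1-($r1>>7) turns each 0x80 into 0x7f without crossing
+#		# byte lanes, so the &0x1b1b1b1b below xors 0x1b exactly
+#		# where a high bit was set
+#		return $lo ^ (($r1 - ($r1 >> 7)) & 0x1b1b1b1b);
+#	}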
+$code.=<<___ if ($SIZE_T==4);
and $acc00,$s0,$mask80 # r1=r0&0x80808080
and $acc01,$s1,$mask80
and $acc02,$s2,$mask80
xor $acc05,$acc05,$s1
xor $acc06,$acc06,$s2
xor $acc07,$acc07,$s3
+___
+$code.=<<___;
rotrwi $s0,$s0,8 # = ROTATE(r0,8)
rotrwi $s1,$s1,8
rotrwi $s2,$s2,8
xor $s2,$s2,$acc10
xor $s3,$s3,$acc11
- xor $s0,$s0,$t0
- xor $s1,$s1,$t1
- xor $s2,$s2,$t2
- xor $s3,$s3,$t3
-
- addi $key,$key,16
- bdnz- Ldec_compact_loop
-
- rlwinm $acc00,$s0,`32-24`,24,31
- rlwinm $acc01,$s1,`32-24`,24,31
- rlwinm $acc02,$s2,`32-24`,24,31
- rlwinm $acc03,$s3,`32-24`,24,31
- lwz $t0,0($key)
- lwz $t1,4($key)
- lwz $t2,8($key)
- lwz $t3,12($key)
- rlwinm $acc04,$s3,`32-16`,24,31
- rlwinm $acc05,$s0,`32-16`,24,31
- rlwinm $acc06,$s1,`32-16`,24,31
- rlwinm $acc07,$s2,`32-16`,24,31
- lbzx $acc00,$Tbl1,$acc00
- lbzx $acc01,$Tbl1,$acc01
- lbzx $acc02,$Tbl1,$acc02
- lbzx $acc03,$Tbl1,$acc03
- rlwinm $acc08,$s2,`32-8`,24,31
- rlwinm $acc09,$s3,`32-8`,24,31
- rlwinm $acc10,$s0,`32-8`,24,31
- rlwinm $acc11,$s1,`32-8`,24,31
- lbzx $acc04,$Tbl1,$acc04
- lbzx $acc05,$Tbl1,$acc05
- lbzx $acc06,$Tbl1,$acc06
- lbzx $acc07,$Tbl1,$acc07
- rlwinm $acc12,$s1,`0`,24,31
- rlwinm $acc13,$s2,`0`,24,31
- rlwinm $acc14,$s3,`0`,24,31
- rlwinm $acc15,$s0,`0`,24,31
- lbzx $acc08,$Tbl1,$acc08
- lbzx $acc09,$Tbl1,$acc09
- lbzx $acc10,$Tbl1,$acc10
- lbzx $acc11,$Tbl1,$acc11
- rlwinm $s0,$acc00,24,0,7
- rlwinm $s1,$acc01,24,0,7
- rlwinm $s2,$acc02,24,0,7
- rlwinm $s3,$acc03,24,0,7
- lbzx $acc12,$Tbl1,$acc12
- lbzx $acc13,$Tbl1,$acc13
- lbzx $acc14,$Tbl1,$acc14
- lbzx $acc15,$Tbl1,$acc15
- rlwimi $s0,$acc04,16,8,15
- rlwimi $s1,$acc05,16,8,15
- rlwimi $s2,$acc06,16,8,15
- rlwimi $s3,$acc07,16,8,15
- rlwimi $s0,$acc08,8,16,23
- rlwimi $s1,$acc09,8,16,23
- rlwimi $s2,$acc10,8,16,23
- rlwimi $s3,$acc11,8,16,23
- or $s0,$s0,$acc12
- or $s1,$s1,$acc13
- or $s2,$s2,$acc14
- or $s3,$s3,$acc15
+ b Ldec_compact_loop
+.align 4
+Ldec_compact_done:
xor $s0,$s0,$t0
xor $s1,$s1,$t1
xor $s2,$s2,$t2