10% performance tweak in 64-bit mode.
[openssl.git] / crypto / aes / asm / aes-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Needs more work: key setup, page boundaries, CBC routine...
11 #
12 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
14 # 4.0. But these are not the ones currently used! Their "compact"
15 # counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16 # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17 # at 1/3 of ppc_AES_decrypt.
18
# ---------------------------------------------------------------------
# Command line:  <output> [<anything>]
#
#   <output>    name of the assembler file to produce.  The name also
#               selects the ABI: it has to contain "64.s" or "32.s",
#               otherwise the script dies.
#   <anything>  when a second argument is present STDOUT is left alone
#               and the ppc-xlate.pl filter is not started.
# ---------------------------------------------------------------------
$output = shift;

# Pointer size and the matching store-with-update / load / store
# mnemonics for the selected ABI.
if ($output =~ /64\.s/) {
        $SIZE_T =8;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
} elsif ($output =~ /32\.s/) {
        $SIZE_T =4;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
} else { die "nonsense $output"; }

# Directory part of this script's own path, including the trailing
# separator.  Falls back to the empty string when the script was invoked
# without a directory component, instead of reading $1 after a failed
# match (which would leave $dir undefined or stale).
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

# ppc-xlate.pl is looked for next to this script first, then in the
# shared perlasm directory.
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe STDOUT through the translator unless a second argument was given.
# Interpreter, script and output names are quoted so that paths
# containing blanks survive the shell.
( defined shift || open STDOUT,"| \"$^X\" \"$xlate\" \"$output\"" ) ||
        die "can't call $xlate: $!";

# Stack frame size used by the generated routines: 32 slots of $SIZE_T
# bytes each.
$FRAME=32*$SIZE_T;
42
# Append every 32-bit word in the argument list to the global $code as a
# ".long w,w" directive, i.e. each value is emitted twice in a row.
# Processing stops at the first undefined argument.
#
# The sub used to be declared with an empty "()" prototype although it
# consumes its arguments; that only worked because the call sites use the
# "&_data_word(...)" form, which bypasses prototype checking.  The
# prototype is dropped so the declaration matches the actual usage;
# existing "&_data_word(...)" calls behave exactly as before.
sub _data_word
{
    while (defined(my $word = shift)) {
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
47
# Symbolic register names used throughout the generated assembly.

# Stack pointer, TOC pointer and the three routine arguments: r1..r5.
($sp, $toc, $inp, $out, $key) = map { "r$_" } (1 .. 5);

# Table pointers.  $Tbl0 shares r3 with $inp, and $Tbl3 starts out on
# r2, the same register as $toc.
($Tbl0, $Tbl1, $Tbl2, $Tbl3) = ("r3", "r6", "r7", "r2");

# Four state words followed by four temporaries: r8..r15.
($s0, $s1, $s2, $s3, $t0, $t1, $t2, $t3) = map { "r$_" } (8 .. 15);

# Sixteen accumulators: r16..r31.
($acc00, $acc01, $acc02, $acc03,
 $acc04, $acc05, $acc06, $acc07,
 $acc08, $acc09, $acc10, $acc11,
 $acc12, $acc13, $acc14, $acc15) = map { "r$_" } (16 .. 31);

# stay away from TLS pointer
# 64-bit: $t1 is moved from r13 to r0.  Otherwise: $Tbl3 takes over the
# register of $t0, and $t0 moves to r0.  The bare die calls are sanity
# checks that the assignments above still match these assumptions.
if ($SIZE_T != 8) {
        die if ($Tbl3 ne "r2");
        $Tbl3 = $t0;
        $t0   = "r0";
} else {
        die if ($t1 ne "r13");
        $t1   = "r0";
}

# The mask registers reuse the registers of $Tbl2 and $Tbl3.
($mask80, $mask1b) = ($Tbl2, $Tbl3);
94
95 $code.=<<___;
96 .text
97
98 .align  7
99 LAES_Te:
100         mflr    r0
101         bcl     20,31,\$+4
102         mflr    $Tbl0   ;    vvvvv "distance" between . and 1st data entry
103         addi    $Tbl0,$Tbl0,`128-8`
104         mtlr    r0
105         blr
106         .space  `32-24`
107 LAES_Td:
108         mflr    r0
109         bcl     20,31,\$+4
110         mflr    $Tbl0   ;    vvvvvvvv "distance" between . and 1st data entry
111         addi    $Tbl0,$Tbl0,`128-8-32+2048+256`
112         mtlr    r0
113         blr
114         .space  `128-32-24`
115 ___
116 &_data_word(
117         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
118         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
119         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
120         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
121         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
122         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
123         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
124         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
125         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
126         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
127         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
128         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
129         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
130         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
131         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
132         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
133         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
134         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
135         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
136         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
137         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
138         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
139         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
140         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
141         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
142         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
143         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
144         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
145         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
146         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
147         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
148         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
149         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
150         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
151         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
152         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
153         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
154         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
155         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
156         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
157         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
158         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
159         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
160         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
161         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
162         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
163         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
164         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
165         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
166         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
167         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
168         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
169         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
170         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
171         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
172         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
173         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
174         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
175         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
176         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
177         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
178         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
179         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
180         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
181 $code.=<<___;
182 .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
183 .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
184 .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
185 .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
186 .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
187 .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
188 .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
189 .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
190 .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
191 .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
192 .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
193 .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
194 .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
195 .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
196 .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
197 .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
198 .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
199 .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
200 .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
201 .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
202 .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
203 .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
204 .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
205 .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
206 .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
207 .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
208 .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
209 .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
210 .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
211 .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
212 .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
213 .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
214 ___
215 &_data_word(
216         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
217         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
218         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
219         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
220         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
221         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
222         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
223         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
224         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
225         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
226         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
227         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
228         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
229         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
230         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
231         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
232         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
233         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
234         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
235         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
236         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
237         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
238         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
239         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
240         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
241         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
242         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
243         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
244         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
245         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
246         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
247         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
248         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
249         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
250         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
251         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
252         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
253         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
254         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
255         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
256         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
257         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
258         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
259         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
260         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
261         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
262         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
263         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
264         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
265         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
266         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
267         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
268         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
269         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
270         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
271         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
272         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
273         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
274         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
275         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
276         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
277         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
278         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
279         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
280 $code.=<<___;
281 .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
282 .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
283 .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
284 .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
285 .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
286 .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
287 .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
288 .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
289 .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
290 .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
291 .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
292 .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
293 .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
294 .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
295 .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
296 .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
297 .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
298 .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
299 .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
300 .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
301 .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
302 .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
303 .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
304 .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
305 .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
306 .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
307 .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
308 .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
309 .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
310 .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
311 .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
312 .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
313
314
315 .globl  .AES_encrypt
316 .align  7
317 .AES_encrypt:
318         mflr    r0
319         $STU    $sp,-$FRAME($sp)
320
321         $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
322         $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
323         $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
324         $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
325         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
326         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
327         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
328         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
329         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
330         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
331         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
332         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
333         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
334         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
335         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
336         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
337         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
338         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
339         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
340         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
341         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
342
343         lwz     $s0,0($inp)
344         lwz     $s1,4($inp)
345         lwz     $s2,8($inp)
346         lwz     $s3,12($inp)
347         bl      LAES_Te
348         bl      Lppc_AES_encrypt_compact
349         stw     $s0,0($out)
350         stw     $s1,4($out)
351         stw     $s2,8($out)
352         stw     $s3,12($out)
353
354         $POP    r0,`$FRAME-$SIZE_T*21`($sp)
355         $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
356         $POP    r13,`$FRAME-$SIZE_T*19`($sp)
357         $POP    r14,`$FRAME-$SIZE_T*18`($sp)
358         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
359         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
360         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
361         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
362         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
363         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
364         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
365         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
366         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
367         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
368         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
369         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
370         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
371         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
372         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
373         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
374         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
375         mtlr    r0
376         addi    $sp,$sp,$FRAME
377         blr
378
379 .align  4
380 Lppc_AES_encrypt:
381         lwz     $acc00,240($key)
382         lwz     $t0,0($key)
383         lwz     $t1,4($key)
384         lwz     $t2,8($key)
385         lwz     $t3,12($key)
386         addi    $Tbl1,$Tbl0,3
387         addi    $Tbl2,$Tbl0,2
388         addi    $Tbl3,$Tbl0,1
389         addi    $acc00,$acc00,-1
390         addi    $key,$key,16
391         xor     $s0,$s0,$t0
392         xor     $s1,$s1,$t1
393         xor     $s2,$s2,$t2
394         xor     $s3,$s3,$t3
395         mtctr   $acc00
396 .align  4
397 Lenc_loop:
398         rlwinm  $acc00,$s0,`32-24+3`,21,28
399         rlwinm  $acc01,$s1,`32-24+3`,21,28
400         rlwinm  $acc02,$s2,`32-24+3`,21,28
401         rlwinm  $acc03,$s3,`32-24+3`,21,28
402         lwz     $t0,0($key)
403         lwz     $t1,4($key)
404         lwz     $t2,8($key)
405         lwz     $t3,12($key)
406         rlwinm  $acc04,$s1,`32-16+3`,21,28
407         rlwinm  $acc05,$s2,`32-16+3`,21,28
408         rlwinm  $acc06,$s3,`32-16+3`,21,28
409         rlwinm  $acc07,$s0,`32-16+3`,21,28
410         lwzx    $acc00,$Tbl0,$acc00
411         lwzx    $acc01,$Tbl0,$acc01
412         lwzx    $acc02,$Tbl0,$acc02
413         lwzx    $acc03,$Tbl0,$acc03
414         rlwinm  $acc08,$s2,`32-8+3`,21,28
415         rlwinm  $acc09,$s3,`32-8+3`,21,28
416         rlwinm  $acc10,$s0,`32-8+3`,21,28
417         rlwinm  $acc11,$s1,`32-8+3`,21,28
418         lwzx    $acc04,$Tbl1,$acc04
419         lwzx    $acc05,$Tbl1,$acc05
420         lwzx    $acc06,$Tbl1,$acc06
421         lwzx    $acc07,$Tbl1,$acc07
422         rlwinm  $acc12,$s3,`0+3`,21,28
423         rlwinm  $acc13,$s0,`0+3`,21,28
424         rlwinm  $acc14,$s1,`0+3`,21,28
425         rlwinm  $acc15,$s2,`0+3`,21,28
426         lwzx    $acc08,$Tbl2,$acc08
427         lwzx    $acc09,$Tbl2,$acc09
428         lwzx    $acc10,$Tbl2,$acc10
429         lwzx    $acc11,$Tbl2,$acc11
430         xor     $t0,$t0,$acc00
431         xor     $t1,$t1,$acc01
432         xor     $t2,$t2,$acc02
433         xor     $t3,$t3,$acc03
434         lwzx    $acc12,$Tbl3,$acc12
435         lwzx    $acc13,$Tbl3,$acc13
436         lwzx    $acc14,$Tbl3,$acc14
437         lwzx    $acc15,$Tbl3,$acc15
438         xor     $t0,$t0,$acc04
439         xor     $t1,$t1,$acc05
440         xor     $t2,$t2,$acc06
441         xor     $t3,$t3,$acc07
442         xor     $t0,$t0,$acc08
443         xor     $t1,$t1,$acc09
444         xor     $t2,$t2,$acc10
445         xor     $t3,$t3,$acc11
446         xor     $s0,$t0,$acc12
447         xor     $s1,$t1,$acc13
448         xor     $s2,$t2,$acc14
449         xor     $s3,$t3,$acc15
450         addi    $key,$key,16
451         bdnz-   Lenc_loop
452
453         addi    $Tbl2,$Tbl0,2048
454         nop
455         lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Te4
456         lwz     $acc09,`2048+32`($Tbl0)
457         lwz     $acc10,`2048+64`($Tbl0)
458         lwz     $acc11,`2048+96`($Tbl0)
459         lwz     $acc08,`2048+128`($Tbl0)
460         lwz     $acc09,`2048+160`($Tbl0)
461         lwz     $acc10,`2048+192`($Tbl0)
462         lwz     $acc11,`2048+224`($Tbl0)
463         rlwinm  $acc00,$s0,`32-24`,24,31
464         rlwinm  $acc01,$s1,`32-24`,24,31
465         rlwinm  $acc02,$s2,`32-24`,24,31
466         rlwinm  $acc03,$s3,`32-24`,24,31
467         lwz     $t0,0($key)
468         lwz     $t1,4($key)
469         lwz     $t2,8($key)
470         lwz     $t3,12($key)
471         rlwinm  $acc04,$s1,`32-16`,24,31
472         rlwinm  $acc05,$s2,`32-16`,24,31
473         rlwinm  $acc06,$s3,`32-16`,24,31
474         rlwinm  $acc07,$s0,`32-16`,24,31
475         lbzx    $acc00,$Tbl2,$acc00
476         lbzx    $acc01,$Tbl2,$acc01
477         lbzx    $acc02,$Tbl2,$acc02
478         lbzx    $acc03,$Tbl2,$acc03
479         rlwinm  $acc08,$s2,`32-8`,24,31
480         rlwinm  $acc09,$s3,`32-8`,24,31
481         rlwinm  $acc10,$s0,`32-8`,24,31
482         rlwinm  $acc11,$s1,`32-8`,24,31
483         lbzx    $acc04,$Tbl2,$acc04
484         lbzx    $acc05,$Tbl2,$acc05
485         lbzx    $acc06,$Tbl2,$acc06
486         lbzx    $acc07,$Tbl2,$acc07
487         rlwinm  $acc12,$s3,`0`,24,31
488         rlwinm  $acc13,$s0,`0`,24,31
489         rlwinm  $acc14,$s1,`0`,24,31
490         rlwinm  $acc15,$s2,`0`,24,31
491         lbzx    $acc08,$Tbl2,$acc08
492         lbzx    $acc09,$Tbl2,$acc09
493         lbzx    $acc10,$Tbl2,$acc10
494         lbzx    $acc11,$Tbl2,$acc11
495         rlwinm  $s0,$acc00,24,0,7
496         rlwinm  $s1,$acc01,24,0,7
497         rlwinm  $s2,$acc02,24,0,7
498         rlwinm  $s3,$acc03,24,0,7
499         lbzx    $acc12,$Tbl2,$acc12
500         lbzx    $acc13,$Tbl2,$acc13
501         lbzx    $acc14,$Tbl2,$acc14
502         lbzx    $acc15,$Tbl2,$acc15
503         rlwimi  $s0,$acc04,16,8,15
504         rlwimi  $s1,$acc05,16,8,15
505         rlwimi  $s2,$acc06,16,8,15
506         rlwimi  $s3,$acc07,16,8,15
507         rlwimi  $s0,$acc08,8,16,23
508         rlwimi  $s1,$acc09,8,16,23
509         rlwimi  $s2,$acc10,8,16,23
510         rlwimi  $s3,$acc11,8,16,23
511         or      $s0,$s0,$acc12
512         or      $s1,$s1,$acc13
513         or      $s2,$s2,$acc14
514         or      $s3,$s3,$acc15
515         xor     $s0,$s0,$t0
516         xor     $s1,$s1,$t1
517         xor     $s2,$s2,$t2
518         xor     $s3,$s3,$t3
519         blr
520
521 .align  4
522 Lppc_AES_encrypt_compact:
523         lwz     $acc00,240($key)
524         lwz     $t0,0($key)
525         lwz     $t1,4($key)
526         lwz     $t2,8($key)
527         lwz     $t3,12($key)
528         addi    $Tbl1,$Tbl0,2048
529         lis     $mask80,0x8080
530         lis     $mask1b,0x1b1b
531         addi    $key,$key,16
532         ori     $mask80,$mask80,0x8080
533         ori     $mask1b,$mask1b,0x1b1b
534         mtctr   $acc00
535 .align  4
536 Lenc_compact_loop:
537         xor     $s0,$s0,$t0
538         xor     $s1,$s1,$t1
539         xor     $s2,$s2,$t2
540         xor     $s3,$s3,$t3
541         rlwinm  $acc00,$s0,`32-24`,24,31
542         rlwinm  $acc01,$s1,`32-24`,24,31
543         rlwinm  $acc02,$s2,`32-24`,24,31
544         rlwinm  $acc03,$s3,`32-24`,24,31
545         lwz     $t0,0($key)
546         lwz     $t1,4($key)
547         lwz     $t2,8($key)
548         lwz     $t3,12($key)
549         rlwinm  $acc04,$s1,`32-16`,24,31
550         rlwinm  $acc05,$s2,`32-16`,24,31
551         rlwinm  $acc06,$s3,`32-16`,24,31
552         rlwinm  $acc07,$s0,`32-16`,24,31
553         lbzx    $acc00,$Tbl1,$acc00
554         lbzx    $acc01,$Tbl1,$acc01
555         lbzx    $acc02,$Tbl1,$acc02
556         lbzx    $acc03,$Tbl1,$acc03
557         rlwinm  $acc08,$s2,`32-8`,24,31
558         rlwinm  $acc09,$s3,`32-8`,24,31
559         rlwinm  $acc10,$s0,`32-8`,24,31
560         rlwinm  $acc11,$s1,`32-8`,24,31
561         lbzx    $acc04,$Tbl1,$acc04
562         lbzx    $acc05,$Tbl1,$acc05
563         lbzx    $acc06,$Tbl1,$acc06
564         lbzx    $acc07,$Tbl1,$acc07
565         rlwinm  $acc12,$s3,`0`,24,31
566         rlwinm  $acc13,$s0,`0`,24,31
567         rlwinm  $acc14,$s1,`0`,24,31
568         rlwinm  $acc15,$s2,`0`,24,31
569         lbzx    $acc08,$Tbl1,$acc08
570         lbzx    $acc09,$Tbl1,$acc09
571         lbzx    $acc10,$Tbl1,$acc10
572         lbzx    $acc11,$Tbl1,$acc11
573         rlwinm  $s0,$acc00,24,0,7
574         rlwinm  $s1,$acc01,24,0,7
575         rlwinm  $s2,$acc02,24,0,7
576         rlwinm  $s3,$acc03,24,0,7
577         lbzx    $acc12,$Tbl1,$acc12
578         lbzx    $acc13,$Tbl1,$acc13
579         lbzx    $acc14,$Tbl1,$acc14
580         lbzx    $acc15,$Tbl1,$acc15
581         rlwimi  $s0,$acc04,16,8,15
582         rlwimi  $s1,$acc05,16,8,15
583         rlwimi  $s2,$acc06,16,8,15
584         rlwimi  $s3,$acc07,16,8,15
585         rlwimi  $s0,$acc08,8,16,23
586         rlwimi  $s1,$acc09,8,16,23
587         rlwimi  $s2,$acc10,8,16,23
588         rlwimi  $s3,$acc11,8,16,23
589         or      $s0,$s0,$acc12
590         or      $s1,$s1,$acc13
591         or      $s2,$s2,$acc14
592         or      $s3,$s3,$acc15
593
594         addi    $key,$key,16
595         bdz     Lenc_compact_done
596
597         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
598         and     $acc01,$s1,$mask80
599         and     $acc02,$s2,$mask80
600         and     $acc03,$s3,$mask80
601         srwi    $acc04,$acc00,7         # r1>>7
602         srwi    $acc05,$acc01,7
603         srwi    $acc06,$acc02,7
604         srwi    $acc07,$acc03,7
605         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
606         andc    $acc09,$s1,$mask80
607         andc    $acc10,$s2,$mask80
608         andc    $acc11,$s3,$mask80
609         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
610         sub     $acc01,$acc01,$acc05
611         sub     $acc02,$acc02,$acc06
612         sub     $acc03,$acc03,$acc07
613         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
614         add     $acc09,$acc09,$acc09
615         add     $acc10,$acc10,$acc10
616         add     $acc11,$acc11,$acc11
617         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
618         and     $acc01,$acc01,$mask1b
619         and     $acc02,$acc02,$mask1b
620         and     $acc03,$acc03,$mask1b
621         xor     $acc00,$acc00,$acc08    # r2
622         xor     $acc01,$acc01,$acc09
623         xor     $acc02,$acc02,$acc10
624         xor     $acc03,$acc03,$acc11
625
626         rotlwi  $acc12,$s0,16           # ROTATE(r0,16)
627         rotlwi  $acc13,$s1,16
628         rotlwi  $acc14,$s2,16
629         rotlwi  $acc15,$s3,16
630         xor     $s0,$s0,$acc00          # r0^r2
631         xor     $s1,$s1,$acc01
632         xor     $s2,$s2,$acc02
633         xor     $s3,$s3,$acc03
634         rotrwi  $s0,$s0,24              # ROTATE(r2^r0,24)
635         rotrwi  $s1,$s1,24
636         rotrwi  $s2,$s2,24
637         rotrwi  $s3,$s3,24
638         xor     $s0,$s0,$acc00          # ROTATE(r2^r0,24)^r2
639         xor     $s1,$s1,$acc01
640         xor     $s2,$s2,$acc02
641         xor     $s3,$s3,$acc03
642         rotlwi  $acc08,$acc12,8         # ROTATE(r0,24)
643         rotlwi  $acc09,$acc13,8
644         rotlwi  $acc10,$acc14,8
645         rotlwi  $acc11,$acc15,8
646         xor     $s0,$s0,$acc12          #
647         xor     $s1,$s1,$acc13
648         xor     $s2,$s2,$acc14
649         xor     $s3,$s3,$acc15
650         xor     $s0,$s0,$acc08          #
651         xor     $s1,$s1,$acc09
652         xor     $s2,$s2,$acc10
653         xor     $s3,$s3,$acc11
654
655         b       Lenc_compact_loop
656 .align  4
657 Lenc_compact_done:
658         xor     $s0,$s0,$t0
659         xor     $s1,$s1,$t1
660         xor     $s2,$s2,$t2
661         xor     $s3,$s3,$t3
662         blr
663
664 .globl  .AES_decrypt
665 .align  7
666 .AES_decrypt:
667         mflr    r0
668         $STU    $sp,-$FRAME($sp)
669
670         $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
671         $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
672         $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
673         $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
674         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
675         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
676         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
677         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
678         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
679         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
680         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
681         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
682         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
683         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
684         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
685         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
686         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
687         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
688         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
689         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
690         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
691
692         lwz     $s0,0($inp)
693         lwz     $s1,4($inp)
694         lwz     $s2,8($inp)
695         lwz     $s3,12($inp)
696         bl      LAES_Td
697         bl      Lppc_AES_decrypt_compact
698         stw     $s0,0($out)
699         stw     $s1,4($out)
700         stw     $s2,8($out)
701         stw     $s3,12($out)
702
703         $POP    r0,`$FRAME-$SIZE_T*21`($sp)
704         $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
705         $POP    r13,`$FRAME-$SIZE_T*19`($sp)
706         $POP    r14,`$FRAME-$SIZE_T*18`($sp)
707         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
708         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
709         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
710         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
711         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
712         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
713         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
714         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
715         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
716         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
717         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
718         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
719         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
720         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
721         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
722         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
723         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
724         mtlr    r0
725         addi    $sp,$sp,$FRAME
726         blr
727
728 .align  4
            # Lppc_AES_decrypt - table-driven decryption of one block kept in registers.
            #   in:       s0-s3 = state words, key = key schedule, Tbl0 = lookup table
            #   out:      s0-s3 = result, returns with blr
            #   clobbers: t0-t3, acc00-acc15, Tbl1-Tbl3, key and CTR
729 Lppc_AES_decrypt:
730         lwz     $acc00,240($key)        # word at key+240 sets the loop count (presumably the round count)
731         lwz     $t0,0($key)             # first 16 bytes of the key schedule
732         lwz     $t1,4($key)
733         lwz     $t2,8($key)
734         lwz     $t3,12($key)
735         addi    $Tbl1,$Tbl0,3           # Tbl1-Tbl3 address the same table at byte offsets 3, 2 and 1
736         addi    $Tbl2,$Tbl0,2
737         addi    $Tbl3,$Tbl0,1
738         addi    $acc00,$acc00,-1        # one round is handled after the loop
739         addi    $key,$key,16
740         xor     $s0,$s0,$t0             # initial round-key addition
741         xor     $s1,$s1,$t1
742         xor     $s2,$s2,$t2
743         xor     $s3,$s3,$t3
744         mtctr   $acc00
745 .align  4
746 Ldec_loop:
            # One full round per pass. Each new state word is the next round key
            # combined with four table words, indexed by one byte from each of the
            # four current state words.
747         rlwinm  $acc00,$s0,`32-24+3`,21,28      # top byte of s0-s3, scaled by 8
748         rlwinm  $acc01,$s1,`32-24+3`,21,28
749         rlwinm  $acc02,$s2,`32-24+3`,21,28
750         rlwinm  $acc03,$s3,`32-24+3`,21,28
751         lwz     $t0,0($key)                     # next round key
752         lwz     $t1,4($key)
753         lwz     $t2,8($key)
754         lwz     $t3,12($key)
755         rlwinm  $acc04,$s3,`32-16+3`,21,28      # second byte, words taken as s3,s0,s1,s2
756         rlwinm  $acc05,$s0,`32-16+3`,21,28
757         rlwinm  $acc06,$s1,`32-16+3`,21,28
758         rlwinm  $acc07,$s2,`32-16+3`,21,28
759         lwzx    $acc00,$Tbl0,$acc00             # table word selected by the top byte
760         lwzx    $acc01,$Tbl0,$acc01
761         lwzx    $acc02,$Tbl0,$acc02
762         lwzx    $acc03,$Tbl0,$acc03
763         rlwinm  $acc08,$s2,`32-8+3`,21,28       # third byte, words taken as s2,s3,s0,s1
764         rlwinm  $acc09,$s3,`32-8+3`,21,28
765         rlwinm  $acc10,$s0,`32-8+3`,21,28
766         rlwinm  $acc11,$s1,`32-8+3`,21,28
767         lwzx    $acc04,$Tbl1,$acc04             # same table read at byte offset 3
768         lwzx    $acc05,$Tbl1,$acc05
769         lwzx    $acc06,$Tbl1,$acc06
770         lwzx    $acc07,$Tbl1,$acc07
771         rlwinm  $acc12,$s1,`0+3`,21,28          # low byte, words taken as s1,s2,s3,s0
772         rlwinm  $acc13,$s2,`0+3`,21,28
773         rlwinm  $acc14,$s3,`0+3`,21,28
774         rlwinm  $acc15,$s0,`0+3`,21,28
775         lwzx    $acc08,$Tbl2,$acc08             # same table read at byte offset 2
776         lwzx    $acc09,$Tbl2,$acc09
777         lwzx    $acc10,$Tbl2,$acc10
778         lwzx    $acc11,$Tbl2,$acc11
779         xor     $t0,$t0,$acc00                  # accumulate the lookups into the round key
780         xor     $t1,$t1,$acc01
781         xor     $t2,$t2,$acc02
782         xor     $t3,$t3,$acc03
783         lwzx    $acc12,$Tbl3,$acc12             # same table read at byte offset 1
784         lwzx    $acc13,$Tbl3,$acc13
785         lwzx    $acc14,$Tbl3,$acc14
786         lwzx    $acc15,$Tbl3,$acc15
787         xor     $t0,$t0,$acc04
788         xor     $t1,$t1,$acc05
789         xor     $t2,$t2,$acc06
790         xor     $t3,$t3,$acc07
791         xor     $t0,$t0,$acc08
792         xor     $t1,$t1,$acc09
793         xor     $t2,$t2,$acc10
794         xor     $t3,$t3,$acc11
795         xor     $s0,$t0,$acc12                  # the last term yields the new state
796         xor     $s1,$t1,$acc13
797         xor     $s2,$t2,$acc14
798         xor     $s3,$t3,$acc15
799         addi    $key,$key,16                    # step to the following round key
800         bdnz-   Ldec_loop                       # repeat until CTR reaches zero
801
            # Final round: single-byte lookups in the table at Tbl0+2048 (Td4),
            # reassembled into words and combined with the last round key.
802         addi    $Tbl2,$Tbl0,2048
803         nop
804         lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Td4
805         lwz     $acc09,`2048+32`($Tbl0)
806         lwz     $acc10,`2048+64`($Tbl0)
807         lwz     $acc11,`2048+96`($Tbl0)
808         lwz     $acc08,`2048+128`($Tbl0)        # loaded values are never used, acc08-acc11 are recomputed below
809         lwz     $acc09,`2048+160`($Tbl0)
810         lwz     $acc10,`2048+192`($Tbl0)
811         lwz     $acc11,`2048+224`($Tbl0)
812         rlwinm  $acc00,$s0,`32-24`,24,31        # top byte of s0-s3, unscaled byte index
813         rlwinm  $acc01,$s1,`32-24`,24,31
814         rlwinm  $acc02,$s2,`32-24`,24,31
815         rlwinm  $acc03,$s3,`32-24`,24,31
816         lwz     $t0,0($key)                     # last round key
817         lwz     $t1,4($key)
818         lwz     $t2,8($key)
819         lwz     $t3,12($key)
820         rlwinm  $acc04,$s3,`32-16`,24,31        # second byte, words taken as s3,s0,s1,s2
821         rlwinm  $acc05,$s0,`32-16`,24,31
822         rlwinm  $acc06,$s1,`32-16`,24,31
823         rlwinm  $acc07,$s2,`32-16`,24,31
824         lbzx    $acc00,$Tbl2,$acc00             # byte lookup
825         lbzx    $acc01,$Tbl2,$acc01
826         lbzx    $acc02,$Tbl2,$acc02
827         lbzx    $acc03,$Tbl2,$acc03
828         rlwinm  $acc08,$s2,`32-8`,24,31         # third byte, words taken as s2,s3,s0,s1
829         rlwinm  $acc09,$s3,`32-8`,24,31
830         rlwinm  $acc10,$s0,`32-8`,24,31
831         rlwinm  $acc11,$s1,`32-8`,24,31
832         lbzx    $acc04,$Tbl2,$acc04
833         lbzx    $acc05,$Tbl2,$acc05
834         lbzx    $acc06,$Tbl2,$acc06
835         lbzx    $acc07,$Tbl2,$acc07
836         rlwinm  $acc12,$s1,`0`,24,31            # low byte, words taken as s1,s2,s3,s0
837         rlwinm  $acc13,$s2,`0`,24,31
838         rlwinm  $acc14,$s3,`0`,24,31
839         rlwinm  $acc15,$s0,`0`,24,31
840         lbzx    $acc08,$Tbl2,$acc08
841         lbzx    $acc09,$Tbl2,$acc09
842         lbzx    $acc10,$Tbl2,$acc10
843         lbzx    $acc11,$Tbl2,$acc11
844         rlwinm  $s0,$acc00,24,0,7               # first looked-up byte becomes the top byte
845         rlwinm  $s1,$acc01,24,0,7
846         rlwinm  $s2,$acc02,24,0,7
847         rlwinm  $s3,$acc03,24,0,7
848         lbzx    $acc12,$Tbl2,$acc12
849         lbzx    $acc13,$Tbl2,$acc13
850         lbzx    $acc14,$Tbl2,$acc14
851         lbzx    $acc15,$Tbl2,$acc15
852         rlwimi  $s0,$acc04,16,8,15              # insert second byte into bits 8-15
853         rlwimi  $s1,$acc05,16,8,15
854         rlwimi  $s2,$acc06,16,8,15
855         rlwimi  $s3,$acc07,16,8,15
856         rlwimi  $s0,$acc08,8,16,23              # insert third byte into bits 16-23
857         rlwimi  $s1,$acc09,8,16,23
858         rlwimi  $s2,$acc10,8,16,23
859         rlwimi  $s3,$acc11,8,16,23
860         or      $s0,$s0,$acc12                  # low byte completes the word
861         or      $s1,$s1,$acc13
862         or      $s2,$s2,$acc14
863         or      $s3,$s3,$acc15
864         xor     $s0,$s0,$t0                     # final round-key addition
865         xor     $s1,$s1,$t1
866         xor     $s2,$s2,$t2
867         xor     $s3,$s3,$t3
868         blr
869
870 .align  4
            # Lppc_AES_decrypt_compact - decryption of one block kept in registers
            # that relies on single-byte lookups (table at Tbl0+2048) and arithmetic
            # on packed bytes rather than on the word tables of Lppc_AES_decrypt.
            #   in:  s0-s3 = state words, key = key schedule, Tbl0 = table base
            #   out: s0-s3 = result
871 Lppc_AES_decrypt_compact:
872         lwz     $acc00,240($key)        # word at key+240, later moved to CTR as the loop count
873         lwz     $t0,0($key)             # first 16 bytes of the key schedule
874         lwz     $t1,4($key)
875         lwz     $t2,8($key)
876         lwz     $t3,12($key)
877         addi    $Tbl1,$Tbl0,2048        # base of the byte table used by the lbzx lookups
878         lis     $mask80,0x8080          # low word becomes 0x80808080 once the ori below is applied
879         lis     $mask1b,0x1b1b          # low word becomes 0x1b1b1b1b once the ori below is applied
880         addi    $key,$key,16            # key now points at the second 16-byte round key
881         ori     $mask80,$mask80,0x8080
882         ori     $mask1b,$mask1b,0x1b1b
883 ___
884 $code.=<<___ if ($SIZE_T==8);
            # 64-bit build only: replicate each 32-bit mask into the upper half so
            # the masks cover all eight bytes of a 64-bit register.
885         insrdi  $mask80,$mask80,32,0    # low word copied into bits 0-31 (the high word)
886         insrdi  $mask1b,$mask1b,32,0
887 ___
888 $code.=<<___;
889         mtctr   $acc00
890 .align  4
891 Ldec_compact_loop:
            # Each pass: add the pending round key, then replace every state byte by
            # its table byte while gathering the four bytes of each output word from
            # four different input words.
892         xor     $s0,$s0,$t0             # round key fetched in the prologue or on the previous pass
893         xor     $s1,$s1,$t1
894         xor     $s2,$s2,$t2
895         xor     $s3,$s3,$t3
896         rlwinm  $acc00,$s0,`32-24`,24,31        # top byte of s0-s3
897         rlwinm  $acc01,$s1,`32-24`,24,31
898         rlwinm  $acc02,$s2,`32-24`,24,31
899         rlwinm  $acc03,$s3,`32-24`,24,31
900         lwz     $t0,0($key)                     # round key for the next addition
901         lwz     $t1,4($key)
902         lwz     $t2,8($key)
903         lwz     $t3,12($key)
904         rlwinm  $acc04,$s3,`32-16`,24,31        # second byte, words taken as s3,s0,s1,s2
905         rlwinm  $acc05,$s0,`32-16`,24,31
906         rlwinm  $acc06,$s1,`32-16`,24,31
907         rlwinm  $acc07,$s2,`32-16`,24,31
908         lbzx    $acc00,$Tbl1,$acc00             # byte lookup in the table at Tbl1
909         lbzx    $acc01,$Tbl1,$acc01
910         lbzx    $acc02,$Tbl1,$acc02
911         lbzx    $acc03,$Tbl1,$acc03
912         rlwinm  $acc08,$s2,`32-8`,24,31         # third byte, words taken as s2,s3,s0,s1
913         rlwinm  $acc09,$s3,`32-8`,24,31
914         rlwinm  $acc10,$s0,`32-8`,24,31
915         rlwinm  $acc11,$s1,`32-8`,24,31
916         lbzx    $acc04,$Tbl1,$acc04
917         lbzx    $acc05,$Tbl1,$acc05
918         lbzx    $acc06,$Tbl1,$acc06
919         lbzx    $acc07,$Tbl1,$acc07
920         rlwinm  $acc12,$s1,`0`,24,31            # low byte, words taken as s1,s2,s3,s0
921         rlwinm  $acc13,$s2,`0`,24,31
922         rlwinm  $acc14,$s3,`0`,24,31
923         rlwinm  $acc15,$s0,`0`,24,31
924         lbzx    $acc08,$Tbl1,$acc08
925         lbzx    $acc09,$Tbl1,$acc09
926         lbzx    $acc10,$Tbl1,$acc10
927         lbzx    $acc11,$Tbl1,$acc11
928         rlwinm  $s0,$acc00,24,0,7               # first looked-up byte becomes the top byte
929         rlwinm  $s1,$acc01,24,0,7
930         rlwinm  $s2,$acc02,24,0,7
931         rlwinm  $s3,$acc03,24,0,7
932         lbzx    $acc12,$Tbl1,$acc12
933         lbzx    $acc13,$Tbl1,$acc13
934         lbzx    $acc14,$Tbl1,$acc14
935         lbzx    $acc15,$Tbl1,$acc15
936         rlwimi  $s0,$acc04,16,8,15              # insert second byte into bits 8-15
937         rlwimi  $s1,$acc05,16,8,15
938         rlwimi  $s2,$acc06,16,8,15
939         rlwimi  $s3,$acc07,16,8,15
940         rlwimi  $s0,$acc08,8,16,23              # insert third byte into bits 16-23
941         rlwimi  $s1,$acc09,8,16,23
942         rlwimi  $s2,$acc10,8,16,23
943         rlwimi  $s3,$acc11,8,16,23
944         or      $s0,$s0,$acc12                  # low byte completes the word
945         or      $s1,$s1,$acc13
946         or      $s2,$s2,$acc14
947         or      $s3,$s3,$acc15
948
949         addi    $key,$key,16
950         bdz     Ldec_compact_done               # CTR exhausted, so the mixing step below is skipped
951 ___
952 $code.=<<___ if ($SIZE_T==8);
953         # vectorized permutation improves decrypt performance by 10%
            # 64-bit path: s1 and s3 are packed into the upper halves of s0 and s2,
            # so each step below processes two state words (eight bytes) at once.
            # With r0 the packed input, r2, r4 and r8 are obtained by applying the
            # same byte-wise doubling step (shift left by one, conditionally xor
            # 0x1b into bytes whose top bit was set) one, two and three times.
            # On exit the low words hold, one state word per register:
            #   acc00-acc03 = r2^r0, acc04-acc07 = r4^r0, acc08-acc11 = r8
            # which is the layout consumed by the code that follows this path.
954         insrdi  $s0,$s1,32,0            # s0 = s1:s0 (s1 in the high word)
955         insrdi  $s2,$s3,32,0            # s2 = s3:s2
956
957         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
958         and     $acc02,$s2,$mask80
959         srdi    $acc04,$acc00,7         # r1>>7
960         srdi    $acc06,$acc02,7
961         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
962         andc    $acc10,$s2,$mask80
963         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
964         sub     $acc02,$acc02,$acc06
965         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
966         add     $acc10,$acc10,$acc10
967         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
968         and     $acc02,$acc02,$mask1b
969         xor     $acc00,$acc00,$acc08    # r2
970         xor     $acc02,$acc02,$acc10
971
972         and     $acc04,$acc00,$mask80   # r1=r2&0x80808080
973         and     $acc06,$acc02,$mask80
974         srdi    $acc08,$acc04,7         # r1>>7
975         srdi    $acc10,$acc06,7
976         andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
977         andc    $acc14,$acc02,$mask80
978         sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
979         sub     $acc06,$acc06,$acc10
980         add     $acc12,$acc12,$acc12    # (r2&0x7f7f7f7f)<<1
981         add     $acc14,$acc14,$acc14
982         and     $acc04,$acc04,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
983         and     $acc06,$acc06,$mask1b
984         xor     $acc04,$acc04,$acc12    # r4
985         xor     $acc06,$acc06,$acc14
986
987         and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
988         and     $acc10,$acc06,$mask80
989         srdi    $acc12,$acc08,7         # r1>>7
990         srdi    $acc14,$acc10,7
991         sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
992         sub     $acc10,$acc10,$acc14
993         andc    $acc12,$acc04,$mask80   # r4&0x7f7f7f7f
994         andc    $acc14,$acc06,$mask80
995         add     $acc12,$acc12,$acc12    # (r4&0x7f7f7f7f)<<1
996         add     $acc14,$acc14,$acc14
997         and     $acc08,$acc08,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
998         and     $acc10,$acc10,$mask1b
999         xor     $acc08,$acc08,$acc12    # r8
1000         xor     $acc10,$acc10,$acc14
1001
1002         xor     $acc00,$acc00,$s0       # r2^r0
1003         xor     $acc02,$acc02,$s2
1004         xor     $acc04,$acc04,$s0       # r4^r0
1005         xor     $acc06,$acc06,$s2
1006
             # Unpack: copy the high word of each packed result (the s1 and s3
             # lanes) into the odd-numbered register. extrdi takes the field
             # width first and the start bit second, so 32,0 selects bits 0-31,
             # mirroring the insrdi ...,32,0 used for packing above.
1007         extrdi  $acc01,$acc00,32,0
1008         extrdi  $acc03,$acc02,32,0
1009         extrdi  $acc05,$acc04,32,0
1010         extrdi  $acc07,$acc06,32,0
1011         extrdi  $acc09,$acc08,32,0
1012         extrdi  $acc11,$acc10,32,0
1013 ___
1014 $code.=<<___ if ($SIZE_T==4);
             # 32-bit path: the same computation as the 64-bit path, carried out
             # on each of the four state words separately. With r0 the input
             # word, r2, r4 and r8 result from applying the byte-wise doubling
             # step (shift left by one, conditionally xor 0x1b into bytes whose
             # top bit was set) one, two and three times.
             # On exit: acc00-acc03 = r2^r0, acc04-acc07 = r4^r0, acc08-acc11 = r8.
1015         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
1016         and     $acc01,$s1,$mask80
1017         and     $acc02,$s2,$mask80
1018         and     $acc03,$s3,$mask80
1019         srwi    $acc04,$acc00,7         # r1>>7
1020         srwi    $acc05,$acc01,7
1021         srwi    $acc06,$acc02,7
1022         srwi    $acc07,$acc03,7
1023         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
1024         andc    $acc09,$s1,$mask80
1025         andc    $acc10,$s2,$mask80
1026         andc    $acc11,$s3,$mask80
1027         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
1028         sub     $acc01,$acc01,$acc05
1029         sub     $acc02,$acc02,$acc06
1030         sub     $acc03,$acc03,$acc07
1031         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
1032         add     $acc09,$acc09,$acc09
1033         add     $acc10,$acc10,$acc10
1034         add     $acc11,$acc11,$acc11
1035         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1036         and     $acc01,$acc01,$mask1b
1037         and     $acc02,$acc02,$mask1b
1038         and     $acc03,$acc03,$mask1b
1039         xor     $acc00,$acc00,$acc08    # r2
1040         xor     $acc01,$acc01,$acc09
1041         xor     $acc02,$acc02,$acc10
1042         xor     $acc03,$acc03,$acc11
1043
1044         and     $acc04,$acc00,$mask80   # r1=r2&0x80808080
1045         and     $acc05,$acc01,$mask80
1046         and     $acc06,$acc02,$mask80
1047         and     $acc07,$acc03,$mask80
1048         srwi    $acc08,$acc04,7         # r1>>7
1049         srwi    $acc09,$acc05,7
1050         srwi    $acc10,$acc06,7
1051         srwi    $acc11,$acc07,7
1052         andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
1053         andc    $acc13,$acc01,$mask80
1054         andc    $acc14,$acc02,$mask80
1055         andc    $acc15,$acc03,$mask80
1056         sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
1057         sub     $acc05,$acc05,$acc09
1058         sub     $acc06,$acc06,$acc10
1059         sub     $acc07,$acc07,$acc11
1060         add     $acc12,$acc12,$acc12    # (r2&0x7f7f7f7f)<<1
1061         add     $acc13,$acc13,$acc13
1062         add     $acc14,$acc14,$acc14
1063         add     $acc15,$acc15,$acc15
1064         and     $acc04,$acc04,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1065         and     $acc05,$acc05,$mask1b
1066         and     $acc06,$acc06,$mask1b
1067         and     $acc07,$acc07,$mask1b
1068         xor     $acc04,$acc04,$acc12    # r4
1069         xor     $acc05,$acc05,$acc13
1070         xor     $acc06,$acc06,$acc14
1071         xor     $acc07,$acc07,$acc15
1072
1073         and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
1074         and     $acc09,$acc05,$mask80
1075         and     $acc10,$acc06,$mask80
1076         and     $acc11,$acc07,$mask80
1077         srwi    $acc12,$acc08,7         # r1>>7
1078         srwi    $acc13,$acc09,7
1079         srwi    $acc14,$acc10,7
1080         srwi    $acc15,$acc11,7
1081         sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
1082         sub     $acc09,$acc09,$acc13
1083         sub     $acc10,$acc10,$acc14
1084         sub     $acc11,$acc11,$acc15
1085         andc    $acc12,$acc04,$mask80   # r4&0x7f7f7f7f
1086         andc    $acc13,$acc05,$mask80
1087         andc    $acc14,$acc06,$mask80
1088         andc    $acc15,$acc07,$mask80
1089         add     $acc12,$acc12,$acc12    # (r4&0x7f7f7f7f)<<1
1090         add     $acc13,$acc13,$acc13
1091         add     $acc14,$acc14,$acc14
1092         add     $acc15,$acc15,$acc15
1093         and     $acc08,$acc08,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1094         and     $acc09,$acc09,$mask1b
1095         and     $acc10,$acc10,$mask1b
1096         and     $acc11,$acc11,$mask1b
1097         xor     $acc08,$acc08,$acc12    # r8
1098         xor     $acc09,$acc09,$acc13
1099         xor     $acc10,$acc10,$acc14
1100         xor     $acc11,$acc11,$acc15
1101
1102         xor     $acc00,$acc00,$s0       # r2^r0
1103         xor     $acc01,$acc01,$s1
1104         xor     $acc02,$acc02,$s2
1105         xor     $acc03,$acc03,$s3
1106         xor     $acc04,$acc04,$s0       # r4^r0
1107         xor     $acc05,$acc05,$s1
1108         xor     $acc06,$acc06,$s2
1109         xor     $acc07,$acc07,$s3
1110 ___
1111 $code.=<<___;
             # Common tail of the mixing step for both word sizes. On entry
             # acc00-acc03 = r2^r0, acc04-acc07 = r4^r0 and acc08-acc11 = r8 for
             # the four state words. Each state word r0 is replaced by
             #   ROTATE(r0,8) ^ (r2^r0) ^ (r4^r0) ^ r8
             #     ^ ROTATE(r8^r2^r0,24) ^ ROTATE(r8^r4^r0,16) ^ ROTATE(r8,8)
             # where ROTATE is the right rotation performed by rotrwi.
1112         rotrwi  $s0,$s0,8               # = ROTATE(r0,8)
1113         rotrwi  $s1,$s1,8
1114         rotrwi  $s2,$s2,8
1115         rotrwi  $s3,$s3,8
1116         xor     $s0,$s0,$acc00          # ^= r2^r0
1117         xor     $s1,$s1,$acc01
1118         xor     $s2,$s2,$acc02
1119         xor     $s3,$s3,$acc03
1120         xor     $acc00,$acc00,$acc08    # acc00-acc03 become r8^r2^r0
1121         xor     $acc01,$acc01,$acc09
1122         xor     $acc02,$acc02,$acc10
1123         xor     $acc03,$acc03,$acc11
1124         xor     $s0,$s0,$acc04          # ^= r4^r0
1125         xor     $s1,$s1,$acc05
1126         xor     $s2,$s2,$acc06
1127         xor     $s3,$s3,$acc07
1128         rotrwi  $acc00,$acc00,24
1129         rotrwi  $acc01,$acc01,24
1130         rotrwi  $acc02,$acc02,24
1131         rotrwi  $acc03,$acc03,24
1132         xor     $acc04,$acc04,$acc08    # acc04-acc07 become r8^r4^r0
1133         xor     $acc05,$acc05,$acc09
1134         xor     $acc06,$acc06,$acc10
1135         xor     $acc07,$acc07,$acc11
1136         xor     $s0,$s0,$acc08          # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1137         xor     $s1,$s1,$acc09
1138         xor     $s2,$s2,$acc10
1139         xor     $s3,$s3,$acc11
1140         rotrwi  $acc04,$acc04,16
1141         rotrwi  $acc05,$acc05,16
1142         rotrwi  $acc06,$acc06,16
1143         rotrwi  $acc07,$acc07,16
1144         xor     $s0,$s0,$acc00          # ^= ROTATE(r8^r2^r0,24)
1145         xor     $s1,$s1,$acc01
1146         xor     $s2,$s2,$acc02
1147         xor     $s3,$s3,$acc03
1148         rotrwi  $acc08,$acc08,8
1149         rotrwi  $acc09,$acc09,8
1150         rotrwi  $acc10,$acc10,8
1151         rotrwi  $acc11,$acc11,8
1152         xor     $s0,$s0,$acc04          # ^= ROTATE(r8^r4^r0,16)
1153         xor     $s1,$s1,$acc05
1154         xor     $s2,$s2,$acc06
1155         xor     $s3,$s3,$acc07
1156         xor     $s0,$s0,$acc08          # ^= ROTATE(r8,8)       
1157         xor     $s1,$s1,$acc09  
1158         xor     $s2,$s2,$acc10  
1159         xor     $s3,$s3,$acc11  
1160
1161         b       Ldec_compact_loop       # next pass starts by adding the round key held in t0-t3
1162 .align  4
1163 Ldec_compact_done:
             # Reached from the bdz in the loop: add the last round key that was
             # loaded on the final pass and return the result in s0-s3.
1164         xor     $s0,$s0,$t0
1165         xor     $s1,$s1,$t1
1166         xor     $s2,$s2,$t2
1167         xor     $s3,$s3,$t3
1168         blr
1169 .long   0
1170 .asciz  "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1171 .align  7
1172 ___
1173
# Post-process the accumulated assembly text: every backtick-quoted span is
# evaluated as a Perl expression and replaced by its value (this is how the
# constant arithmetic written between backticks in the heredocs is resolved).
1174 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1175 print $code;
# STDOUT may have been reopened as a pipe to the ppc-xlate translator, in which
# case close() waits for that process and reports its failure. Check the result
# so a failed write or a failing translator stops the build instead of passing
# silently.
1176 close STDOUT or die "error closing STDOUT: $!";