aes-ppc.pl: 10% performance improvement on Power6.
[openssl.git] / crypto / aes / asm / aes-ppc.pl
1 #!/usr/bin/env perl
2
3 # ====================================================================
4 # Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
5 # project. The module is, however, dual licensed under OpenSSL and
6 # CRYPTOGAMS licenses depending on where you obtain it. For further
7 # details see http://www.openssl.org/~appro/cryptogams/.
8 # ====================================================================
9
10 # Needs more work: key setup, page boundaries, CBC routine...
11 #
12 # ppc_AES_[en|de]crypt perform at 18 cycles per byte processed with
13 # 128-bit key, which is ~40% better than 64-bit code generated by gcc
14 # 4.0. But these are not the ones currently used! Their "compact"
15 # counterparts are, for security reasons. ppc_AES_encrypt_compact runs
16 # at 1/2 of ppc_AES_encrypt speed, while ppc_AES_decrypt_compact -
17 # at 1/3 of ppc_AES_decrypt.
18
19 # February 2010
20 #
21 # Rescheduling instructions to favour Power6 pipeline gives 10%
22 # performance improvement on the platform in question (and marginal
23 # improvement even on others). It should be noted that Power6 fails
24 # to process byte in 18 cycles, only in 23, because it fails to issue
25 # 4 load instructions in two cycles, only in 3. As a result, non-compact
26 # block subroutines are 25% slower than one would expect. Compact
27 # functions scale better, because they have pure computational part,
28 # which scales perfectly with clock frequency. To be specific
29 # ppc_AES_encrypt_compact operates at 42 cycles per byte, while
30 # ppc_AES_decrypt_compact - at 55 (in 64-bit build).
31
# --------------------------------------------------------------------
# Script prologue: pick the target ABI, locate the ppc-xlate.pl
# post-processor and redirect STDOUT into it.
#
# Command line: <flavour> [<extra>]
#   <flavour>  must match /64/ or /32/; anything else is fatal.
#   <extra>    optional; appended verbatim to the ppc-xlate.pl command.
# --------------------------------------------------------------------
$flavour = shift;

# Pointer size plus the store-with-update, load and store mnemonics
# that the prologue/epilogue templates interpolate.
if ($flavour =~ /64/) {
        $SIZE_T =8;
        $STU    ="stdu";
        $POP    ="ld";
        $PUSH   ="std";
} elsif ($flavour =~ /32/) {
        $SIZE_T =4;
        $STU    ="stwu";
        $POP    ="lwz";
        $PUSH   ="stw";
} else { die "nonsense $flavour"; }

# Look for ppc-xlate.pl next to this script first, then under
# ../../perlasm/ relative to it.  $dir is the directory part of $0
# including the trailing separator (empty when $0 has none).
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
die "can't locate ppc-xlate.pl";

# Pipe everything subsequently written to STDOUT through ppc-xlate.pl.
# The low-precedence "or" is essential: with "||" the die would attach
# to the (always true) command string and a failed open would be
# silently ignored.
open STDOUT,"| $^X $xlate $flavour ".shift or die "can't call $xlate: $!";

# Stack frame: 32 pointer-sized slots.  The .AES_encrypt/.AES_decrypt
# prologues save r0, the TOC register and r13-r31 in the top 21 of them.
$FRAME=32*$SIZE_T;
54
# _data_word(LIST)
#
# Appends one ".long" directive per argument to the global $code, with
# the 32-bit value written twice ("0xXXXXXXXX,0xXXXXXXXX"), so every
# table entry occupies 8 bytes.  The round code indexes these tables
# with an 8-byte stride and also loads at byte offsets 1..3 into an
# entry (see the $Tbl1..$Tbl3 setup in Lppc_AES_encrypt), which is what
# the duplicated word makes possible.
#
# Processing stops at the first undefined argument.  The sub is declared
# without a prototype because it does take arguments; existing
# "&_data_word(...)" call sites are unaffected.
sub _data_word
{   foreach my $word (@_) {
        last unless defined $word;
        $code .= sprintf "\t.long\t0x%08x,0x%08x\n", $word, $word;
    }
}
59
# Register allocation.  Each name below is interpolated into the
# assembly templates that follow.

# Stack pointer, TOC pointer and the three routine arguments: r1-r5.
($sp,$toc,$inp,$out,$key) = map("r$_", 1..5);

# Table base pointers.  $Tbl0 shares r3 with $inp, and $Tbl3 starts out
# sharing r2 with $toc.
($Tbl0,$Tbl1,$Tbl2,$Tbl3) = ("r3","r6","r7","r2");

# Four state words (r8-r11) followed by four temporaries (r12-r15).
($s0,$s1,$s2,$s3, $t0,$t1,$t2,$t3) = map("r$_", 8..15);

# Sixteen accumulators: r16-r31.
($acc00,$acc01,$acc02,$acc03,
 $acc04,$acc05,$acc06,$acc07,
 $acc08,$acc09,$acc10,$acc11,
 $acc12,$acc13,$acc14,$acc15) = map("r$_", 16..31);

# stay away from TLS pointer
# 64-bit builds must not touch r13, so $t1 moves to r0.  Otherwise r2 is
# the one to avoid: $Tbl3 takes over $t0's register and $t0 moves to r0.
# The die's guard against the table above being edited inconsistently.
if ($SIZE_T==8) {
        die if ($t1 ne "r13");
        $t1 = "r0";
} else {
        die if ($Tbl3 ne "r2");
        ($Tbl3,$t0) = ($t0,"r0");
}

# The constant masks loaded by the compact code (0x80808080 and
# 0x1b1b1b1b) live in the registers otherwise used for $Tbl2/$Tbl3.
($mask80,$mask1b) = ($Tbl2,$Tbl3);
106
# Position-independent table locators, LAES_Te and LAES_Td.
#
# Each stub saves LR in r0, executes "bcl 20,31,.+4" so that LR receives
# the address of the following instruction, copies that address into
# $Tbl0, adds the distance from there to the first table entry, restores
# LR and returns.  The caller therefore gets the table address in $Tbl0
# without any relocation.
#
# Layout, relative to the 128-byte aligned LAES_Te label:
#      0  LAES_Te stub: 6 instructions (24 bytes) padded to 32
#     32  LAES_Td stub: 24 bytes padded with 128-32-24 bytes
#    128  encryption table: 256 entries of 8 bytes (2048 bytes)
#   2176  256-byte table (called Te4 in Lppc_AES_encrypt)
#   2432  decryption table
#
# The bcl is the stub's second instruction, so LR ends up 8 bytes past
# the stub's label.  Hence the displacements 128-8 for LAES_Te and
# 128-8-32+2048+256 for LAES_Td.
107 $code.=<<___;
108 .machine        "any"
109 .text
110
111 .align  7
112 LAES_Te:
113         mflr    r0
114         bcl     20,31,\$+4
115         mflr    $Tbl0   ;    vvvvv "distance" between . and 1st data entry
116         addi    $Tbl0,$Tbl0,`128-8`
117         mtlr    r0
118         blr
119         .space  `32-24`
120 LAES_Td:
121         mflr    r0
122         bcl     20,31,\$+4
123         mflr    $Tbl0   ;    vvvvvvvv "distance" between . and 1st data entry
124         addi    $Tbl0,$Tbl0,`128-8-32+2048+256`
125         mtlr    r0
126         blr
127         .space  `128-32-24`
128 ___
129 &_data_word(
130         0xc66363a5, 0xf87c7c84, 0xee777799, 0xf67b7b8d,
131         0xfff2f20d, 0xd66b6bbd, 0xde6f6fb1, 0x91c5c554,
132         0x60303050, 0x02010103, 0xce6767a9, 0x562b2b7d,
133         0xe7fefe19, 0xb5d7d762, 0x4dababe6, 0xec76769a,
134         0x8fcaca45, 0x1f82829d, 0x89c9c940, 0xfa7d7d87,
135         0xeffafa15, 0xb25959eb, 0x8e4747c9, 0xfbf0f00b,
136         0x41adadec, 0xb3d4d467, 0x5fa2a2fd, 0x45afafea,
137         0x239c9cbf, 0x53a4a4f7, 0xe4727296, 0x9bc0c05b,
138         0x75b7b7c2, 0xe1fdfd1c, 0x3d9393ae, 0x4c26266a,
139         0x6c36365a, 0x7e3f3f41, 0xf5f7f702, 0x83cccc4f,
140         0x6834345c, 0x51a5a5f4, 0xd1e5e534, 0xf9f1f108,
141         0xe2717193, 0xabd8d873, 0x62313153, 0x2a15153f,
142         0x0804040c, 0x95c7c752, 0x46232365, 0x9dc3c35e,
143         0x30181828, 0x379696a1, 0x0a05050f, 0x2f9a9ab5,
144         0x0e070709, 0x24121236, 0x1b80809b, 0xdfe2e23d,
145         0xcdebeb26, 0x4e272769, 0x7fb2b2cd, 0xea75759f,
146         0x1209091b, 0x1d83839e, 0x582c2c74, 0x341a1a2e,
147         0x361b1b2d, 0xdc6e6eb2, 0xb45a5aee, 0x5ba0a0fb,
148         0xa45252f6, 0x763b3b4d, 0xb7d6d661, 0x7db3b3ce,
149         0x5229297b, 0xdde3e33e, 0x5e2f2f71, 0x13848497,
150         0xa65353f5, 0xb9d1d168, 0x00000000, 0xc1eded2c,
151         0x40202060, 0xe3fcfc1f, 0x79b1b1c8, 0xb65b5bed,
152         0xd46a6abe, 0x8dcbcb46, 0x67bebed9, 0x7239394b,
153         0x944a4ade, 0x984c4cd4, 0xb05858e8, 0x85cfcf4a,
154         0xbbd0d06b, 0xc5efef2a, 0x4faaaae5, 0xedfbfb16,
155         0x864343c5, 0x9a4d4dd7, 0x66333355, 0x11858594,
156         0x8a4545cf, 0xe9f9f910, 0x04020206, 0xfe7f7f81,
157         0xa05050f0, 0x783c3c44, 0x259f9fba, 0x4ba8a8e3,
158         0xa25151f3, 0x5da3a3fe, 0x804040c0, 0x058f8f8a,
159         0x3f9292ad, 0x219d9dbc, 0x70383848, 0xf1f5f504,
160         0x63bcbcdf, 0x77b6b6c1, 0xafdada75, 0x42212163,
161         0x20101030, 0xe5ffff1a, 0xfdf3f30e, 0xbfd2d26d,
162         0x81cdcd4c, 0x180c0c14, 0x26131335, 0xc3ecec2f,
163         0xbe5f5fe1, 0x359797a2, 0x884444cc, 0x2e171739,
164         0x93c4c457, 0x55a7a7f2, 0xfc7e7e82, 0x7a3d3d47,
165         0xc86464ac, 0xba5d5de7, 0x3219192b, 0xe6737395,
166         0xc06060a0, 0x19818198, 0x9e4f4fd1, 0xa3dcdc7f,
167         0x44222266, 0x542a2a7e, 0x3b9090ab, 0x0b888883,
168         0x8c4646ca, 0xc7eeee29, 0x6bb8b8d3, 0x2814143c,
169         0xa7dede79, 0xbc5e5ee2, 0x160b0b1d, 0xaddbdb76,
170         0xdbe0e03b, 0x64323256, 0x743a3a4e, 0x140a0a1e,
171         0x924949db, 0x0c06060a, 0x4824246c, 0xb85c5ce4,
172         0x9fc2c25d, 0xbdd3d36e, 0x43acacef, 0xc46262a6,
173         0x399191a8, 0x319595a4, 0xd3e4e437, 0xf279798b,
174         0xd5e7e732, 0x8bc8c843, 0x6e373759, 0xda6d6db7,
175         0x018d8d8c, 0xb1d5d564, 0x9c4e4ed2, 0x49a9a9e0,
176         0xd86c6cb4, 0xac5656fa, 0xf3f4f407, 0xcfeaea25,
177         0xca6565af, 0xf47a7a8e, 0x47aeaee9, 0x10080818,
178         0x6fbabad5, 0xf0787888, 0x4a25256f, 0x5c2e2e72,
179         0x381c1c24, 0x57a6a6f1, 0x73b4b4c7, 0x97c6c651,
180         0xcbe8e823, 0xa1dddd7c, 0xe874749c, 0x3e1f1f21,
181         0x964b4bdd, 0x61bdbddc, 0x0d8b8b86, 0x0f8a8a85,
182         0xe0707090, 0x7c3e3e42, 0x71b5b5c4, 0xcc6666aa,
183         0x904848d8, 0x06030305, 0xf7f6f601, 0x1c0e0e12,
184         0xc26161a3, 0x6a35355f, 0xae5757f9, 0x69b9b9d0,
185         0x17868691, 0x99c1c158, 0x3a1d1d27, 0x279e9eb9,
186         0xd9e1e138, 0xebf8f813, 0x2b9898b3, 0x22111133,
187         0xd26969bb, 0xa9d9d970, 0x078e8e89, 0x339494a7,
188         0x2d9b9bb6, 0x3c1e1e22, 0x15878792, 0xc9e9e920,
189         0x87cece49, 0xaa5555ff, 0x50282878, 0xa5dfdf7a,
190         0x038c8c8f, 0x59a1a1f8, 0x09898980, 0x1a0d0d17,
191         0x65bfbfda, 0xd7e6e631, 0x844242c6, 0xd06868b8,
192         0x824141c3, 0x299999b0, 0x5a2d2d77, 0x1e0f0f11,
193         0x7bb0b0cb, 0xa85454fc, 0x6dbbbbd6, 0x2c16163a);
194 $code.=<<___;
195 .byte   0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5
196 .byte   0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76
197 .byte   0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0
198 .byte   0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0
199 .byte   0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc
200 .byte   0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15
201 .byte   0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a
202 .byte   0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75
203 .byte   0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0
204 .byte   0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84
205 .byte   0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b
206 .byte   0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf
207 .byte   0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85
208 .byte   0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8
209 .byte   0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5
210 .byte   0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2
211 .byte   0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17
212 .byte   0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73
213 .byte   0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88
214 .byte   0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb
215 .byte   0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c
216 .byte   0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79
217 .byte   0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9
218 .byte   0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08
219 .byte   0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6
220 .byte   0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a
221 .byte   0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e
222 .byte   0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e
223 .byte   0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94
224 .byte   0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf
225 .byte   0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68
226 .byte   0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16
227 ___
228 &_data_word(
229         0x51f4a750, 0x7e416553, 0x1a17a4c3, 0x3a275e96,
230         0x3bab6bcb, 0x1f9d45f1, 0xacfa58ab, 0x4be30393,
231         0x2030fa55, 0xad766df6, 0x88cc7691, 0xf5024c25,
232         0x4fe5d7fc, 0xc52acbd7, 0x26354480, 0xb562a38f,
233         0xdeb15a49, 0x25ba1b67, 0x45ea0e98, 0x5dfec0e1,
234         0xc32f7502, 0x814cf012, 0x8d4697a3, 0x6bd3f9c6,
235         0x038f5fe7, 0x15929c95, 0xbf6d7aeb, 0x955259da,
236         0xd4be832d, 0x587421d3, 0x49e06929, 0x8ec9c844,
237         0x75c2896a, 0xf48e7978, 0x99583e6b, 0x27b971dd,
238         0xbee14fb6, 0xf088ad17, 0xc920ac66, 0x7dce3ab4,
239         0x63df4a18, 0xe51a3182, 0x97513360, 0x62537f45,
240         0xb16477e0, 0xbb6bae84, 0xfe81a01c, 0xf9082b94,
241         0x70486858, 0x8f45fd19, 0x94de6c87, 0x527bf8b7,
242         0xab73d323, 0x724b02e2, 0xe31f8f57, 0x6655ab2a,
243         0xb2eb2807, 0x2fb5c203, 0x86c57b9a, 0xd33708a5,
244         0x302887f2, 0x23bfa5b2, 0x02036aba, 0xed16825c,
245         0x8acf1c2b, 0xa779b492, 0xf307f2f0, 0x4e69e2a1,
246         0x65daf4cd, 0x0605bed5, 0xd134621f, 0xc4a6fe8a,
247         0x342e539d, 0xa2f355a0, 0x058ae132, 0xa4f6eb75,
248         0x0b83ec39, 0x4060efaa, 0x5e719f06, 0xbd6e1051,
249         0x3e218af9, 0x96dd063d, 0xdd3e05ae, 0x4de6bd46,
250         0x91548db5, 0x71c45d05, 0x0406d46f, 0x605015ff,
251         0x1998fb24, 0xd6bde997, 0x894043cc, 0x67d99e77,
252         0xb0e842bd, 0x07898b88, 0xe7195b38, 0x79c8eedb,
253         0xa17c0a47, 0x7c420fe9, 0xf8841ec9, 0x00000000,
254         0x09808683, 0x322bed48, 0x1e1170ac, 0x6c5a724e,
255         0xfd0efffb, 0x0f853856, 0x3daed51e, 0x362d3927,
256         0x0a0fd964, 0x685ca621, 0x9b5b54d1, 0x24362e3a,
257         0x0c0a67b1, 0x9357e70f, 0xb4ee96d2, 0x1b9b919e,
258         0x80c0c54f, 0x61dc20a2, 0x5a774b69, 0x1c121a16,
259         0xe293ba0a, 0xc0a02ae5, 0x3c22e043, 0x121b171d,
260         0x0e090d0b, 0xf28bc7ad, 0x2db6a8b9, 0x141ea9c8,
261         0x57f11985, 0xaf75074c, 0xee99ddbb, 0xa37f60fd,
262         0xf701269f, 0x5c72f5bc, 0x44663bc5, 0x5bfb7e34,
263         0x8b432976, 0xcb23c6dc, 0xb6edfc68, 0xb8e4f163,
264         0xd731dcca, 0x42638510, 0x13972240, 0x84c61120,
265         0x854a247d, 0xd2bb3df8, 0xaef93211, 0xc729a16d,
266         0x1d9e2f4b, 0xdcb230f3, 0x0d8652ec, 0x77c1e3d0,
267         0x2bb3166c, 0xa970b999, 0x119448fa, 0x47e96422,
268         0xa8fc8cc4, 0xa0f03f1a, 0x567d2cd8, 0x223390ef,
269         0x87494ec7, 0xd938d1c1, 0x8ccaa2fe, 0x98d40b36,
270         0xa6f581cf, 0xa57ade28, 0xdab78e26, 0x3fadbfa4,
271         0x2c3a9de4, 0x5078920d, 0x6a5fcc9b, 0x547e4662,
272         0xf68d13c2, 0x90d8b8e8, 0x2e39f75e, 0x82c3aff5,
273         0x9f5d80be, 0x69d0937c, 0x6fd52da9, 0xcf2512b3,
274         0xc8ac993b, 0x10187da7, 0xe89c636e, 0xdb3bbb7b,
275         0xcd267809, 0x6e5918f4, 0xec9ab701, 0x834f9aa8,
276         0xe6956e65, 0xaaffe67e, 0x21bccf08, 0xef15e8e6,
277         0xbae79bd9, 0x4a6f36ce, 0xea9f09d4, 0x29b07cd6,
278         0x31a4b2af, 0x2a3f2331, 0xc6a59430, 0x35a266c0,
279         0x744ebc37, 0xfc82caa6, 0xe090d0b0, 0x33a7d815,
280         0xf104984a, 0x41ecdaf7, 0x7fcd500e, 0x1791f62f,
281         0x764dd68d, 0x43efb04d, 0xccaa4d54, 0xe49604df,
282         0x9ed1b5e3, 0x4c6a881b, 0xc12c1fb8, 0x4665517f,
283         0x9d5eea04, 0x018c355d, 0xfa877473, 0xfb0b412e,
284         0xb3671d5a, 0x92dbd252, 0xe9105633, 0x6dd64713,
285         0x9ad7618c, 0x37a10c7a, 0x59f8148e, 0xeb133c89,
286         0xcea927ee, 0xb761c935, 0xe11ce5ed, 0x7a47b13c,
287         0x9cd2df59, 0x55f2733f, 0x1814ce79, 0x73c737bf,
288         0x53f7cdea, 0x5ffdaa5b, 0xdf3d6f14, 0x7844db86,
289         0xcaaff381, 0xb968c43e, 0x3824342c, 0xc2a3405f,
290         0x161dc372, 0xbce2250c, 0x283c498b, 0xff0d9541,
291         0x39a80171, 0x080cb3de, 0xd8b4e49c, 0x6456c190,
292         0x7bcb8461, 0xd532b670, 0x486c5c74, 0xd0b85742);
293 $code.=<<___;
294 .byte   0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38
295 .byte   0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb
296 .byte   0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87
297 .byte   0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb
298 .byte   0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d
299 .byte   0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e
300 .byte   0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2
301 .byte   0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25
302 .byte   0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16
303 .byte   0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92
304 .byte   0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda
305 .byte   0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84
306 .byte   0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a
307 .byte   0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06
308 .byte   0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02
309 .byte   0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b
310 .byte   0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea
311 .byte   0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73
312 .byte   0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85
313 .byte   0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e
314 .byte   0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89
315 .byte   0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b
316 .byte   0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20
317 .byte   0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4
318 .byte   0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31
319 .byte   0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f
320 .byte   0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d
321 .byte   0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef
322 .byte   0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0
323 .byte   0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61
324 .byte   0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26
325 .byte   0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d
326
327
328 .globl  .AES_encrypt
329 .align  7
330 .AES_encrypt:
331         mflr    r0
332         $STU    $sp,-$FRAME($sp)
333
334         $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
335         $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
336         $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
337         $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
338         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
339         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
340         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
341         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
342         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
343         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
344         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
345         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
346         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
347         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
348         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
349         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
350         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
351         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
352         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
353         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
354         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
355
356         lwz     $s0,0($inp)
357         lwz     $s1,4($inp)
358         lwz     $s2,8($inp)
359         lwz     $s3,12($inp)
360         bl      LAES_Te
361         bl      Lppc_AES_encrypt_compact
362         stw     $s0,0($out)
363         stw     $s1,4($out)
364         stw     $s2,8($out)
365         stw     $s3,12($out)
366
367         $POP    r0,`$FRAME-$SIZE_T*21`($sp)
368         $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
369         $POP    r13,`$FRAME-$SIZE_T*19`($sp)
370         $POP    r14,`$FRAME-$SIZE_T*18`($sp)
371         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
372         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
373         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
374         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
375         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
376         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
377         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
378         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
379         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
380         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
381         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
382         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
383         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
384         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
385         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
386         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
387         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
388         mtlr    r0
389         addi    $sp,$sp,$FRAME
390         blr
391
392 .align  5
393 Lppc_AES_encrypt:
394         lwz     $acc00,240($key)
395         lwz     $t0,0($key)
396         lwz     $t1,4($key)
397         lwz     $t2,8($key)
398         lwz     $t3,12($key)
399         addi    $Tbl1,$Tbl0,3
400         addi    $Tbl2,$Tbl0,2
401         addi    $Tbl3,$Tbl0,1
402         addi    $acc00,$acc00,-1
403         addi    $key,$key,16
404         xor     $s0,$s0,$t0
405         xor     $s1,$s1,$t1
406         xor     $s2,$s2,$t2
407         xor     $s3,$s3,$t3
408         mtctr   $acc00
409 .align  4
410 Lenc_loop:
411         rlwinm  $acc00,$s0,`32-24+3`,21,28
412         rlwinm  $acc01,$s1,`32-24+3`,21,28
413         rlwinm  $acc02,$s2,`32-24+3`,21,28
414         rlwinm  $acc03,$s3,`32-24+3`,21,28
415         lwz     $t0,0($key)
416         lwz     $t1,4($key)
417         rlwinm  $acc04,$s1,`32-16+3`,21,28
418         rlwinm  $acc05,$s2,`32-16+3`,21,28
419         lwz     $t2,8($key)
420         lwz     $t3,12($key)
421         rlwinm  $acc06,$s3,`32-16+3`,21,28
422         rlwinm  $acc07,$s0,`32-16+3`,21,28
423         lwzx    $acc00,$Tbl0,$acc00
424         lwzx    $acc01,$Tbl0,$acc01
425         rlwinm  $acc08,$s2,`32-8+3`,21,28
426         rlwinm  $acc09,$s3,`32-8+3`,21,28
427         lwzx    $acc02,$Tbl0,$acc02
428         lwzx    $acc03,$Tbl0,$acc03
429         rlwinm  $acc10,$s0,`32-8+3`,21,28
430         rlwinm  $acc11,$s1,`32-8+3`,21,28
431         lwzx    $acc04,$Tbl1,$acc04
432         lwzx    $acc05,$Tbl1,$acc05
433         rlwinm  $acc12,$s3,`0+3`,21,28
434         rlwinm  $acc13,$s0,`0+3`,21,28
435         lwzx    $acc06,$Tbl1,$acc06
436         lwzx    $acc07,$Tbl1,$acc07
437         rlwinm  $acc14,$s1,`0+3`,21,28
438         rlwinm  $acc15,$s2,`0+3`,21,28
439         lwzx    $acc08,$Tbl2,$acc08
440         lwzx    $acc09,$Tbl2,$acc09
441         xor     $t0,$t0,$acc00
442         xor     $t1,$t1,$acc01
443         lwzx    $acc10,$Tbl2,$acc10
444         lwzx    $acc11,$Tbl2,$acc11
445         xor     $t2,$t2,$acc02
446         xor     $t3,$t3,$acc03
447         lwzx    $acc12,$Tbl3,$acc12
448         lwzx    $acc13,$Tbl3,$acc13
449         xor     $t0,$t0,$acc04
450         xor     $t1,$t1,$acc05
451         lwzx    $acc14,$Tbl3,$acc14
452         lwzx    $acc15,$Tbl3,$acc15
453         xor     $t2,$t2,$acc06
454         xor     $t3,$t3,$acc07
455         xor     $t0,$t0,$acc08
456         xor     $t1,$t1,$acc09
457         xor     $t2,$t2,$acc10
458         xor     $t3,$t3,$acc11
459         xor     $s0,$t0,$acc12
460         xor     $s1,$t1,$acc13
461         xor     $s2,$t2,$acc14
462         xor     $s3,$t3,$acc15
463         addi    $key,$key,16
464         bdnz-   Lenc_loop
465
466         addi    $Tbl2,$Tbl0,2048
467         nop
468         lwz     $t0,0($key)
469         lwz     $t1,4($key)
470         rlwinm  $acc00,$s0,`32-24`,24,31
471         rlwinm  $acc01,$s1,`32-24`,24,31
472         lwz     $t2,8($key)
473         lwz     $t3,12($key)
474         rlwinm  $acc02,$s2,`32-24`,24,31
475         rlwinm  $acc03,$s3,`32-24`,24,31
476         lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Te4
477         lwz     $acc09,`2048+32`($Tbl0)
478         rlwinm  $acc04,$s1,`32-16`,24,31
479         rlwinm  $acc05,$s2,`32-16`,24,31
480         lwz     $acc10,`2048+64`($Tbl0)
481         lwz     $acc11,`2048+96`($Tbl0)
482         rlwinm  $acc06,$s3,`32-16`,24,31
483         rlwinm  $acc07,$s0,`32-16`,24,31
484         lwz     $acc12,`2048+128`($Tbl0)
485         lwz     $acc13,`2048+160`($Tbl0)
486         rlwinm  $acc08,$s2,`32-8`,24,31
487         rlwinm  $acc09,$s3,`32-8`,24,31
488         lwz     $acc14,`2048+192`($Tbl0)
489         lwz     $acc15,`2048+224`($Tbl0)
490         rlwinm  $acc10,$s0,`32-8`,24,31
491         rlwinm  $acc11,$s1,`32-8`,24,31
492         lbzx    $acc00,$Tbl2,$acc00
493         lbzx    $acc01,$Tbl2,$acc01
494         rlwinm  $acc12,$s3,`0`,24,31
495         rlwinm  $acc13,$s0,`0`,24,31
496         lbzx    $acc02,$Tbl2,$acc02
497         lbzx    $acc03,$Tbl2,$acc03
498         rlwinm  $acc14,$s1,`0`,24,31
499         rlwinm  $acc15,$s2,`0`,24,31
500         lbzx    $acc04,$Tbl2,$acc04
501         lbzx    $acc05,$Tbl2,$acc05
502         rlwinm  $s0,$acc00,24,0,7
503         rlwinm  $s1,$acc01,24,0,7
504         lbzx    $acc06,$Tbl2,$acc06
505         lbzx    $acc07,$Tbl2,$acc07
506         rlwinm  $s2,$acc02,24,0,7
507         rlwinm  $s3,$acc03,24,0,7
508         lbzx    $acc08,$Tbl2,$acc08
509         lbzx    $acc09,$Tbl2,$acc09
510         rlwimi  $s0,$acc04,16,8,15
511         rlwimi  $s1,$acc05,16,8,15
512         lbzx    $acc10,$Tbl2,$acc10
513         lbzx    $acc11,$Tbl2,$acc11
514         rlwimi  $s2,$acc06,16,8,15
515         rlwimi  $s3,$acc07,16,8,15
516         lbzx    $acc12,$Tbl2,$acc12
517         lbzx    $acc13,$Tbl2,$acc13
518         rlwimi  $s0,$acc08,8,16,23
519         rlwimi  $s1,$acc09,8,16,23
520         lbzx    $acc14,$Tbl2,$acc14
521         lbzx    $acc15,$Tbl2,$acc15
522         rlwimi  $s2,$acc10,8,16,23
523         rlwimi  $s3,$acc11,8,16,23
524         or      $s0,$s0,$acc12
525         or      $s1,$s1,$acc13
526         or      $s2,$s2,$acc14
527         or      $s3,$s3,$acc15
528         xor     $s0,$s0,$t0
529         xor     $s1,$s1,$t1
530         xor     $s2,$s2,$t2
531         xor     $s3,$s3,$t3
532         blr
533
534 .align  4
535 Lppc_AES_encrypt_compact:
536         lwz     $acc00,240($key)
537         lwz     $t0,0($key)
538         lwz     $t1,4($key)
539         lwz     $t2,8($key)
540         lwz     $t3,12($key)
541         addi    $Tbl1,$Tbl0,2048
542         lis     $mask80,0x8080
543         lis     $mask1b,0x1b1b
544         addi    $key,$key,16
545         ori     $mask80,$mask80,0x8080
546         ori     $mask1b,$mask1b,0x1b1b
547         mtctr   $acc00
548 .align  4
549 Lenc_compact_loop:
550         xor     $s0,$s0,$t0
551         xor     $s1,$s1,$t1
552         xor     $s2,$s2,$t2
553         xor     $s3,$s3,$t3
554         rlwinm  $acc00,$s0,`32-24`,24,31
555         rlwinm  $acc01,$s1,`32-24`,24,31
556         rlwinm  $acc02,$s2,`32-24`,24,31
557         rlwinm  $acc03,$s3,`32-24`,24,31
558         rlwinm  $acc04,$s1,`32-16`,24,31
559         rlwinm  $acc05,$s2,`32-16`,24,31
560         rlwinm  $acc06,$s3,`32-16`,24,31
561         rlwinm  $acc07,$s0,`32-16`,24,31
562         lbzx    $acc00,$Tbl1,$acc00
563         lbzx    $acc01,$Tbl1,$acc01
564         rlwinm  $acc08,$s2,`32-8`,24,31
565         rlwinm  $acc09,$s3,`32-8`,24,31
566         lbzx    $acc02,$Tbl1,$acc02
567         lbzx    $acc03,$Tbl1,$acc03
568         rlwinm  $acc10,$s0,`32-8`,24,31
569         rlwinm  $acc11,$s1,`32-8`,24,31
570         lbzx    $acc04,$Tbl1,$acc04
571         lbzx    $acc05,$Tbl1,$acc05
572         rlwinm  $acc12,$s3,`0`,24,31
573         rlwinm  $acc13,$s0,`0`,24,31
574         lbzx    $acc06,$Tbl1,$acc06
575         lbzx    $acc07,$Tbl1,$acc07
576         rlwinm  $acc14,$s1,`0`,24,31
577         rlwinm  $acc15,$s2,`0`,24,31
578         lbzx    $acc08,$Tbl1,$acc08
579         lbzx    $acc09,$Tbl1,$acc09
580         rlwinm  $s0,$acc00,24,0,7
581         rlwinm  $s1,$acc01,24,0,7
582         lbzx    $acc10,$Tbl1,$acc10
583         lbzx    $acc11,$Tbl1,$acc11
584         rlwinm  $s2,$acc02,24,0,7
585         rlwinm  $s3,$acc03,24,0,7
586         lbzx    $acc12,$Tbl1,$acc12
587         lbzx    $acc13,$Tbl1,$acc13
588         rlwimi  $s0,$acc04,16,8,15
589         rlwimi  $s1,$acc05,16,8,15
590         lbzx    $acc14,$Tbl1,$acc14
591         lbzx    $acc15,$Tbl1,$acc15
592         rlwimi  $s2,$acc06,16,8,15
593         rlwimi  $s3,$acc07,16,8,15
594         rlwimi  $s0,$acc08,8,16,23
595         rlwimi  $s1,$acc09,8,16,23
596         rlwimi  $s2,$acc10,8,16,23
597         rlwimi  $s3,$acc11,8,16,23
598         lwz     $t0,0($key)
599         lwz     $t1,4($key)
600         or      $s0,$s0,$acc12
601         or      $s1,$s1,$acc13
602         lwz     $t2,8($key)
603         lwz     $t3,12($key)
604         or      $s2,$s2,$acc14
605         or      $s3,$s3,$acc15
606
607         addi    $key,$key,16
608         bdz     Lenc_compact_done
609
610         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
611         and     $acc01,$s1,$mask80
612         and     $acc02,$s2,$mask80
613         and     $acc03,$s3,$mask80
614         srwi    $acc04,$acc00,7         # r1>>7
615         srwi    $acc05,$acc01,7
616         srwi    $acc06,$acc02,7
617         srwi    $acc07,$acc03,7
618         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
619         andc    $acc09,$s1,$mask80
620         andc    $acc10,$s2,$mask80
621         andc    $acc11,$s3,$mask80
622         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
623         sub     $acc01,$acc01,$acc05
624         sub     $acc02,$acc02,$acc06
625         sub     $acc03,$acc03,$acc07
626         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
627         add     $acc09,$acc09,$acc09
628         add     $acc10,$acc10,$acc10
629         add     $acc11,$acc11,$acc11
630         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
631         and     $acc01,$acc01,$mask1b
632         and     $acc02,$acc02,$mask1b
633         and     $acc03,$acc03,$mask1b
634         xor     $acc00,$acc00,$acc08    # r2
635         xor     $acc01,$acc01,$acc09
636         xor     $acc02,$acc02,$acc10
637         xor     $acc03,$acc03,$acc11
638
639         rotlwi  $acc12,$s0,16           # ROTATE(r0,16)
640         rotlwi  $acc13,$s1,16
641         rotlwi  $acc14,$s2,16
642         rotlwi  $acc15,$s3,16
643         xor     $s0,$s0,$acc00          # r0^r2
644         xor     $s1,$s1,$acc01
645         xor     $s2,$s2,$acc02
646         xor     $s3,$s3,$acc03
647         rotrwi  $s0,$s0,24              # ROTATE(r2^r0,24)
648         rotrwi  $s1,$s1,24
649         rotrwi  $s2,$s2,24
650         rotrwi  $s3,$s3,24
651         xor     $s0,$s0,$acc00          # ROTATE(r2^r0,24)^r2
652         xor     $s1,$s1,$acc01
653         xor     $s2,$s2,$acc02
654         xor     $s3,$s3,$acc03
655         rotlwi  $acc08,$acc12,8         # ROTATE(r0,24)
656         rotlwi  $acc09,$acc13,8
657         rotlwi  $acc10,$acc14,8
658         rotlwi  $acc11,$acc15,8
659         xor     $s0,$s0,$acc12          #
660         xor     $s1,$s1,$acc13
661         xor     $s2,$s2,$acc14
662         xor     $s3,$s3,$acc15
663         xor     $s0,$s0,$acc08          #
664         xor     $s1,$s1,$acc09
665         xor     $s2,$s2,$acc10
666         xor     $s3,$s3,$acc11
667
668         b       Lenc_compact_loop
669 .align  4
670 Lenc_compact_done:
671         xor     $s0,$s0,$t0
672         xor     $s1,$s1,$t1
673         xor     $s2,$s2,$t2
674         xor     $s3,$s3,$t3
675         blr
676
677 .globl  .AES_decrypt
678 .align  7
679 .AES_decrypt:
680         mflr    r0
681         $STU    $sp,-$FRAME($sp)
682
683         $PUSH   r0,`$FRAME-$SIZE_T*21`($sp)
684         $PUSH   $toc,`$FRAME-$SIZE_T*20`($sp)
685         $PUSH   r13,`$FRAME-$SIZE_T*19`($sp)
686         $PUSH   r14,`$FRAME-$SIZE_T*18`($sp)
687         $PUSH   r15,`$FRAME-$SIZE_T*17`($sp)
688         $PUSH   r16,`$FRAME-$SIZE_T*16`($sp)
689         $PUSH   r17,`$FRAME-$SIZE_T*15`($sp)
690         $PUSH   r18,`$FRAME-$SIZE_T*14`($sp)
691         $PUSH   r19,`$FRAME-$SIZE_T*13`($sp)
692         $PUSH   r20,`$FRAME-$SIZE_T*12`($sp)
693         $PUSH   r21,`$FRAME-$SIZE_T*11`($sp)
694         $PUSH   r22,`$FRAME-$SIZE_T*10`($sp)
695         $PUSH   r23,`$FRAME-$SIZE_T*9`($sp)
696         $PUSH   r24,`$FRAME-$SIZE_T*8`($sp)
697         $PUSH   r25,`$FRAME-$SIZE_T*7`($sp)
698         $PUSH   r26,`$FRAME-$SIZE_T*6`($sp)
699         $PUSH   r27,`$FRAME-$SIZE_T*5`($sp)
700         $PUSH   r28,`$FRAME-$SIZE_T*4`($sp)
701         $PUSH   r29,`$FRAME-$SIZE_T*3`($sp)
702         $PUSH   r30,`$FRAME-$SIZE_T*2`($sp)
703         $PUSH   r31,`$FRAME-$SIZE_T*1`($sp)
704
705         lwz     $s0,0($inp)
706         lwz     $s1,4($inp)
707         lwz     $s2,8($inp)
708         lwz     $s3,12($inp)
709         bl      LAES_Td
710         bl      Lppc_AES_decrypt_compact
711         stw     $s0,0($out)
712         stw     $s1,4($out)
713         stw     $s2,8($out)
714         stw     $s3,12($out)
715
716         $POP    r0,`$FRAME-$SIZE_T*21`($sp)
717         $POP    $toc,`$FRAME-$SIZE_T*20`($sp)
718         $POP    r13,`$FRAME-$SIZE_T*19`($sp)
719         $POP    r14,`$FRAME-$SIZE_T*18`($sp)
720         $POP    r15,`$FRAME-$SIZE_T*17`($sp)
721         $POP    r16,`$FRAME-$SIZE_T*16`($sp)
722         $POP    r17,`$FRAME-$SIZE_T*15`($sp)
723         $POP    r18,`$FRAME-$SIZE_T*14`($sp)
724         $POP    r19,`$FRAME-$SIZE_T*13`($sp)
725         $POP    r20,`$FRAME-$SIZE_T*12`($sp)
726         $POP    r21,`$FRAME-$SIZE_T*11`($sp)
727         $POP    r22,`$FRAME-$SIZE_T*10`($sp)
728         $POP    r23,`$FRAME-$SIZE_T*9`($sp)
729         $POP    r24,`$FRAME-$SIZE_T*8`($sp)
730         $POP    r25,`$FRAME-$SIZE_T*7`($sp)
731         $POP    r26,`$FRAME-$SIZE_T*6`($sp)
732         $POP    r27,`$FRAME-$SIZE_T*5`($sp)
733         $POP    r28,`$FRAME-$SIZE_T*4`($sp)
734         $POP    r29,`$FRAME-$SIZE_T*3`($sp)
735         $POP    r30,`$FRAME-$SIZE_T*2`($sp)
736         $POP    r31,`$FRAME-$SIZE_T*1`($sp)
737         mtlr    r0
738         addi    $sp,$sp,$FRAME
739         blr
740
# Lppc_AES_decrypt: table-driven decryption of one 16-byte block. Internal
# subroutine, returns with blr.
#   in:   s0-s3 = the four state words, key = pointer to the round keys,
#         Tbl0 = base address of the lookup table.
#   out:  s0-s3 = the four result words.
#   also modified: key (advanced by 16 for every round key consumed except
#         the last), t0-t3, acc00-acc15, Tbl1-Tbl3 and CTR.
# The word at offset 240 from key supplies the loop count (presumably the
# number of rounds; confirm against the key-setup code).
# Loads are deliberately interleaved with rotates and xors; the file header
# attributes this scheduling to Power6 tuning, so keep the order as is.
741 .align  5
742 Lppc_AES_decrypt:
            # acc00 = word at key+240, t0-t3 = first round key
743         lwz     $acc00,240($key)
744         lwz     $t0,0($key)
745         lwz     $t1,4($key)
746         lwz     $t2,8($key)
747         lwz     $t3,12($key)
            # Tbl1-Tbl3 address the same table as Tbl0 at byte offsets 3, 2, 1
            # (presumably each entry is stored twice back to back, so an offset
            # load returns a byte-rotated copy; confirm at the table definition)
748         addi    $Tbl1,$Tbl0,3
749         addi    $Tbl2,$Tbl0,2
750         addi    $Tbl3,$Tbl0,1
            # the loop below runs acc00-1 times; the last round follows it
751         addi    $acc00,$acc00,-1
752         addi    $key,$key,16
            # xor the first round key into the state
753         xor     $s0,$s0,$t0
754         xor     $s1,$s1,$t1
755         xor     $s2,$s2,$t2
756         xor     $s3,$s3,$t3
757         mtctr   $acc00
758 .align  4
759 Ldec_loop:
            # One full round per iteration.
            # Each rlwinm rotates the wanted byte into bits 21..28, which gives
            # byte*8, i.e. a byte offset into a table with 8-byte stride.
            # Byte sources per output word:
            #   t0: top byte of s0, 2nd of s3, 3rd of s2, low byte of s1
            #   t1: top byte of s1, 2nd of s0, 3rd of s3, low byte of s2
            #   t2: top byte of s2, 2nd of s1, 3rd of s0, low byte of s3
            #   t3: top byte of s3, 2nd of s2, 3rd of s1, low byte of s0
            # looked up through Tbl0, Tbl1, Tbl2 and Tbl3 respectively.
            # t0-t3 are loaded with the next round key and the four looked-up
            # words for each column are xored in; the result becomes s0-s3.
760         rlwinm  $acc00,$s0,`32-24+3`,21,28
761         rlwinm  $acc01,$s1,`32-24+3`,21,28
762         rlwinm  $acc02,$s2,`32-24+3`,21,28
763         rlwinm  $acc03,$s3,`32-24+3`,21,28
764         lwz     $t0,0($key)
765         lwz     $t1,4($key)
766         rlwinm  $acc04,$s3,`32-16+3`,21,28
767         rlwinm  $acc05,$s0,`32-16+3`,21,28
768         lwz     $t2,8($key)
769         lwz     $t3,12($key)
770         rlwinm  $acc06,$s1,`32-16+3`,21,28
771         rlwinm  $acc07,$s2,`32-16+3`,21,28
772         lwzx    $acc00,$Tbl0,$acc00
773         lwzx    $acc01,$Tbl0,$acc01
774         rlwinm  $acc08,$s2,`32-8+3`,21,28
775         rlwinm  $acc09,$s3,`32-8+3`,21,28
776         lwzx    $acc02,$Tbl0,$acc02
777         lwzx    $acc03,$Tbl0,$acc03
778         rlwinm  $acc10,$s0,`32-8+3`,21,28
779         rlwinm  $acc11,$s1,`32-8+3`,21,28
780         lwzx    $acc04,$Tbl1,$acc04
781         lwzx    $acc05,$Tbl1,$acc05
782         rlwinm  $acc12,$s1,`0+3`,21,28
783         rlwinm  $acc13,$s2,`0+3`,21,28
784         lwzx    $acc06,$Tbl1,$acc06
785         lwzx    $acc07,$Tbl1,$acc07
786         rlwinm  $acc14,$s3,`0+3`,21,28
787         rlwinm  $acc15,$s0,`0+3`,21,28
788         lwzx    $acc08,$Tbl2,$acc08
789         lwzx    $acc09,$Tbl2,$acc09
790         xor     $t0,$t0,$acc00
791         xor     $t1,$t1,$acc01
792         lwzx    $acc10,$Tbl2,$acc10
793         lwzx    $acc11,$Tbl2,$acc11
794         xor     $t2,$t2,$acc02
795         xor     $t3,$t3,$acc03
796         lwzx    $acc12,$Tbl3,$acc12
797         lwzx    $acc13,$Tbl3,$acc13
798         xor     $t0,$t0,$acc04
799         xor     $t1,$t1,$acc05
800         lwzx    $acc14,$Tbl3,$acc14
801         lwzx    $acc15,$Tbl3,$acc15
802         xor     $t2,$t2,$acc06
803         xor     $t3,$t3,$acc07
804         xor     $t0,$t0,$acc08
805         xor     $t1,$t1,$acc09
806         xor     $t2,$t2,$acc10
807         xor     $t3,$t3,$acc11
808         xor     $s0,$t0,$acc12
809         xor     $s1,$t1,$acc13
810         xor     $s2,$t2,$acc14
811         xor     $s3,$t3,$acc15
            # step to the next round key and loop while CTR is non-zero
812         addi    $key,$key,16
813         bdnz-   Ldec_loop
814
            # Last round: Tbl2 is re-pointed 2048 bytes past Tbl0, at a table
            # read one byte at a time (lbzx with an unscaled byte index).
            # The eight lwz loads at a 32-byte stride over the 256 bytes at
            # Tbl0+2048 target acc08-acc15, all of which are overwritten before
            # being read, so those loads only touch the table (the existing
            # "prefetch Td4" note). t0-t3 receive the last round key.
            # Byte sources follow the same pattern as in the loop above.
815         addi    $Tbl2,$Tbl0,2048
816         nop
817         lwz     $t0,0($key)
818         lwz     $t1,4($key)
819         rlwinm  $acc00,$s0,`32-24`,24,31
820         rlwinm  $acc01,$s1,`32-24`,24,31
821         lwz     $t2,8($key)
822         lwz     $t3,12($key)
823         rlwinm  $acc02,$s2,`32-24`,24,31
824         rlwinm  $acc03,$s3,`32-24`,24,31
825         lwz     $acc08,`2048+0`($Tbl0)  ! prefetch Td4
826         lwz     $acc09,`2048+32`($Tbl0)
827         rlwinm  $acc04,$s3,`32-16`,24,31
828         rlwinm  $acc05,$s0,`32-16`,24,31
829         lwz     $acc10,`2048+64`($Tbl0)
830         lwz     $acc11,`2048+96`($Tbl0)
831         lbzx    $acc00,$Tbl2,$acc00
832         lbzx    $acc01,$Tbl2,$acc01
833         lwz     $acc12,`2048+128`($Tbl0)
834         lwz     $acc13,`2048+160`($Tbl0)
835         rlwinm  $acc06,$s1,`32-16`,24,31
836         rlwinm  $acc07,$s2,`32-16`,24,31
837         lwz     $acc14,`2048+192`($Tbl0)
838         lwz     $acc15,`2048+224`($Tbl0)
839         rlwinm  $acc08,$s2,`32-8`,24,31
840         rlwinm  $acc09,$s3,`32-8`,24,31
841         lbzx    $acc02,$Tbl2,$acc02
842         lbzx    $acc03,$Tbl2,$acc03
843         rlwinm  $acc10,$s0,`32-8`,24,31
844         rlwinm  $acc11,$s1,`32-8`,24,31
845         lbzx    $acc04,$Tbl2,$acc04
846         lbzx    $acc05,$Tbl2,$acc05
847         rlwinm  $acc12,$s1,`0`,24,31
848         rlwinm  $acc13,$s2,`0`,24,31
849         lbzx    $acc06,$Tbl2,$acc06
850         lbzx    $acc07,$Tbl2,$acc07
851         rlwinm  $acc14,$s3,`0`,24,31
852         rlwinm  $acc15,$s0,`0`,24,31
853         lbzx    $acc08,$Tbl2,$acc08
854         lbzx    $acc09,$Tbl2,$acc09
            # Rebuild each word from its four substituted bytes: rlwinm places
            # the top byte, rlwimi inserts the two middle bytes, or merges the
            # low byte.
855         rlwinm  $s0,$acc00,24,0,7
856         rlwinm  $s1,$acc01,24,0,7
857         lbzx    $acc10,$Tbl2,$acc10
858         lbzx    $acc11,$Tbl2,$acc11
859         rlwinm  $s2,$acc02,24,0,7
860         rlwinm  $s3,$acc03,24,0,7
861         lbzx    $acc12,$Tbl2,$acc12
862         lbzx    $acc13,$Tbl2,$acc13
863         rlwimi  $s0,$acc04,16,8,15
864         rlwimi  $s1,$acc05,16,8,15
865         lbzx    $acc14,$Tbl2,$acc14
866         lbzx    $acc15,$Tbl2,$acc15
867         rlwimi  $s2,$acc06,16,8,15
868         rlwimi  $s3,$acc07,16,8,15
869         rlwimi  $s0,$acc08,8,16,23
870         rlwimi  $s1,$acc09,8,16,23
871         rlwimi  $s2,$acc10,8,16,23
872         rlwimi  $s3,$acc11,8,16,23
873         or      $s0,$s0,$acc12
874         or      $s1,$s1,$acc13
875         or      $s2,$s2,$acc14
876         or      $s3,$s3,$acc15
            # xor in the last round key and return
877         xor     $s0,$s0,$t0
878         xor     $s1,$s1,$t1
879         xor     $s2,$s2,$t2
880         xor     $s3,$s3,$t3
881         blr
882
# Lppc_AES_decrypt_compact: decryption of one 16-byte block that reads only
# the byte table located 2048 bytes past Tbl0 and does the column mixing
# arithmetically instead of through word-table lookups. Internal subroutine,
# returns with blr.
#   in:   s0-s3 = the four state words, key = pointer to the round keys,
#         Tbl0 = base address of the lookup table.
#   out:  s0-s3 = the four result words.
#   also modified: key (advanced by 16 per loop pass, plus 16 up front),
#         t0-t3, acc00-acc15, Tbl1, mask80, mask1b and CTR.
# CTR is loaded with the word at key+240 unchanged (presumably the number of
# rounds; confirm against the key-setup code). The loop exits through bdz
# right after the substitution step, so the mixing step runs one time fewer
# than the substitution step.
883 .align  4
884 Lppc_AES_decrypt_compact:
885         lwz     $acc00,240($key)
886         lwz     $t0,0($key)
887         lwz     $t1,4($key)
888         lwz     $t2,8($key)
889         lwz     $t3,12($key)
            # Tbl1 = byte table; mask80 = 0x80808080, mask1b = 0x1b1b1b1b
890         addi    $Tbl1,$Tbl0,2048
891         lis     $mask80,0x8080
892         lis     $mask1b,0x1b1b
893         addi    $key,$key,16
894         ori     $mask80,$mask80,0x8080
895         ori     $mask1b,$mask1b,0x1b1b
896 ___
# 64-bit build only: copy the low word of each mask into its upper 32 bits so
# that one register operation covers two packed state words.
897 $code.=<<___ if ($SIZE_T==8);
898         insrdi  $mask80,$mask80,32,0
899         insrdi  $mask1b,$mask1b,32,0
900 ___
901 $code.=<<___;
902         mtctr   $acc00
903 .align  4
904 Ldec_compact_loop:
            # xor in the current round key (held in t0-t3)
905         xor     $s0,$s0,$t0
906         xor     $s1,$s1,$t1
907         xor     $s2,$s2,$t2
908         xor     $s3,$s3,$t3
            # Byte substitution through Tbl1. Byte sources per output word:
            #   s0: top byte of s0, 2nd of s3, 3rd of s2, low byte of s1
            #   s1: top byte of s1, 2nd of s0, 3rd of s3, low byte of s2
            #   s2: top byte of s2, 2nd of s1, 3rd of s0, low byte of s3
            #   s3: top byte of s3, 2nd of s2, 3rd of s1, low byte of s0
            # The substituted bytes are then reassembled into s0-s3 with
            # rlwinm (top byte), rlwimi (middle bytes) and or (low byte).
909         rlwinm  $acc00,$s0,`32-24`,24,31
910         rlwinm  $acc01,$s1,`32-24`,24,31
911         rlwinm  $acc02,$s2,`32-24`,24,31
912         rlwinm  $acc03,$s3,`32-24`,24,31
913         rlwinm  $acc04,$s3,`32-16`,24,31
914         rlwinm  $acc05,$s0,`32-16`,24,31
915         rlwinm  $acc06,$s1,`32-16`,24,31
916         rlwinm  $acc07,$s2,`32-16`,24,31
917         lbzx    $acc00,$Tbl1,$acc00
918         lbzx    $acc01,$Tbl1,$acc01
919         rlwinm  $acc08,$s2,`32-8`,24,31
920         rlwinm  $acc09,$s3,`32-8`,24,31
921         lbzx    $acc02,$Tbl1,$acc02
922         lbzx    $acc03,$Tbl1,$acc03
923         rlwinm  $acc10,$s0,`32-8`,24,31
924         rlwinm  $acc11,$s1,`32-8`,24,31
925         lbzx    $acc04,$Tbl1,$acc04
926         lbzx    $acc05,$Tbl1,$acc05
927         rlwinm  $acc12,$s1,`0`,24,31
928         rlwinm  $acc13,$s2,`0`,24,31
929         lbzx    $acc06,$Tbl1,$acc06
930         lbzx    $acc07,$Tbl1,$acc07
931         rlwinm  $acc14,$s3,`0`,24,31
932         rlwinm  $acc15,$s0,`0`,24,31
933         lbzx    $acc08,$Tbl1,$acc08
934         lbzx    $acc09,$Tbl1,$acc09
935         rlwinm  $s0,$acc00,24,0,7
936         rlwinm  $s1,$acc01,24,0,7
937         lbzx    $acc10,$Tbl1,$acc10
938         lbzx    $acc11,$Tbl1,$acc11
939         rlwinm  $s2,$acc02,24,0,7
940         rlwinm  $s3,$acc03,24,0,7
941         lbzx    $acc12,$Tbl1,$acc12
942         lbzx    $acc13,$Tbl1,$acc13
943         rlwimi  $s0,$acc04,16,8,15
944         rlwimi  $s1,$acc05,16,8,15
945         lbzx    $acc14,$Tbl1,$acc14
946         lbzx    $acc15,$Tbl1,$acc15
947         rlwimi  $s2,$acc06,16,8,15
948         rlwimi  $s3,$acc07,16,8,15
949         rlwimi  $s0,$acc08,8,16,23
950         rlwimi  $s1,$acc09,8,16,23
951         rlwimi  $s2,$acc10,8,16,23
952         rlwimi  $s3,$acc11,8,16,23
            # fetch the next round key while the low bytes are merged in
953         lwz     $t0,0($key)
954         lwz     $t1,4($key)
955         or      $s0,$s0,$acc12
956         or      $s1,$s1,$acc13
957         lwz     $t2,8($key)
958         lwz     $t3,12($key)
959         or      $s2,$s2,$acc14
960         or      $s3,$s3,$acc15
961
            # advance key; when CTR reaches zero skip the mixing step and
            # finish at Ldec_compact_done
962         addi    $key,$key,16
963         bdz     Ldec_compact_done
964 ___
# 64-bit build: the mixing arithmetic is done on two state words per register.
965 $code.=<<___ if ($SIZE_T==8);
966         # vectorized permutation improves decrypt performance by 10%
            # pack s1 into the upper half of s0, and s3 into the upper half of s2
967         insrdi  $s0,$s1,32,0
968         insrdi  $s2,$s3,32,0
969
            # Three doubling steps, r0 -> r2 -> r4 -> r8, applied to every byte
            # lane at once: the top bit of each lane is masked off before the
            # shift (done as an add) so nothing carries into the neighbouring
            # lane, and lanes whose top bit was set get 0x1b xored in.
970         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
971         and     $acc02,$s2,$mask80
972         srdi    $acc04,$acc00,7         # r1>>7
973         srdi    $acc06,$acc02,7
974         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
975         andc    $acc10,$s2,$mask80
976         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
977         sub     $acc02,$acc02,$acc06
978         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
979         add     $acc10,$acc10,$acc10
980         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
981         and     $acc02,$acc02,$mask1b
982         xor     $acc00,$acc00,$acc08    # r2
983         xor     $acc02,$acc02,$acc10
984
985         and     $acc04,$acc00,$mask80   # r1=r2&0x80808080
986         and     $acc06,$acc02,$mask80
987         srdi    $acc08,$acc04,7         # r1>>7
988         srdi    $acc10,$acc06,7
989         andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
990         andc    $acc14,$acc02,$mask80
991         sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
992         sub     $acc06,$acc06,$acc10
993         add     $acc12,$acc12,$acc12    # (r2&0x7f7f7f7f)<<1
994         add     $acc14,$acc14,$acc14
995         and     $acc04,$acc04,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
996         and     $acc06,$acc06,$mask1b
997         xor     $acc04,$acc04,$acc12    # r4
998         xor     $acc06,$acc06,$acc14
999
1000         and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
1001         and     $acc10,$acc06,$mask80
1002         srdi    $acc12,$acc08,7         # r1>>7
1003         srdi    $acc14,$acc10,7
1004         sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
1005         sub     $acc10,$acc10,$acc14
1006         andc    $acc12,$acc04,$mask80   # r4&0x7f7f7f7f
1007         andc    $acc14,$acc06,$mask80
1008         add     $acc12,$acc12,$acc12    # (r4&0x7f7f7f7f)<<1
1009         add     $acc14,$acc14,$acc14
1010         and     $acc08,$acc08,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1011         and     $acc10,$acc10,$mask1b
1012         xor     $acc08,$acc08,$acc12    # r8
1013         xor     $acc10,$acc10,$acc14
1014
1015         xor     $acc00,$acc00,$s0       # r2^r0
1016         xor     $acc02,$acc02,$s2
1017         xor     $acc04,$acc04,$s0       # r4^r0
1018         xor     $acc06,$acc06,$s2
1019
            # unpack: copy the upper half of each even acc register into the
            # following odd one, giving one value per state word again.
            # NOTE(review): the even acc registers and s0/s2 still carry data
            # in their upper 32 bits afterwards; the code that follows appears
            # to depend only on the low words (word rotates, byte extracts,
            # re-packing by insrdi on the next pass) - confirm before reusing
            # these registers as full 64-bit values.
1020         extrdi  $acc01,$acc00,32,0
1021         extrdi  $acc03,$acc02,32,0
1022         extrdi  $acc05,$acc04,32,0
1023         extrdi  $acc07,$acc06,32,0
1024         extrdi  $acc09,$acc08,32,0
1025         extrdi  $acc11,$acc10,32,0
1026 ___
# 32-bit build: the same three doubling steps, one state word per register.
# On exit acc00-03 = r2^r0, acc04-07 = r4^r0, acc08-11 = r8, as in the 64-bit
# variant.
1027 $code.=<<___ if ($SIZE_T==4);
1028         and     $acc00,$s0,$mask80      # r1=r0&0x80808080
1029         and     $acc01,$s1,$mask80
1030         and     $acc02,$s2,$mask80
1031         and     $acc03,$s3,$mask80
1032         srwi    $acc04,$acc00,7         # r1>>7
1033         srwi    $acc05,$acc01,7
1034         srwi    $acc06,$acc02,7
1035         srwi    $acc07,$acc03,7
1036         andc    $acc08,$s0,$mask80      # r0&0x7f7f7f7f
1037         andc    $acc09,$s1,$mask80
1038         andc    $acc10,$s2,$mask80
1039         andc    $acc11,$s3,$mask80
1040         sub     $acc00,$acc00,$acc04    # r1-(r1>>7)
1041         sub     $acc01,$acc01,$acc05
1042         sub     $acc02,$acc02,$acc06
1043         sub     $acc03,$acc03,$acc07
1044         add     $acc08,$acc08,$acc08    # (r0&0x7f7f7f7f)<<1
1045         add     $acc09,$acc09,$acc09
1046         add     $acc10,$acc10,$acc10
1047         add     $acc11,$acc11,$acc11
1048         and     $acc00,$acc00,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1049         and     $acc01,$acc01,$mask1b
1050         and     $acc02,$acc02,$mask1b
1051         and     $acc03,$acc03,$mask1b
1052         xor     $acc00,$acc00,$acc08    # r2
1053         xor     $acc01,$acc01,$acc09
1054         xor     $acc02,$acc02,$acc10
1055         xor     $acc03,$acc03,$acc11
1056
1057         and     $acc04,$acc00,$mask80   # r1=r2&0x80808080
1058         and     $acc05,$acc01,$mask80
1059         and     $acc06,$acc02,$mask80
1060         and     $acc07,$acc03,$mask80
1061         srwi    $acc08,$acc04,7         # r1>>7
1062         srwi    $acc09,$acc05,7
1063         srwi    $acc10,$acc06,7
1064         srwi    $acc11,$acc07,7
1065         andc    $acc12,$acc00,$mask80   # r2&0x7f7f7f7f
1066         andc    $acc13,$acc01,$mask80
1067         andc    $acc14,$acc02,$mask80
1068         andc    $acc15,$acc03,$mask80
1069         sub     $acc04,$acc04,$acc08    # r1-(r1>>7)
1070         sub     $acc05,$acc05,$acc09
1071         sub     $acc06,$acc06,$acc10
1072         sub     $acc07,$acc07,$acc11
1073         add     $acc12,$acc12,$acc12    # (r2&0x7f7f7f7f)<<1
1074         add     $acc13,$acc13,$acc13
1075         add     $acc14,$acc14,$acc14
1076         add     $acc15,$acc15,$acc15
1077         and     $acc04,$acc04,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1078         and     $acc05,$acc05,$mask1b
1079         and     $acc06,$acc06,$mask1b
1080         and     $acc07,$acc07,$mask1b
1081         xor     $acc04,$acc04,$acc12    # r4
1082         xor     $acc05,$acc05,$acc13
1083         xor     $acc06,$acc06,$acc14
1084         xor     $acc07,$acc07,$acc15
1085
1086         and     $acc08,$acc04,$mask80   # r1=r4&0x80808080
1087         and     $acc09,$acc05,$mask80
1088         and     $acc10,$acc06,$mask80
1089         and     $acc11,$acc07,$mask80
1090         srwi    $acc12,$acc08,7         # r1>>7
1091         srwi    $acc13,$acc09,7
1092         srwi    $acc14,$acc10,7
1093         srwi    $acc15,$acc11,7
1094         sub     $acc08,$acc08,$acc12    # r1-(r1>>7)
1095         sub     $acc09,$acc09,$acc13
1096         sub     $acc10,$acc10,$acc14
1097         sub     $acc11,$acc11,$acc15
1098         andc    $acc12,$acc04,$mask80   # r4&0x7f7f7f7f
1099         andc    $acc13,$acc05,$mask80
1100         andc    $acc14,$acc06,$mask80
1101         andc    $acc15,$acc07,$mask80
1102         add     $acc12,$acc12,$acc12    # (r4&0x7f7f7f7f)<<1
1103         add     $acc13,$acc13,$acc13
1104         add     $acc14,$acc14,$acc14
1105         add     $acc15,$acc15,$acc15
1106         and     $acc08,$acc08,$mask1b   # (r1-(r1>>7))&0x1b1b1b1b
1107         and     $acc09,$acc09,$mask1b
1108         and     $acc10,$acc10,$mask1b
1109         and     $acc11,$acc11,$mask1b
1110         xor     $acc08,$acc08,$acc12    # r8
1111         xor     $acc09,$acc09,$acc13
1112         xor     $acc10,$acc10,$acc14
1113         xor     $acc11,$acc11,$acc15
1114
1115         xor     $acc00,$acc00,$s0       # r2^r0
1116         xor     $acc01,$acc01,$s1
1117         xor     $acc02,$acc02,$s2
1118         xor     $acc03,$acc03,$s3
1119         xor     $acc04,$acc04,$s0       # r4^r0
1120         xor     $acc05,$acc05,$s1
1121         xor     $acc06,$acc06,$s2
1122         xor     $acc07,$acc07,$s3
1123 ___
# Common to both builds: combine the terms for each state word.
1124 $code.=<<___;
            # Per word the result is
            #   ROTATE(r0,8) ^ r2 ^ r4 ^ r8 ^ ROTATE(r8^r2^r0,24)
            #                ^ ROTATE(r8^r4^r0,16) ^ ROTATE(r8,8)
            # which matches the 0e/0b/0d/09 combination of the AES inverse
            # column mix. The trailing comments track the terms as they are
            # accumulated; acc00-03 and acc04-07 are turned into r8^r2^r0 and
            # r8^r4^r0 along the way before being rotated.
1125         rotrwi  $s0,$s0,8               # = ROTATE(r0,8)
1126         rotrwi  $s1,$s1,8
1127         rotrwi  $s2,$s2,8
1128         rotrwi  $s3,$s3,8
1129         xor     $s0,$s0,$acc00          # ^= r2^r0
1130         xor     $s1,$s1,$acc01
1131         xor     $s2,$s2,$acc02
1132         xor     $s3,$s3,$acc03
1133         xor     $acc00,$acc00,$acc08
1134         xor     $acc01,$acc01,$acc09
1135         xor     $acc02,$acc02,$acc10
1136         xor     $acc03,$acc03,$acc11
1137         xor     $s0,$s0,$acc04          # ^= r4^r0
1138         xor     $s1,$s1,$acc05
1139         xor     $s2,$s2,$acc06
1140         xor     $s3,$s3,$acc07
1141         rotrwi  $acc00,$acc00,24
1142         rotrwi  $acc01,$acc01,24
1143         rotrwi  $acc02,$acc02,24
1144         rotrwi  $acc03,$acc03,24
1145         xor     $acc04,$acc04,$acc08
1146         xor     $acc05,$acc05,$acc09
1147         xor     $acc06,$acc06,$acc10
1148         xor     $acc07,$acc07,$acc11
1149         xor     $s0,$s0,$acc08          # ^= r8 [^((r4^r0)^(r2^r0)=r4^r2)]
1150         xor     $s1,$s1,$acc09
1151         xor     $s2,$s2,$acc10
1152         xor     $s3,$s3,$acc11
1153         rotrwi  $acc04,$acc04,16
1154         rotrwi  $acc05,$acc05,16
1155         rotrwi  $acc06,$acc06,16
1156         rotrwi  $acc07,$acc07,16
1157         xor     $s0,$s0,$acc00          # ^= ROTATE(r8^r2^r0,24)
1158         xor     $s1,$s1,$acc01
1159         xor     $s2,$s2,$acc02
1160         xor     $s3,$s3,$acc03
1161         rotrwi  $acc08,$acc08,8
1162         rotrwi  $acc09,$acc09,8
1163         rotrwi  $acc10,$acc10,8
1164         rotrwi  $acc11,$acc11,8
1165         xor     $s0,$s0,$acc04          # ^= ROTATE(r8^r4^r0,16)
1166         xor     $s1,$s1,$acc05
1167         xor     $s2,$s2,$acc06
1168         xor     $s3,$s3,$acc07
1169         xor     $s0,$s0,$acc08          # ^= ROTATE(r8,8)       
1170         xor     $s1,$s1,$acc09  
1171         xor     $s2,$s2,$acc10  
1172         xor     $s3,$s3,$acc11  
1173
            # next pass: the loop head xors in the key already held in t0-t3
1174         b       Ldec_compact_loop
1175 .align  4
1176 Ldec_compact_done:
            # reached from the bdz above: xor in the last round key and return
1177         xor     $s0,$s0,$t0
1178         xor     $s1,$s1,$t1
1179         xor     $s2,$s2,$t2
1180         xor     $s3,$s3,$t3
1181         blr
            # a zero word and an identification string are emitted after the code
1182 .long   0
1183 .asciz  "AES for PPC, CRYPTOGAMS by <appro\@openssl.org>"
1184 .align  7
1185 ___
1186
# Evaluate every backtick-quoted expression embedded in the generated text,
# splice its value in, and write the result to STDOUT.
1187 $code =~ s/\`([^\`]*)\`/eval $1/gem;
1188 print $code;
# Output is buffered, so a failed write may only be reported by close(); if
# STDOUT is a pipe, close() also reports a failing consumer. Abort with a
# diagnostic rather than leave truncated output behind unnoticed.
1189 close STDOUT or die "error closing STDOUT: $!";